1 : /*
2 : +----------------------------------------------------------------------+
3 : | PHP Version 5 |
4 : +----------------------------------------------------------------------+
5 : | Copyright (c) 1997-2007 The PHP Group |
6 : +----------------------------------------------------------------------+
7 : | This source file is subject to version 3.01 of the PHP license, |
8 : | that is bundled with this package in the file LICENSE, and is |
9 : | available through the world-wide-web at the following url: |
10 : | http://www.php.net/license/3_01.txt |
11 : | If you did not receive a copy of the PHP license and are unable to |
12 : | obtain it through the world-wide-web, please send a note to |
13 : | license@php.net so we can mail you a copy immediately. |
14 : +----------------------------------------------------------------------+
15 : | Authors: Rasmus Lerdorf <rasmus@php.net> |
16 : | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> |
17 : | Wez Furlong <wez@thebrainroom.com> |
18 : +----------------------------------------------------------------------+
19 : */
20 :
21 : /* $Id: html.c,v 1.111.2.2.2.9 2007/02/27 03:28:16 iliaa Exp $ */
22 :
23 : /*
24 : * HTML entity resources:
25 : *
26 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp
27 : * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp
28 : * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT
29 : *
30 : * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2
31 : *
32 : */
33 :
34 : #include "php.h"
35 : #if PHP_WIN32
36 : #include "config.w32.h"
37 : #else
38 : #include <php_config.h>
39 : #endif
40 : #include "reg.h"
41 : #include "html.h"
42 : #include "php_string.h"
43 : #include "SAPI.h"
44 : #if HAVE_LOCALE_H
45 : #include <locale.h>
46 : #endif
47 : #if HAVE_LANGINFO_H
48 : #include <langinfo.h>
49 : #endif
50 :
51 : #if HAVE_MBSTRING
52 : # include "ext/mbstring/mbstring.h"
53 : ZEND_EXTERN_MODULE_GLOBALS(mbstring)
54 : #endif
55 :
/* Character sets the entity code knows how to handle.
 * cs_terminator is not a real charset: it marks the last row of entity_map[]. */
enum entity_charset {
	cs_terminator,
	cs_8859_1,
	cs_cp1252,
	cs_8859_15,
	cs_utf_8,
	cs_big5,
	cs_gb2312,
	cs_big5hkscs,
	cs_sjis,
	cs_eucjp,
	cs_koi8r,
	cs_cp1251,
	cs_8859_5,
	cs_cp866,
	cs_macroman
};

/* An entity table is an array of immutable entity names (stored without the
 * surrounding '&' and ';'), one slot per character code.  A NULL slot means
 * there is no entity for that code. */
typedef const char *const entity_table_t;
62 :
63 : /* codepage 1252 is a Windows extension to iso-8859-1. */
64 : static entity_table_t ent_cp_1252[] = {
65 : "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger",
66 : "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig",
67 : NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo",
68 : "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo",
69 : "oelig", NULL, NULL, "Yuml"
70 : };
71 :
72 : static entity_table_t ent_iso_8859_1[] = {
73 : "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar",
74 : "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg",
75 : "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro",
76 : "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14",
77 : "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc",
78 : "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
79 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
80 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
81 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
82 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
83 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
84 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
85 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
86 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
87 : "uuml", "yacute", "thorn", "yuml"
88 : };
89 :
90 : static entity_table_t ent_iso_8859_15[] = {
91 : "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron",
92 : "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg",
93 : "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */
94 : "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm",
95 : "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute",
96 : "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave",
97 : "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc",
98 : "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde",
99 : "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml",
100 : "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc",
101 : "atilde", "auml", "aring", "aelig", "ccedil", "egrave",
102 : "eacute", "ecirc", "euml", "igrave", "iacute", "icirc",
103 : "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde",
104 : "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc",
105 : "uuml", "yacute", "thorn", "yuml"
106 : };
107 :
108 : static entity_table_t ent_uni_338_402[] = {
109 : /* 338 (0x0152) */
110 : "OElig", "oelig", NULL, NULL, NULL, NULL,
111 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
112 : /* 352 (0x0160) */
113 : "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL,
114 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
115 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
116 : /* 376 (0x0178) */
117 : "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
118 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
119 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
120 : /* 400 (0x0190) */
121 : NULL, NULL, "fnof"
122 : };
123 :
124 : static entity_table_t ent_uni_spacing[] = {
125 : /* 710 */
126 : "circ",
127 : /* 711 - 730 */
128 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
129 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
130 : /* 731 - 732 */
131 : NULL, "tilde"
132 : };
133 :
134 : static entity_table_t ent_uni_greek[] = {
135 : /* 913 */
136 : "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta",
137 : "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho",
138 : NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega",
139 : /* 938 - 944 are not mapped */
140 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
141 : "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
142 : "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
143 : "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
144 : /* 970 - 976 are not mapped */
145 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
146 : "thetasym", "upsih",
147 : NULL, NULL, NULL,
148 : "piv"
149 : };
150 :
151 : static entity_table_t ent_uni_punct[] = {
152 : /* 8194 */
153 : "ensp", "emsp", NULL, NULL, NULL, NULL, NULL,
154 : "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm",
155 : NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL,
156 : /* 8216 */
157 : "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL,
158 : "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip",
159 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL,
160 : /* 8242 */
161 : "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL,
162 : NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL,
163 : "frasl"
164 : };
165 :
166 : static entity_table_t ent_uni_euro[] = {
167 : "euro"
168 : };
169 :
170 : static entity_table_t ent_uni_8465_8501[] = {
171 : /* 8465 */
172 : "image", NULL, NULL, NULL, NULL, NULL, NULL,
173 : /* 8472 */
174 : "weierp", NULL, NULL, NULL,
175 : /* 8476 */
176 : "real", NULL, NULL, NULL, NULL, NULL,
177 : /* 8482 */
178 : "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180 : /* 8501 */
181 : "alefsym",
182 : };
183 :
184 : static entity_table_t ent_uni_8592_9002[] = {
185 : /* 8592 (0x2190) */
186 : "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL,
187 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
188 : /* 8608 (0x21a0) */
189 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
190 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
191 : /* 8624 (0x21b0) */
192 : NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL,
193 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
194 : /* 8640 (0x21c0) */
195 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
196 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
197 : /* 8656 (0x21d0) */
198 : "lArr", "uArr", "rArr", "dArr", "hArr", "vArr", NULL, NULL,
199 : NULL, NULL, "lAarr", "rAarr", NULL, "rarrw", NULL, NULL,
200 : /* 8672 (0x21e0) */
201 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
202 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
203 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
204 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
205 : /* 8704 (0x2200) */
206 : "forall", "comp", "part", "exist", "nexist", "empty", NULL, "nabla",
207 : "isin", "notin", "epsis", "ni", "notni", "bepsi", NULL, "prod",
208 : /* 8720 (0x2210) */
209 : "coprod", "sum", "minus", "mnplus", "plusdo", NULL, "setmn", "lowast",
210 : "compfn", NULL, "radic", NULL, NULL, "prop", "infin", "ang90",
211 : /* 8736 (0x2220) */
212 : "ang", "angmsd", "angsph", "mid", "nmid", "par", "npar", "and",
213 : "or", "cap", "cup", "int", NULL, NULL, "conint", NULL,
214 : /* 8752 (0x2230) */
215 : NULL, NULL, NULL, NULL, "there4", "becaus", NULL, NULL,
216 : NULL, NULL, NULL, NULL, "sim", "bsim", NULL, NULL,
217 : /* 8768 (0x2240) */
218 : "wreath", "nsim", NULL, "sime", "nsime", "cong", NULL, "ncong",
219 : "asymp", "nap", "ape", NULL, "bcong", "asymp", "bump", "bumpe",
220 : /* 8784 (0x2250) */
221 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
222 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
223 : /* 8800 (0x2260) */
224 : "ne", "equiv", NULL, NULL, "le", "ge", "lE", "gE",
225 : "lnE", "gnE", "Lt", "Gt", "twixt", NULL, "nlt", "ngt",
226 : /* 8816 (0x2270) */
227 : "nles", "nges", "lsim", "gsim", NULL, NULL, "lg", "gl",
228 : NULL, NULL, "pr", "sc", "cupre", "sscue", "prsim", "scsim",
229 : /* 8832 (0x2280) */
230 : "npr", "nsc", "sub", "sup", "nsub", "nsup", "sube", "supe",
231 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
232 : /* 8848 (0x2290) */
233 : NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes",
234 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
235 : /* 8864 (0x22a0) */
236 : NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL,
237 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
238 : /* 8880 (0x22b0) */
239 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
240 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
241 : /* 8896 (0x22c0) */
242 : NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL,
243 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
244 : /* 8912 (0x22d0) */
245 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
246 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
247 : /* 8928 (0x22e0) */
248 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
249 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
250 : /* 8944 (0x22f0) */
251 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
252 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
253 : /* 8960 (0x2300) */
254 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
255 : "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL,
256 : /* 8976 (0x2310) */
257 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
258 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
259 : /* 8992 (0x2320) */
260 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
261 : NULL, "lang", "rang"
262 : };
263 :
264 : static entity_table_t ent_uni_9674[] = {
265 : /* 9674 */
266 : "loz"
267 : };
268 :
269 : static entity_table_t ent_uni_9824_9830[] = {
270 : /* 9824 */
271 : "spades", NULL, NULL, "clubs", NULL, "hearts", "diams"
272 : };
273 :
274 : static entity_table_t ent_koi8r[] = {
275 : "#1105", /* "jo "*/
276 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
277 : NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */
278 : NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
279 : "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092",
280 : "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084",
281 : "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090",
282 : "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096",
283 : "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041",
284 : "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048",
285 : "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055",
286 : "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042",
287 : "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063",
288 : "#1066"
289 : };
290 :
291 : static entity_table_t ent_cp_1251[] = {
292 : "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger",
293 : "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036",
294 : "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220",
295 : "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250",
296 : "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118",
297 : "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy",
298 : "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn",
299 : "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105",
300 : "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111",
301 : "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046",
302 : "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053",
303 : "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060",
304 : "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067",
305 : "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074",
306 : "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081",
307 : "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088",
308 : "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095",
309 : "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102",
310 : "#1103"
311 : };
312 :
313 : static entity_table_t ent_iso_8859_5[] = {
314 : "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062",
315 : "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069",
316 : "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076",
317 : "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083",
318 : "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090",
319 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
320 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104",
321 : "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111",
322 : "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118",
323 : "#1119"
324 : };
325 :
326 : static entity_table_t ent_cp_866[] = {
327 :
328 : "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566",
329 : "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552",
330 : "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560",
331 : "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608",
332 : "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090",
333 : "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097",
334 : "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025",
335 : "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118",
336 : "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632",
337 : "#160"
338 : };
339 :
340 : /* MacRoman has a couple of low-ascii chars that need mapping too */
341 : /* Vertical tab (ASCII 11) is often used to store line breaks inside */
342 : /* DB exports, this mapping changes it to a space */
343 : static entity_table_t ent_macroman[] = {
344 : "sp", NULL, NULL, NULL,
345 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
346 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
347 : NULL, NULL, NULL, NULL, NULL, "quot", NULL,
348 : NULL, NULL, "amp", NULL, NULL, NULL, NULL,
349 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
350 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
351 : NULL, NULL, NULL, "lt", NULL, "gt", NULL,
352 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
353 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
354 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
355 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
356 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
357 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
358 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
359 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
360 : NULL, NULL, NULL, NULL, NULL, NULL, NULL,
361 : NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml",
362 : "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring",
363 : "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave",
364 : "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml",
365 : "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg",
366 : "cent", "pound", "sect", "bull", "para", "szlig", "reg",
367 : "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash",
368 : "infin", "plusmn", "le", "ge", "yen", "micro", "part",
369 : "sum", "prod", "pi", "int", "ordf", "ordm", "Omega",
370 : "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof",
371 : "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave",
372 : "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo",
373 : "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml",
374 : "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger",
375 : "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute",
376 : "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute",
377 : "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305",
378 : "circ", "tilde", "macr", "#728", "#729", "#730", "cedil",
379 : "#733", "#731", "#711"
380 : };
381 :
382 : struct html_entity_map {
383 : enum entity_charset charset; /* charset identifier */
384 : unsigned short basechar; /* char code at start of table */
385 : unsigned short endchar; /* last char code in the table */
386 : entity_table_t *table; /* the table of mappings */
387 : };
388 :
389 : static const struct html_entity_map entity_map[] = {
390 : { cs_cp1252, 0x80, 0x9f, ent_cp_1252 },
391 : { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 },
392 : { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 },
393 : { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 },
394 : { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 },
395 : { cs_utf_8, 338, 402, ent_uni_338_402 },
396 : { cs_utf_8, 710, 732, ent_uni_spacing },
397 : { cs_utf_8, 913, 982, ent_uni_greek },
398 : { cs_utf_8, 8194, 8260, ent_uni_punct },
399 : { cs_utf_8, 8364, 8364, ent_uni_euro },
400 : { cs_utf_8, 8465, 8501, ent_uni_8465_8501 },
401 : { cs_utf_8, 8592, 9002, ent_uni_8592_9002 },
402 : { cs_utf_8, 9674, 9674, ent_uni_9674 },
403 : { cs_utf_8, 9824, 9830, ent_uni_9824_9830 },
404 : { cs_big5, 0xa0, 0xff, ent_iso_8859_1 },
405 : { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 },
406 : { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 },
407 : { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 },
408 : { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 },
409 : { cs_koi8r, 0xa3, 0xff, ent_koi8r },
410 : { cs_cp1251, 0x80, 0xff, ent_cp_1251 },
411 : { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 },
412 : { cs_cp866, 0xc0, 0xff, ent_cp_866 },
413 : { cs_macroman, 0x0b, 0xff, ent_macroman },
414 : { cs_terminator }
415 : };
416 :
417 : static const struct {
418 : const char *codeset;
419 : enum entity_charset charset;
420 : } charset_map[] = {
421 : { "ISO-8859-1", cs_8859_1 },
422 : { "ISO8859-1", cs_8859_1 },
423 : { "ISO-8859-15", cs_8859_15 },
424 : { "ISO8859-15", cs_8859_15 },
425 : { "utf-8", cs_utf_8 },
426 : { "cp1252", cs_cp1252 },
427 : { "Windows-1252", cs_cp1252 },
428 : { "1252", cs_cp1252 },
429 : { "BIG5", cs_big5 },
430 : { "950", cs_big5 },
431 : { "GB2312", cs_gb2312 },
432 : { "936", cs_gb2312 },
433 : { "BIG5-HKSCS", cs_big5hkscs },
434 : { "Shift_JIS", cs_sjis },
435 : { "SJIS", cs_sjis },
436 : { "932", cs_sjis },
437 : { "EUCJP", cs_eucjp },
438 : { "EUC-JP", cs_eucjp },
439 : { "KOI8-R", cs_koi8r },
440 : { "koi8-ru", cs_koi8r },
441 : { "koi8r", cs_koi8r },
442 : { "cp1251", cs_cp1251 },
443 : { "Windows-1251", cs_cp1251 },
444 : { "win-1251", cs_cp1251 },
445 : { "iso8859-5", cs_8859_5 },
446 : { "iso-8859-5", cs_8859_5 },
447 : { "cp866", cs_cp866 },
448 : { "866", cs_cp866 },
449 : { "ibm866", cs_cp866 },
450 : { "MacRoman", cs_macroman },
451 : { NULL }
452 : };
453 :
454 : static const struct {
455 : unsigned short charcode;
456 : char *entity;
457 : int entitylen;
458 : int flags;
459 : } basic_entities[] = {
460 : { '"', """, 6, ENT_HTML_QUOTE_DOUBLE },
461 : { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE },
462 : { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE },
463 : { '<', "<", 4, 0 },
464 : { '>', ">", 4, 0 },
465 : { 0, NULL, 0, 0 }
466 : };
467 :
/* Decoding-side record: a character code together with the entity text that
 * represents it, held inline in a fixed 8-byte buffer. */
struct basic_entities_dec {
	unsigned short charcode;	/* character the entity stands for */
	char entity[8];			/* entity text, stored in place */
	int entitylen;			/* number of bytes used in entity */
};
473 :
/* Helpers for get_next_char().  Both rely on that function's locals
 * (pos, mbpos, mbspace, this_char) and parameters (newpos, mbseq, mbseqlen).
 *
 * They are wrapped in do { ... } while (0) so that "MB_WRITE(x);" is a single
 * statement and stays valid inside an unbraced if / else. */

/* Publish the new read position and the collected byte sequence
 * (NUL-terminated, length in *mbseqlen), then return this_char. */
#define MB_RETURN do { \
		*newpos = pos; \
		mbseq[mbpos] = '\0'; \
		*mbseqlen = mbpos; \
		return this_char; \
	} while (0)

/* Append one raw byte to mbseq.  One slot is always kept free for the
 * terminating NUL: when the remaining space drops to zero the byte is not
 * stored and the enclosing function returns through MB_RETURN. */
#define MB_WRITE(mbchar) do { \
		mbspace--; \
		if (mbspace == 0) { \
			MB_RETURN; \
		} \
		mbseq[mbpos++] = (mbchar); \
	} while (0)
486 :
487 : /* {{{ get_next_char
488 : */
489 : inline static unsigned short get_next_char(enum entity_charset charset,
490 : unsigned char * str,
491 : int * newpos,
492 : unsigned char * mbseq,
493 : int * mbseqlen)
494 0 : {
495 0 : int pos = *newpos;
496 0 : int mbpos = 0;
497 0 : int mbspace = *mbseqlen;
498 0 : unsigned short this_char = str[pos++];
499 :
500 0 : if (mbspace <= 0) {
501 0 : *mbseqlen = 0;
502 0 : return this_char;
503 : }
504 :
505 0 : MB_WRITE((unsigned char)this_char);
506 :
507 0 : switch (charset) {
508 : case cs_utf_8:
509 : {
510 0 : unsigned long utf = 0;
511 0 : int stat = 0;
512 0 : int more = 1;
513 :
514 : /* unpack utf-8 encoding into a wide char.
515 : * Code stolen from the mbstring extension */
516 :
517 : do {
518 0 : if (this_char < 0x80) {
519 0 : more = 0;
520 0 : break;
521 0 : } else if (this_char < 0xc0) {
522 0 : switch (stat) {
523 : case 0x10: /* 2, 2nd */
524 : case 0x21: /* 3, 3rd */
525 : case 0x32: /* 4, 4th */
526 : case 0x43: /* 5, 5th */
527 : case 0x54: /* 6, 6th */
528 : /* last byte in sequence */
529 0 : more = 0;
530 0 : utf |= (this_char & 0x3f);
531 0 : this_char = (unsigned short)utf;
532 0 : break;
533 : case 0x20: /* 3, 2nd */
534 : case 0x31: /* 4, 3rd */
535 : case 0x42: /* 5, 4th */
536 : case 0x53: /* 6, 5th */
537 : /* penultimate char */
538 0 : utf |= ((this_char & 0x3f) << 6);
539 0 : stat++;
540 0 : break;
541 : case 0x30: /* 4, 2nd */
542 : case 0x41: /* 5, 3rd */
543 : case 0x52: /* 6, 4th */
544 0 : utf |= ((this_char & 0x3f) << 12);
545 0 : stat++;
546 0 : break;
547 : case 0x40: /* 5, 2nd */
548 : case 0x51:
549 0 : utf |= ((this_char & 0x3f) << 18);
550 0 : stat++;
551 0 : break;
552 : case 0x50: /* 6, 2nd */
553 0 : utf |= ((this_char & 0x3f) << 24);
554 0 : stat++;
555 0 : break;
556 : default:
557 : /* invalid */
558 0 : more = 0;
559 : }
560 : }
561 : /* lead byte */
562 0 : else if (this_char < 0xe0) {
563 0 : stat = 0x10; /* 2 byte */
564 0 : utf = (this_char & 0x1f) << 6;
565 0 : } else if (this_char < 0xf0) {
566 0 : stat = 0x20; /* 3 byte */
567 0 : utf = (this_char & 0xf) << 12;
568 0 : } else if (this_char < 0xf8) {
569 0 : stat = 0x30; /* 4 byte */
570 0 : utf = (this_char & 0x7) << 18;
571 0 : } else if (this_char < 0xfc) {
572 0 : stat = 0x40; /* 5 byte */
573 0 : utf = (this_char & 0x3) << 24;
574 0 : } else if (this_char < 0xfe) {
575 0 : stat = 0x50; /* 6 byte */
576 0 : utf = (this_char & 0x1) << 30;
577 : } else {
578 : /* invalid; bail */
579 0 : more = 0;
580 0 : break;
581 : }
582 :
583 0 : if (more) {
584 0 : this_char = str[pos++];
585 0 : MB_WRITE((unsigned char)this_char);
586 : }
587 0 : } while (more);
588 : }
589 0 : break;
590 : case cs_big5:
591 : case cs_gb2312:
592 : case cs_big5hkscs:
593 : {
594 : /* check if this is the first of a 2-byte sequence */
595 0 : if (this_char >= 0xa1 && this_char <= 0xfe) {
596 : /* peek at the next char */
597 0 : unsigned char next_char = str[pos];
598 0 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
599 : (next_char >= 0xa1 && next_char <= 0xfe)) {
600 : /* yes, this a wide char */
601 0 : this_char <<= 8;
602 0 : MB_WRITE(next_char);
603 0 : this_char |= next_char;
604 0 : pos++;
605 : }
606 :
607 : }
608 0 : break;
609 : }
610 : case cs_sjis:
611 : {
612 : /* check if this is the first of a 2-byte sequence */
613 0 : if ( (this_char >= 0x81 && this_char <= 0x9f) ||
614 : (this_char >= 0xe0 && this_char <= 0xef)
615 : ) {
616 : /* peek at the next char */
617 0 : unsigned char next_char = str[pos];
618 0 : if ((next_char >= 0x40 && next_char <= 0x7e) ||
619 : (next_char >= 0x80 && next_char <= 0xfc))
620 : {
621 : /* yes, this a wide char */
622 0 : this_char <<= 8;
623 0 : MB_WRITE(next_char);
624 0 : this_char |= next_char;
625 0 : pos++;
626 : }
627 :
628 : }
629 0 : break;
630 : }
631 : case cs_eucjp:
632 : {
633 : /* check if this is the first of a multi-byte sequence */
634 0 : if (this_char >= 0xa1 && this_char <= 0xfe) {
635 : /* peek at the next char */
636 0 : unsigned char next_char = str[pos];
637 0 : if (next_char >= 0xa1 && next_char <= 0xfe) {
638 : /* yes, this a jis kanji char */
639 0 : this_char <<= 8;
640 0 : MB_WRITE(next_char);
641 0 : this_char |= next_char;
642 0 : pos++;
643 : }
644 :
645 0 : } else if (this_char == 0x8e) {
646 : /* peek at the next char */
647 0 : unsigned char next_char = str[pos];
648 0 : if (next_char >= 0xa1 && next_char <= 0xdf) {
649 : /* JIS X 0201 kana */
650 0 : this_char <<= 8;
651 0 : MB_WRITE(next_char);
652 0 : this_char |= next_char;
653 0 : pos++;
654 : }
655 :
656 0 : } else if (this_char == 0x8f) {
657 : /* peek at the next two char */
658 0 : unsigned char next_char = str[pos];
659 0 : unsigned char next2_char = str[pos+1];
660 0 : if ((next_char >= 0xa1 && next_char <= 0xfe) &&
661 : (next2_char >= 0xa1 && next2_char <= 0xfe)) {
662 : /* JIS X 0212 hojo-kanji */
663 0 : this_char <<= 8;
664 0 : MB_WRITE(next_char);
665 0 : this_char |= next_char;
666 0 : pos++;
667 0 : this_char <<= 8;
668 0 : MB_WRITE(next2_char);
669 0 : this_char |= next2_char;
670 0 : pos++;
671 : }
672 :
673 : }
674 : break;
675 : }
676 : default:
677 : break;
678 : }
679 0 : MB_RETURN;
680 : }
681 : /* }}} */
682 :
683 : /* {{{ entity_charset determine_charset
684 : * returns the charset identifier based on current locale or a hint.
685 : * defaults to iso-8859-1 */
686 : static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC)
687 0 : {
688 : int i;
689 0 : enum entity_charset charset = cs_8859_1;
690 0 : int len = 0;
691 0 : zval *uf_result = NULL;
692 :
693 : /* Guarantee default behaviour for backwards compatibility */
694 0 : if (charset_hint == NULL)
695 0 : return cs_8859_1;
696 :
697 0 : if ((len = strlen(charset_hint)) != 0) {
698 0 : goto det_charset;
699 : }
700 : #if HAVE_MBSTRING
701 : #if !defined(COMPILE_DL_MBSTRING)
702 : /* XXX: Ugly things. Why don't we look for a more sophisticated way? */
703 : switch (MBSTRG(current_internal_encoding)) {
704 : case mbfl_no_encoding_8859_1:
705 : return cs_8859_1;
706 :
707 : case mbfl_no_encoding_utf8:
708 : return cs_utf_8;
709 :
710 : case mbfl_no_encoding_euc_jp:
711 : case mbfl_no_encoding_eucjp_win:
712 : return cs_eucjp;
713 :
714 : case mbfl_no_encoding_sjis:
715 : case mbfl_no_encoding_sjis_win:
716 : case mbfl_no_encoding_sjis_mac:
717 : return cs_sjis;
718 :
719 : case mbfl_no_encoding_cp1252:
720 : return cs_cp1252;
721 :
722 : case mbfl_no_encoding_8859_15:
723 : return cs_8859_15;
724 :
725 : case mbfl_no_encoding_big5:
726 : return cs_big5;
727 :
728 : case mbfl_no_encoding_euc_cn:
729 : case mbfl_no_encoding_hz:
730 : case mbfl_no_encoding_cp936:
731 : return cs_gb2312;
732 :
733 : case mbfl_no_encoding_koi8r:
734 : return cs_koi8r;
735 :
736 : case mbfl_no_encoding_cp866:
737 : return cs_cp866;
738 :
739 : case mbfl_no_encoding_cp1251:
740 : return cs_cp1251;
741 :
742 : case mbfl_no_encoding_8859_5:
743 : return cs_8859_5;
744 :
745 : default:
746 : ;
747 : }
748 : #else
749 : {
750 : zval nm_mb_internal_encoding;
751 :
752 : ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0);
753 :
754 : if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) {
755 :
756 : charset_hint = Z_STRVAL_P(uf_result);
757 : len = Z_STRLEN_P(uf_result);
758 :
759 : if (len == 4) { /* sizeof(none|auto|pass)-1 */
760 : if (!memcmp("pass", charset_hint, sizeof("pass") - 1) ||
761 : !memcmp("auto", charset_hint, sizeof("auto") - 1) ||
762 : !memcmp("none", charset_hint, sizeof("none") - 1)) {
763 :
764 : charset_hint = NULL;
765 : len = 0;
766 : }
767 : }
768 : goto det_charset;
769 : }
770 : }
771 : #endif
772 : #endif
773 :
774 0 : charset_hint = SG(default_charset);
775 0 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
776 0 : goto det_charset;
777 : }
778 :
779 : /* try to detect the charset for the locale */
780 : #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET)
781 0 : charset_hint = nl_langinfo(CODESET);
782 0 : if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
783 0 : goto det_charset;
784 : }
785 : #endif
786 :
787 : #if HAVE_LOCALE_H
788 : /* try to figure out the charset from the locale */
789 : {
790 : char *localename;
791 : char *dot, *at;
792 :
793 : /* lang[_territory][.codeset][@modifier] */
794 0 : localename = setlocale(LC_CTYPE, NULL);
795 :
796 0 : dot = strchr(localename, '.');
797 0 : if (dot) {
798 0 : dot++;
799 : /* locale specifies a codeset */
800 0 : at = strchr(dot, '@');
801 0 : if (at)
802 0 : len = at - dot;
803 : else
804 0 : len = strlen(dot);
805 0 : charset_hint = dot;
806 : } else {
807 : /* no explicit name; see if the name itself
808 : * is the charset */
809 0 : charset_hint = localename;
810 0 : len = strlen(charset_hint);
811 : }
812 : }
813 : #endif
814 :
815 0 : det_charset:
816 :
817 0 : if (charset_hint) {
818 0 : int found = 0;
819 :
820 : /* now walk the charset map and look for the codeset */
821 0 : for (i = 0; charset_map[i].codeset; i++) {
822 0 : if (strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) {
823 0 : charset = charset_map[i].charset;
824 0 : found = 1;
825 0 : break;
826 : }
827 : }
828 0 : if (!found) {
829 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1",
830 : charset_hint);
831 : }
832 : }
833 0 : if (uf_result != NULL) {
834 0 : zval_ptr_dtor(&uf_result);
835 : }
836 0 : return charset;
837 : }
838 : /* }}} */
839 :
/* {{{ php_utf32_utf8
 * Encodes the code point k as UTF-8 into buf and NUL-terminates it.
 * Sequences of up to six bytes are produced, so buf must have room for at
 * least seven bytes.  Returns the number of bytes written, not counting the
 * terminating NUL.  Any k below 0x80 (negative values included) is stored as
 * a single byte. */
size_t php_utf32_utf8(unsigned char *buf, int k)
{
	size_t nbytes;
	size_t i;
	int lead;

	/* single byte: no lead / trail structure at all */
	if (k < 0x80) {
		buf[0] = k;
		buf[1] = '\0';
		return 1;
	}

	/* pick the sequence length and the marker bits of the lead byte */
	if (k < 0x800) {
		nbytes = 2;
		lead = 0xc0;
	} else if (k < 0x10000) {
		nbytes = 3;
		lead = 0xe0;
	} else if (k < 0x200000) {
		nbytes = 4;
		lead = 0xf0;
	} else if (k < 0x4000000) {
		nbytes = 5;
		lead = 0xf8;
	} else {
		nbytes = 6;
		lead = 0xfc;
	}

	/* fill the trail bytes from the end, six payload bits at a time */
	for (i = nbytes - 1; i > 0; i--) {
		buf[i] = 0x80 | (k & 0x3f);
		k >>= 6;
	}

	/* whatever is left of k belongs in the lead byte */
	buf[0] = lead | k;
	buf[nbytes] = '\0';

	return nbytes;
}
/* }}} */
884 :
885 : /* {{{ php_unescape_html_entities
886 : */
887 : PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
888 0 : {
889 : int retlen;
890 : int j, k;
891 : char *replaced, *ret, *p, *q, *lim, *next;
892 0 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
893 : unsigned char replacement[15];
894 : int replacement_len;
895 :
896 0 : ret = estrndup(old, oldlen);
897 0 : retlen = oldlen;
898 0 : if (!retlen) {
899 0 : goto empty_source;
900 : }
901 :
902 0 : if (all) {
903 : /* look for a match in the maps for this charset */
904 0 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
905 0 : if (entity_map[j].charset != charset)
906 0 : continue;
907 :
908 0 : for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) {
909 : unsigned char entity[32];
910 0 : int entity_length = 0;
911 :
912 0 : if (entity_map[j].table[k - entity_map[j].basechar] == NULL)
913 0 : continue;
914 :
915 0 : entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]);
916 0 : if (entity_length >= sizeof(entity)) {
917 0 : continue;
918 : }
919 :
920 : /* When we have MBCS entities in the tables above, this will need to handle it */
921 0 : replacement_len = 0;
922 0 : switch (charset) {
923 : case cs_8859_1:
924 : case cs_cp1252:
925 : case cs_8859_15:
926 : case cs_cp1251:
927 : case cs_8859_5:
928 : case cs_cp866:
929 0 : replacement[0] = k;
930 0 : replacement[1] = '\0';
931 0 : replacement_len = 1;
932 0 : break;
933 :
934 : case cs_big5:
935 : case cs_gb2312:
936 : case cs_big5hkscs:
937 : case cs_sjis:
938 : case cs_eucjp:
939 : /* we cannot properly handle those multibyte encodings
940 : * with php_str_to_str. skip it. */
941 0 : continue;
942 :
943 : case cs_utf_8:
944 0 : replacement_len = php_utf32_utf8(replacement, k);
945 0 : break;
946 :
947 : default:
948 0 : php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!");
949 0 : efree(ret);
950 0 : return NULL;
951 : }
952 :
953 0 : if (php_memnstr(ret, entity, entity_length, ret+retlen)) {
954 0 : replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen);
955 0 : efree(ret);
956 0 : ret = replaced;
957 : }
958 : }
959 : }
960 : }
961 :
962 0 : for (j = 0; basic_entities[j].charcode != 0; j++) {
963 :
964 0 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
965 0 : continue;
966 :
967 0 : replacement[0] = (unsigned char)basic_entities[j].charcode;
968 0 : replacement[1] = '\0';
969 :
970 0 : if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) {
971 0 : replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen);
972 0 : efree(ret);
973 0 : ret = replaced;
974 : }
975 : }
976 :
977 : /* replace numeric entities & "&" */
978 0 : lim = ret + retlen;
979 0 : for (p = ret, q = ret; p < lim;) {
980 : int code;
981 :
982 0 : if (p[0] == '&') {
983 0 : if (p + 2 < lim) {
984 0 : if (p[1] == '#') {
985 0 : int invalid_code = 0;
986 :
987 0 : if (p[2] == 'x' || p[2] == 'X') {
988 0 : code = strtol(p + 3, &next, 16);
989 : } else {
990 0 : code = strtol(p + 2, &next, 10);
991 : }
992 :
993 0 : if (next != NULL && *next == ';') {
994 0 : switch (charset) {
995 : case cs_utf_8:
996 0 : q += php_utf32_utf8(q, code);
997 0 : break;
998 :
999 : case cs_8859_1:
1000 : case cs_8859_5:
1001 : case cs_8859_15:
1002 0 : if ((code >= 0x80 && code < 0xa0) || code > 0xff) {
1003 0 : invalid_code = 1;
1004 : } else {
1005 0 : if (code == 39 || !quote_style) {
1006 0 : invalid_code = 1;
1007 : } else {
1008 0 : *(q++) = code;
1009 : }
1010 : }
1011 0 : break;
1012 :
1013 : case cs_cp1252:
1014 : case cs_cp1251:
1015 : case cs_cp866:
1016 0 : if (code > 0xff) {
1017 0 : invalid_code = 1;
1018 : } else {
1019 0 : *(q++) = code;
1020 : }
1021 0 : break;
1022 :
1023 : case cs_big5:
1024 : case cs_big5hkscs:
1025 : case cs_sjis:
1026 : case cs_eucjp:
1027 0 : if (code >= 0x80) {
1028 0 : invalid_code = 1;
1029 : } else {
1030 0 : *(q++) = code;
1031 : }
1032 0 : break;
1033 :
1034 : case cs_gb2312:
1035 0 : if (code >= 0x81) {
1036 0 : invalid_code = 1;
1037 : } else {
1038 0 : *(q++) = code;
1039 : }
1040 0 : break;
1041 :
1042 : default:
1043 : /* for backwards compatilibity */
1044 0 : invalid_code = 1;
1045 : break;
1046 : }
1047 0 : if (invalid_code) {
1048 0 : for (; p <= next; p++) {
1049 0 : *(q++) = *p;
1050 : }
1051 : }
1052 0 : p = next + 1;
1053 : } else {
1054 0 : *(q++) = *(p++);
1055 0 : *(q++) = *(p++);
1056 : }
1057 0 : } else if (p + 4 < lim &&
1058 : p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' &&
1059 : p[4] == ';') {
1060 0 : *(q++) = '&';
1061 0 : p += 5;
1062 : } else {
1063 0 : *(q++) = *(p++);
1064 0 : *(q++) = *(p++);
1065 : }
1066 : } else {
1067 0 : *(q++) = *(p++);
1068 : }
1069 : } else {
1070 0 : *(q++) = *(p++);
1071 : }
1072 : }
1073 0 : *q = '\0';
1074 0 : retlen = (size_t)(q - ret);
1075 0 : empty_source:
1076 0 : *newlen = retlen;
1077 0 : return ret;
1078 : }
1079 : /* }}} */
1080 :
1081 :
1082 :
1083 :
1084 : /* {{{ php_escape_html_entities
1085 : */
1086 : PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
1087 0 : {
1088 : int i, j, maxlen, len;
1089 : char *replaced;
1090 0 : enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
1091 : int matches_map;
1092 :
1093 0 : maxlen = 2 * oldlen;
1094 0 : if (maxlen < 128)
1095 0 : maxlen = 128;
1096 0 : replaced = emalloc (maxlen);
1097 0 : len = 0;
1098 :
1099 0 : i = 0;
1100 0 : while (i < oldlen) {
1101 : unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
1102 0 : int mbseqlen = sizeof(mbsequence);
1103 0 : unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
1104 :
1105 0 : matches_map = 0;
1106 :
1107 0 : if (len + 16 > maxlen)
1108 0 : replaced = erealloc (replaced, maxlen += 128);
1109 :
1110 0 : if (all) {
1111 : /* look for a match in the maps for this charset */
1112 0 : unsigned char *rep = NULL;
1113 :
1114 :
1115 0 : for (j = 0; entity_map[j].charset != cs_terminator; j++) {
1116 0 : if (entity_map[j].charset == charset
1117 : && this_char >= entity_map[j].basechar
1118 : && this_char <= entity_map[j].endchar) {
1119 0 : rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar];
1120 0 : if (rep == NULL) {
1121 : /* there is no entity for this position; fall through and
1122 : * just output the character itself */
1123 0 : break;
1124 : }
1125 :
1126 0 : matches_map = 1;
1127 0 : break;
1128 : }
1129 : }
1130 :
1131 0 : if (matches_map) {
1132 0 : int l = strlen(rep);
1133 : /* increase the buffer size */
1134 0 : if (len + 2 + l >= maxlen) {
1135 0 : replaced = erealloc(replaced, maxlen += 128);
1136 : }
1137 :
1138 0 : replaced[len++] = '&';
1139 0 : strlcpy(replaced + len, rep, maxlen);
1140 0 : len += l;
1141 0 : replaced[len++] = ';';
1142 : }
1143 : }
1144 0 : if (!matches_map) {
1145 0 : int is_basic = 0;
1146 :
1147 0 : if (this_char == '&') {
1148 0 : memcpy(replaced + len, "&", sizeof("&") - 1);
1149 0 : len += sizeof("&") - 1;
1150 0 : is_basic = 1;
1151 : } else {
1152 0 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1153 0 : if ((basic_entities[j].charcode != this_char) ||
1154 : (basic_entities[j].flags &&
1155 : (quote_style & basic_entities[j].flags) == 0)) {
1156 : continue;
1157 : }
1158 :
1159 0 : memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen);
1160 0 : len += basic_entities[j].entitylen;
1161 :
1162 0 : is_basic = 1;
1163 0 : break;
1164 : }
1165 : }
1166 :
1167 0 : if (!is_basic) {
1168 : /* a wide char without a named entity; pass through the original sequence */
1169 0 : if (mbseqlen > 1) {
1170 0 : memcpy(replaced + len, mbsequence, mbseqlen);
1171 0 : len += mbseqlen;
1172 : } else {
1173 0 : replaced[len++] = (unsigned char)this_char;
1174 : }
1175 : }
1176 : }
1177 : }
1178 0 : replaced[len] = '\0';
1179 0 : *newlen = len;
1180 :
1181 0 : return replaced;
1182 :
1183 :
1184 : }
1185 : /* }}} */
1186 :
1187 : /* {{{ php_html_entities
1188 : */
1189 : static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
1190 0 : {
1191 0 : char *str, *hint_charset = NULL;
1192 0 : int str_len, hint_charset_len = 0;
1193 : int len;
1194 0 : long quote_style = ENT_COMPAT;
1195 : char *replaced;
1196 :
1197 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1198 0 : return;
1199 : }
1200 :
1201 0 : replaced = php_escape_html_entities(str, str_len, &len, all, quote_style, hint_charset TSRMLS_CC);
1202 0 : RETVAL_STRINGL(replaced, len, 0);
1203 : }
1204 : /* }}} */
1205 :
1206 : #define HTML_SPECIALCHARS 0
1207 : #define HTML_ENTITIES 1
1208 :
1209 : /* {{{ register_html_constants
1210 : */
1211 : void register_html_constants(INIT_FUNC_ARGS)
1212 220 : {
1213 220 : REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS);
1214 220 : REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS);
1215 220 : REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS);
1216 220 : REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS);
1217 220 : REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS);
1218 220 : }
1219 : /* }}} */
1220 :
1221 : /* {{{ proto string htmlspecialchars(string string [, int quote_style][, string charset])
1222 : Convert special characters to HTML entities */
1223 : PHP_FUNCTION(htmlspecialchars)
1224 0 : {
1225 0 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1226 0 : }
1227 : /* }}} */
1228 :
1229 : /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style])
1230 : Convert special HTML entities back to characters */
1231 : PHP_FUNCTION(htmlspecialchars_decode)
1232 0 : {
1233 : char *str, *new_str, *e, *p;
1234 : int len, j, i, new_len;
1235 0 : long quote_style = ENT_COMPAT;
1236 : struct basic_entities_dec basic_entities_dec[8];
1237 :
1238 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) {
1239 0 : return;
1240 : }
1241 :
1242 0 : new_str = estrndup(str, len);
1243 0 : new_len = len;
1244 0 : e = new_str + new_len;
1245 :
1246 0 : if (!(p = memchr(new_str, '&', new_len))) {
1247 0 : RETURN_STRINGL(new_str, new_len, 0);
1248 : }
1249 :
1250 0 : for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) {
1251 0 : if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) {
1252 0 : continue;
1253 : }
1254 0 : basic_entities_dec[j].charcode = basic_entities[i].charcode;
1255 0 : memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1);
1256 0 : basic_entities_dec[j].entitylen = basic_entities[i].entitylen;
1257 0 : j++;
1258 : }
1259 0 : basic_entities_dec[j].charcode = '&';
1260 0 : basic_entities_dec[j].entitylen = sizeof("&") - 1;
1261 0 : memcpy(basic_entities_dec[j].entity, "&", sizeof("&"));
1262 0 : i = j + 1;
1263 :
1264 : do {
1265 0 : int l = e - p;
1266 :
1267 0 : for (j = 0; j < i; j++) {
1268 0 : if (basic_entities_dec[j].entitylen > l) {
1269 0 : continue;
1270 : }
1271 0 : if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) {
1272 0 : int e_len = basic_entities_dec[j].entitylen - 1;
1273 :
1274 0 : *p++ = basic_entities_dec[j].charcode;
1275 0 : memmove(p, p + e_len, (e - p - e_len));
1276 0 : e -= e_len;
1277 0 : goto done;
1278 : }
1279 : }
1280 0 : p++;
1281 :
1282 0 : done:
1283 0 : if (p >= e) {
1284 0 : break;
1285 : }
1286 0 : } while ((p = memchr(p, '&', (e - p))));
1287 :
1288 0 : new_len = e - new_str;
1289 :
1290 0 : new_str[new_len] = '\0';
1291 0 : RETURN_STRINGL(new_str, new_len, 0);
1292 : }
1293 : /* }}} */
1294 :
1295 : /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset])
1296 : Convert all HTML entities to their applicable characters */
1297 : PHP_FUNCTION(html_entity_decode)
1298 0 : {
1299 0 : char *str, *hint_charset = NULL;
1300 : int str_len, hint_charset_len, len;
1301 0 : long quote_style = ENT_COMPAT;
1302 : char *replaced;
1303 :
1304 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len,
1305 : "e_style, &hint_charset, &hint_charset_len) == FAILURE) {
1306 0 : return;
1307 : }
1308 :
1309 0 : replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC);
1310 0 : if (replaced) {
1311 0 : RETURN_STRINGL(replaced, len, 0);
1312 : }
1313 0 : RETURN_FALSE;
1314 : }
1315 : /* }}} */
1316 :
1317 :
1318 : /* {{{ proto string htmlentities(string string [, int quote_style][, string charset])
1319 : Convert all applicable characters to HTML entities */
1320 : PHP_FUNCTION(htmlentities)
1321 0 : {
1322 0 : php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1323 0 : }
1324 : /* }}} */
1325 :
1326 : /* {{{ proto array get_html_translation_table([int table [, int quote_style]])
1327 : Returns the internal translation table used by htmlspecialchars and htmlentities */
1328 : PHP_FUNCTION(get_html_translation_table)
1329 0 : {
1330 0 : long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT;
1331 : int i, j;
1332 : char ind[2];
1333 0 : enum entity_charset charset = determine_charset(NULL TSRMLS_CC);
1334 :
1335 0 : if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|ll", &which, "e_style) == FAILURE) {
1336 0 : return;
1337 : }
1338 :
1339 0 : array_init(return_value);
1340 :
1341 0 : ind[1] = 0;
1342 :
1343 0 : switch (which) {
1344 : case HTML_ENTITIES:
1345 0 : for (j=0; entity_map[j].charset != cs_terminator; j++) {
1346 0 : if (entity_map[j].charset != charset)
1347 0 : continue;
1348 0 : for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) {
1349 : char buffer[16];
1350 :
1351 0 : if (entity_map[j].table[i] == NULL)
1352 0 : continue;
1353 : /* what about wide chars here ?? */
1354 0 : ind[0] = i + entity_map[j].basechar;
1355 0 : snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]);
1356 0 : add_assoc_string(return_value, ind, buffer, 1);
1357 :
1358 : }
1359 : }
1360 : /* break thru */
1361 :
1362 : case HTML_SPECIALCHARS:
1363 0 : for (j = 0; basic_entities[j].charcode != 0; j++) {
1364 :
1365 0 : if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0)
1366 0 : continue;
1367 :
1368 0 : ind[0] = (unsigned char)basic_entities[j].charcode;
1369 0 : add_assoc_stringl(return_value, ind, basic_entities[j].entity, basic_entities[j].entitylen, 1);
1370 : }
1371 0 : add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1);
1372 :
1373 : break;
1374 : }
1375 : }
1376 : /* }}} */
1377 :
1378 : /*
1379 : * Local variables:
1380 : * tab-width: 4
1381 : * c-basic-offset: 4
1382 : * End:
1383 : * vim600: sw=4 ts=4 fdm=marker
1384 : * vim<600: sw=4 ts=4
1385 : */
|