]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Not be streamable, and
 *  -# Carry the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @note When the deprecated prototype form is used there is no
     *       configuration to consult, so line-number tracking is treated
     *       as not requested.
     *
     * @param HTMLPurifier_Config $config Configuration object; passing a
     *        lexer prototype instead is deprecated and raises E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer type is unrecognized, or
     *         the chosen lexer cannot track line numbers when that is needed.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated path: the argument itself is the lexer prototype.
            // It is not a config object, so it must not be queried with get().
            $lexer = $config;
            $needs_tracking = false;
            trigger_error(
                "Passing a prototype to\n" .
                "HTMLPurifier_Lexer::create() is deprecated, please instead\n" .
                "use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        $inst = null;
        if (is_object($lexer)) {
            // a ready-made lexer instance was supplied
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // Only string names are meaningful below; reject anything else
            // explicitly instead of relying on loose switch comparison.
            if (!is_string($lexer)) {
                throw new HTMLPurifier_Exception(
                    "Cannot instantiate unrecognized Lexer type " .
                    gettype($lexer)
                );
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor. Declared explicitly so it is no longer created as a
     * dynamic property, and kept public because that is the visibility the
     * dynamically created property had.
     * @type HTMLPurifier_EntityParser
     */
    public $_entity_parser;

    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param string $string String character data to be parsed.
     * @return string Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        // (an ampersand followed by a space, or one ending the string)
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities
        // each &amp; turns into a bare ampersand after the translation below
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // no more candidate ampersands remain than were produced from &amp;
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is only meant to be used as the preg_replace_callback()
     *          handler referenced by those methods; calling it directly is
     *          not recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // strict comparison: both operands are strings, and loose
            // comparison could treat numeric-looking strings as equal
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element.
     *
     * The match is greedy and case-insensitive: it spans from the first
     * opening body tag to the last closing body tag.
     *
     * @param string $html HTML fragment or document.
     * @return string The body contents, or $html unchanged when no body
     *                element is matched.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }
}
356 | \r | |
357 | // vim: et sw=4 sts=4\r |