]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
3 | /**\r | |
4 | * Our in-house implementation of a parser.\r | |
5 | *\r | |
6 | * A pure PHP parser, DirectLex has absolutely no dependencies, making\r | |
7 | * it a reasonably good default for PHP4. Written with efficiency in mind,\r | |
8 | * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it\r | |
9 | * pales in comparison to HTMLPurifier_Lexer_DOMLex.\r | |
10 | *\r | |
11 | * @todo Reread XML spec and document differences.\r | |
12 | */\r | |
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
    /**
     * Flag advertising that this lexer stamps tokens with line/column
     * information (see the rawPosition() calls in tokenizeHTML()).
     * @type bool
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, tab, CR, LF.
     * @type string
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: escapes the body of a
     * <script> element while leaving its opening and closing tags intact.
     *
     * @param array $matches preg match array, in form of
     *        array(whole match, opening tag, contents, closing tag)
     * @return string
     */
    protected function scriptCallback($matches)
    {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of tokens (text, comment,
     * start, end and empty tags).
     *
     * While running, the current line/column are published in $context as
     * 'CurrentLine' and 'CurrentCol'; both entries are destroyed again
     * before returning.
     *
     * @param String $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        // normalize() is not defined in this class (presumably inherited
        // from HTMLPurifier_Lexer)
        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        // Every path through the loop body ends in "continue" or "break";
        // ++$loops is always truthy and only serves as an iteration counter.
        while (++$loops) {
            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {
                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int)$inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if ($synchronize_interval && // synchronization is on
                    $cursor > 0 && // cursor is further than zero
                    $loops % $synchronize_interval === 0) { // time to synchronize!
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor, $position_next_lt - $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) {
                    break;
                }
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    // NOTE(review): this token is built but never appended
                    // to $array, and $inside_tag stays true, so "<>" is
                    // effectively swallowed. Kept as-is because changing it
                    // alters lexer output; confirm intended behaviour.
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) {
                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        }
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // strip the leading "!--" to get the comment body
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" unless we ran off the end
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML: $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        // The "<" token itself spans no newline and $cursor
                        // is deliberately left where it is: whatever follows
                        // is re-lexed (and its newlines counted) on the next
                        // iteration, so counting them here as well would
                        // advance the line number twice.
                        $token->rawPosition($current_line, $current_col);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace at all: the segment is just the tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(substr($segment, $position_first_space));
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string,
                        $config,
                        $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) {
                    $e->send(E_WARNING, 'Lexer: Missing gt');
                }
                // re-attach the "<" that was consumed when entering the tag
                $token = new HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     *
     * On PHP older than 5.1 the haystack is sliced with substr() first;
     * otherwise the native four-argument substr_count() is used.
     *
     * @param string $haystack
     * @param string $needle
     * @param int $offset
     * @param int $length
     * @return int
     */
    protected function substrCount($haystack, $needle, $offset, $length)
    {
        // version check is done once and cached for the process lifetime
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Boolean attributes (no "=") map to their own name. Values may be
     * double-quoted, single-quoted or unquoted; a missing end quote is
     * tolerated and reported to the error collector when one is active.
     *
     * @param string $string Inside of tag excluding name.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array Assoc array of attributes.
     * @throws Exception if the scanning loop fails to make progress.
     */
    public function parseAttributeString($string, $config, $context)
    {
        $string = (string)$string; // quick typecast

        if ($string == '') {
            return array();
        } // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // Test against the full whitespace set (not just ' ') so that
        // tab/newline separated attributes are not mistaken for a single
        // attribute, and so that whitespace at offset 0 is detected too.
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                return array();
            }
            // strict comparison: the string "0" is falsy in PHP but is a
            // perfectly valid attribute value (e.g. value=0)
            if ($quoted_value === '') {
                return array($key => '');
            }
            $first_char = $quoted_value[0];
            $last_char = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing end quote');
                    }
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            // older PHP versions return false from substr() for empty results
            if ($value === false) {
                $value = '';
            }
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';
        $padded_size = $size + 1;

        $old_cursor = -1;
        while ($cursor < $size) {
            // safety net: every iteration must move the cursor forward
            if ($old_cursor >= $cursor) {
                throw new Exception("Infinite loop detected");
            }
            $old_cursor = $cursor;

            // skip leading whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                // skip to the next whitespace to prevent an infinite loop
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // key was the last thing in the string: bool attribute
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = $string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                // we might be in front of a quote right now; the cursor can
                // also sit one past the padded string when "=" was the last
                // character, in which case there is no character to read
                $char = ($cursor < $padded_size) ? $string[$cursor] : '';

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end (no closing quote found)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) {
                    $value = '';
                }
                $array[$key] = $this->parseData($value);
                // step over the closing quote / terminating whitespace
                $cursor++;
            } else {
                // boolattr; $key is known to be non-empty at this point
                // because empty keys were skipped above
                $array[$key] = $key;
            }
        }
        return $array;
    }
}
538 | \r | |
539 | // vim: et sw=4 sts=4\r |