diff options
Diffstat (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php')
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php | 539 |
1 files changed, 539 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php new file mode 100644 index 00000000..a07f4973 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php | |||
@@ -0,0 +1,539 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4. Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * The lexer walks the input with a byte cursor and emits Text, Comment,
 * Start, End and Empty tokens. It relies on normalize() and parseData(),
 * which are not defined in this class (presumably inherited from
 * HTMLPurifier_Lexer).
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
    /**
     * Flag advertising that this lexer can attach line/column information
     * to the tokens it produces.
     * @type bool
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, tab, CR and LF.
     * @type string
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: escapes the contents of a
     * matched script element while leaving its opening and closing tags
     * untouched.
     *
     * @param array $matches, in form of array(whole match, opening tag, contents, closing tag)
     * @return string
     */
    protected function scriptCallback($matches)
    {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of tokens.
     *
     * While running, the current line and column are registered in the
     * context under 'CurrentLine' and 'CurrentCol'; both entries are
     * destroyed again before returning.
     *
     * @param String $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        // Every branch below ends in either "continue" or "break", so the
        // loop condition only serves as an iteration counter.
        while (++$loops) {
            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {
                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int)$inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if ($synchronize_interval && // synchronization is on
                    $cursor > 0 && // cursor is further than zero
                    $loops % $synchronize_interval === 0) { // time to synchronize!
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor, $position_next_lt - $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) {
                    break;
                }
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // "<>": there is no tag name to process. Emit the "<"
                    // as literal text and leave tag mode WITHOUT moving the
                    // cursor, so the ">" is picked up by the next text token.
                    // (Previously the token was built but never stored and
                    // the lexer stayed in tag mode, garbling what followed.)
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) {
                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        }
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // drop the leading "!--" from the comment body
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only if it was actually present
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML: $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        // The cursor is deliberately NOT advanced here, so no
                        // newlines have been consumed yet. They are counted
                        // when the following text token is emitted; counting
                        // them here as well would double-count them.
                        $token->rawPosition($current_line, $current_col);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace at all: the whole segment is the tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(substr($segment, $position_first_space));
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string,
                        $config,
                        $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) {
                    $e->send(E_WARNING, 'Lexer: Missing gt');
                }
                // treat the dangling "<" and everything after it as text
                $token = new HTMLPurifier_Token_Text(
                    '<' . $this->parseData(substr($html, $cursor))
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     *
     * @param string $haystack
     * @param string $needle
     * @param int $offset
     * @param int $length
     * @return int
     */
    protected function substrCount($haystack, $needle, $offset, $length)
    {
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($length === 0) {
            // An empty window contains nothing. Answer directly, because
            // substr_count() rejects a zero length before PHP 7.1.
            return 0;
        }
        if ($oldVersion) {
            // no offset/length support: cut the window out by hand
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        }
        return substr_count($haystack, $needle, $offset, $length);
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Attributes without a value ("bool attributes") map to their own name.
     * Values are passed through parseData() before being stored.
     *
     * @param string $string Inside of tag excluding name.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array Assoc array of attributes.
     * @throws Exception if the scanning loop fails to make progress
     */
    public function parseAttributeString($string, $config, $context)
    {
        $string = (string)$string; // quick typecast

        if ($string === '') {
            return array();
        } // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // Look for ANY whitespace character the lexer knows about (not just
        // a literal space), and also detect it at offset 0.
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                return array();
            }
            // Compare against '' explicitly: a plain truthiness test would
            // also discard the legitimate value "0" (e.g. border=0).
            if ($quoted_value === '') {
                return array($key => '');
            }
            $first_char = $quoted_value[0];
            $last_char = $quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing end quote');
                    }
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) {
                // older PHP versions return false instead of '' from substr()
                $value = '';
            }
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        $old_cursor = -1;
        while ($cursor < $size) {
            if ($old_cursor >= $cursor) {
                throw new Exception("Infinite loop detected");
            }
            $old_cursor = $cursor;

            // skip leading whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // grab the key
            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                // key was the last thing in the string: bool attribute
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = $string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor >= $size) {
                    // "key=" with nothing after it: empty value
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = $string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end (no closing quote was found)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) {
                    $value = '';
                }
                $array[$key] = $this->parseData($value);
                $cursor++; // step over the closing quote / separating whitespace
            } else {
                // boolattr ($key is known to be non-empty at this point)
                $array[$key] = $key;
            }
        }
        return $array;
    }
}
538 | |||
539 | // vim: et sw=4 sts=4 | ||