aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/libraries/html5/Tokenizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/3rdparty/libraries/html5/Tokenizer.php')
-rw-r--r--inc/3rdparty/libraries/html5/Tokenizer.php2422
1 files changed, 2422 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/html5/Tokenizer.php b/inc/3rdparty/libraries/html5/Tokenizer.php
new file mode 100644
index 00000000..0af07164
--- /dev/null
+++ b/inc/3rdparty/libraries/html5/Tokenizer.php
@@ -0,0 +1,2422 @@
1<?php
2
3/*
4
5Copyright 2007 Jeroen van der Meer <http://jero.net/>
6Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9Permission is hereby granted, free of charge, to any person obtaining a
10copy of this software and associated documentation files (the
11"Software"), to deal in the Software without restriction, including
12without limitation the rights to use, copy, modify, merge, publish,
13distribute, sublicense, and/or sell copies of the Software, and to
14permit persons to whom the Software is furnished to do so, subject to
15the following conditions:
16
17The above copyright notice and this permission notice shall be included
18in all copies or substantial portions of the Software.
19
20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28*/
29
30// Some conventions:
31// /* */ indicates verbatim text from the HTML 5 specification
32// // indicates regular comments
33
34// all flags are in hyphenated form
35
36class HTML5_Tokenizer {
37 /**
38 * Points to an InputStream object.
39 */
40 protected $stream;
41
42 /**
43 * Tree builder that the tokenizer emits token to.
44 */
45 private $tree;
46
47 /**
48 * Current content model we are parsing as.
49 */
50 protected $content_model;
51
52 /**
53 * Current token that is being built, but not yet emitted. Also
54 * is the last token emitted, if applicable.
55 */
56 protected $token;
57
58 // These are constants describing the content model
59 const PCDATA = 0;
60 const RCDATA = 1;
61 const CDATA = 2;
62 const PLAINTEXT = 3;
63
64 // These are constants describing tokens
65 // XXX should probably be moved somewhere else, probably the
66 // HTML5 class.
67 const DOCTYPE = 0;
68 const STARTTAG = 1;
69 const ENDTAG = 2;
70 const COMMENT = 3;
71 const CHARACTER = 4;
72 const SPACECHARACTER = 5;
73 const EOF = 6;
74 const PARSEERROR = 7;
75
76 // These are constants representing bunches of characters.
77 const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
78 const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
79 const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
80 const DIGIT = '0123456789';
81 const HEX = '0123456789ABCDEFabcdef';
82 const WHITESPACE = "\t\n\x0c ";
83
84 /**
85 * @param $data Data to parse
86 */
87 public function __construct($data, $builder = null) {
88 $this->stream = new HTML5_InputStream($data);
89 if (!$builder) $this->tree = new HTML5_TreeBuilder;
90 else $this->tree = $builder;
91 $this->content_model = self::PCDATA;
92 }
93
94 public function parseFragment($context = null) {
95 $this->tree->setupContext($context);
96 if ($this->tree->content_model) {
97 $this->content_model = $this->tree->content_model;
98 $this->tree->content_model = null;
99 }
100 $this->parse();
101 }
102
103 // XXX maybe convert this into an iterator? regardless, this function
104 // and the save function should go into a Parser facade of some sort
105 /**
106 * Performs the actual parsing of the document.
107 */
108 public function parse() {
109 // Current state
110 $state = 'data';
111 // This is used to avoid having to have look-behind in the data state.
112 $lastFourChars = '';
113 /**
114 * Escape flag as specified by the HTML5 specification: "used to
115 * control the behavior of the tokeniser. It is either true or
116 * false, and initially must be set to the false state."
117 */
118 $escape = false;
119 //echo "\n\n";
120 while($state !== null) {
121
122 /*echo $state . ' ';
123 switch ($this->content_model) {
124 case self::PCDATA: echo 'PCDATA'; break;
125 case self::RCDATA: echo 'RCDATA'; break;
126 case self::CDATA: echo 'CDATA'; break;
127 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
128 }
129 if ($escape) echo " escape";
130 echo "\n";*/
131
132 switch($state) {
133 case 'data':
134
135 /* Consume the next input character */
136 $char = $this->stream->char();
137 $lastFourChars .= $char;
138 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
139
140 // see below for meaning
141 $hyp_cond =
142 !$escape &&
143 (
144 $this->content_model === self::RCDATA ||
145 $this->content_model === self::CDATA
146 );
147 $amp_cond =
148 !$escape &&
149 (
150 $this->content_model === self::PCDATA ||
151 $this->content_model === self::RCDATA
152 );
153 $lt_cond =
154 $this->content_model === self::PCDATA ||
155 (
156 (
157 $this->content_model === self::RCDATA ||
158 $this->content_model === self::CDATA
159 ) &&
160 !$escape
161 );
162 $gt_cond =
163 $escape &&
164 (
165 $this->content_model === self::RCDATA ||
166 $this->content_model === self::CDATA
167 );
168
169 if($char === '&' && $amp_cond) {
170 /* U+0026 AMPERSAND (&)
171 When the content model flag is set to one of the PCDATA or RCDATA
172 states and the escape flag is false: switch to the
173 character reference data state. Otherwise: treat it as per
174 the "anything else" entry below. */
175 $state = 'character reference data';
176
177 } elseif(
178 $char === '-' &&
179 $hyp_cond &&
180 $lastFourChars === '<!--'
181 ) {
182 /*
183 U+002D HYPHEN-MINUS (-)
184 If the content model flag is set to either the RCDATA state or
185 the CDATA state, and the escape flag is false, and there are at
186 least three characters before this one in the input stream, and the
187 last four characters in the input stream, including this one, are
188 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
189 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
190 $escape = true;
191
192 /* In any case, emit the input character as a character token. Stay
193 in the data state. */
194 $this->emitToken(array(
195 'type' => self::CHARACTER,
196 'data' => '-'
197 ));
198 // We do the "any case" part as part of "anything else".
199
200 /* U+003C LESS-THAN SIGN (<) */
201 } elseif($char === '<' && $lt_cond) {
202 /* When the content model flag is set to the PCDATA state: switch
203 to the tag open state.
204
205 When the content model flag is set to either the RCDATA state or
206 the CDATA state and the escape flag is false: switch to the tag
207 open state.
208
209 Otherwise: treat it as per the "anything else" entry below. */
210 $state = 'tag open';
211
212 /* U+003E GREATER-THAN SIGN (>) */
213 } elseif(
214 $char === '>' &&
215 $gt_cond &&
216 substr($lastFourChars, 1) === '-->'
217 ) {
218 /* If the content model flag is set to either the RCDATA state or
219 the CDATA state, and the escape flag is true, and the last three
220 characters in the input stream including this one are U+002D
221 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
222 set the escape flag to false. */
223 $escape = false;
224
225 /* In any case, emit the input character as a character token.
226 Stay in the data state. */
227 $this->emitToken(array(
228 'type' => self::CHARACTER,
229 'data' => '>'
230 ));
231 // We do the "any case" part as part of "anything else".
232
233 } elseif($char === false) {
234 /* EOF
235 Emit an end-of-file token. */
236 $state = null;
237 $this->tree->emitToken(array(
238 'type' => self::EOF
239 ));
240
241 } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
242 // Directly after emitting a token you switch back to the "data
243 // state". At that point spaceCharacters are important so they are
244 // emitted separately.
245 $chars = $this->stream->charsWhile(self::WHITESPACE);
246 $this->emitToken(array(
247 'type' => self::SPACECHARACTER,
248 'data' => $char . $chars
249 ));
250 $lastFourChars .= $chars;
251 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
252
253 } else {
254 /* Anything else
255 THIS IS AN OPTIMIZATION: Get as many character that
256 otherwise would also be treated as a character token and emit it
257 as a single character token. Stay in the data state. */
258
259 $mask = '';
260 if ($hyp_cond) $mask .= '-';
261 if ($amp_cond) $mask .= '&';
262 if ($lt_cond) $mask .= '<';
263 if ($gt_cond) $mask .= '>';
264
265 if ($mask === '') {
266 $chars = $this->stream->remainingChars();
267 } else {
268 $chars = $this->stream->charsUntil($mask);
269 }
270
271 $this->emitToken(array(
272 'type' => self::CHARACTER,
273 'data' => $char . $chars
274 ));
275
276 $lastFourChars .= $chars;
277 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
278
279 $state = 'data';
280 }
281 break;
282
283 case 'character reference data':
284 /* (This cannot happen if the content model flag
285 is set to the CDATA state.) */
286
287 /* Attempt to consume a character reference, with no
288 additional allowed character. */
289 $entity = $this->consumeCharacterReference();
290
291 /* If nothing is returned, emit a U+0026 AMPERSAND
292 character token. Otherwise, emit the character token that
293 was returned. */
294 // This is all done when consuming the character reference.
295 $this->emitToken(array(
296 'type' => self::CHARACTER,
297 'data' => $entity
298 ));
299
300 /* Finally, switch to the data state. */
301 $state = 'data';
302 break;
303
304 case 'tag open':
305 $char = $this->stream->char();
306
307 switch($this->content_model) {
308 case self::RCDATA:
309 case self::CDATA:
310 /* Consume the next input character. If it is a
311 U+002F SOLIDUS (/) character, switch to the close
312 tag open state. Otherwise, emit a U+003C LESS-THAN
313 SIGN character token and reconsume the current input
314 character in the data state. */
315 // We consumed above.
316
317 if($char === '/') {
318 $state = 'close tag open';
319
320 } else {
321 $this->emitToken(array(
322 'type' => self::CHARACTER,
323 'data' => '<'
324 ));
325
326 $this->stream->unget();
327
328 $state = 'data';
329 }
330 break;
331
332 case self::PCDATA:
333 /* If the content model flag is set to the PCDATA state
334 Consume the next input character: */
335 // We consumed above.
336
337 if($char === '!') {
338 /* U+0021 EXCLAMATION MARK (!)
339 Switch to the markup declaration open state. */
340 $state = 'markup declaration open';
341
342 } elseif($char === '/') {
343 /* U+002F SOLIDUS (/)
344 Switch to the close tag open state. */
345 $state = 'close tag open';
346
347 } elseif('A' <= $char && $char <= 'Z') {
348 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
349 Create a new start tag token, set its tag name to the lowercase
350 version of the input character (add 0x0020 to the character's code
351 point), then switch to the tag name state. (Don't emit the token
352 yet; further details will be filled in before it is emitted.) */
353 $this->token = array(
354 'name' => strtolower($char),
355 'type' => self::STARTTAG,
356 'attr' => array()
357 );
358
359 $state = 'tag name';
360
361 } elseif('a' <= $char && $char <= 'z') {
362 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
363 Create a new start tag token, set its tag name to the input
364 character, then switch to the tag name state. (Don't emit
365 the token yet; further details will be filled in before it
366 is emitted.) */
367 $this->token = array(
368 'name' => $char,
369 'type' => self::STARTTAG,
370 'attr' => array()
371 );
372
373 $state = 'tag name';
374
375 } elseif($char === '>') {
376 /* U+003E GREATER-THAN SIGN (>)
377 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
378 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
379 $this->emitToken(array(
380 'type' => self::PARSEERROR,
381 'data' => 'expected-tag-name-but-got-right-bracket'
382 ));
383 $this->emitToken(array(
384 'type' => self::CHARACTER,
385 'data' => '<>'
386 ));
387
388 $state = 'data';
389
390 } elseif($char === '?') {
391 /* U+003F QUESTION MARK (?)
392 Parse error. Switch to the bogus comment state. */
393 $this->emitToken(array(
394 'type' => self::PARSEERROR,
395 'data' => 'expected-tag-name-but-got-question-mark'
396 ));
397 $this->token = array(
398 'data' => '?',
399 'type' => self::COMMENT
400 );
401 $state = 'bogus comment';
402
403 } else {
404 /* Anything else
405 Parse error. Emit a U+003C LESS-THAN SIGN character token and
406 reconsume the current input character in the data state. */
407 $this->emitToken(array(
408 'type' => self::PARSEERROR,
409 'data' => 'expected-tag-name'
410 ));
411 $this->emitToken(array(
412 'type' => self::CHARACTER,
413 'data' => '<'
414 ));
415
416 $state = 'data';
417 $this->stream->unget();
418 }
419 break;
420 }
421 break;
422
423 case 'close tag open':
424 if (
425 $this->content_model === self::RCDATA ||
426 $this->content_model === self::CDATA
427 ) {
428 /* If the content model flag is set to the RCDATA or CDATA
429 states... */
430 $name = strtolower($this->stream->charsWhile(self::ALPHA));
431 $following = $this->stream->char();
432 $this->stream->unget();
433 if (
434 !$this->token ||
435 $this->token['name'] !== $name ||
436 $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
437 ) {
438 /* if no start tag token has ever been emitted by this instance
439 of the tokenizer (fragment case), or, if the next few
440 characters do not match the tag name of the last start tag
441 token emitted (compared in an ASCII case-insensitive manner),
442 or if they do but they are not immediately followed by one of
443 the following characters:
444
445 * U+0009 CHARACTER TABULATION
446 * U+000A LINE FEED (LF)
447 * U+000C FORM FEED (FF)
448 * U+0020 SPACE
449 * U+003E GREATER-THAN SIGN (>)
450 * U+002F SOLIDUS (/)
451 * EOF
452
453 ...then emit a U+003C LESS-THAN SIGN character token, a
454 U+002F SOLIDUS character token, and switch to the data
455 state to process the next input character. */
456 // XXX: Probably ought to replace in_array with $following === x ||...
457
458 // We also need to emit $name now we've consumed that, as we
459 // know it'll just be emitted as a character token.
460 $this->emitToken(array(
461 'type' => self::CHARACTER,
462 'data' => '</' . $name
463 ));
464
465 $state = 'data';
466 } else {
467 // This matches what would happen if we actually did the
468 // otherwise below (but we can't because we've consumed too
469 // much).
470
471 // Start the end tag token with the name we already have.
472 $this->token = array(
473 'name' => $name,
474 'type' => self::ENDTAG
475 );
476
477 // Change to tag name state.
478 $state = 'tag name';
479 }
480 } elseif ($this->content_model === self::PCDATA) {
481 /* Otherwise, if the content model flag is set to the PCDATA
482 state [...]: */
483 $char = $this->stream->char();
484
485 if ('A' <= $char && $char <= 'Z') {
486 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
487 Create a new end tag token, set its tag name to the lowercase version
488 of the input character (add 0x0020 to the character's code point), then
489 switch to the tag name state. (Don't emit the token yet; further details
490 will be filled in before it is emitted.) */
491 $this->token = array(
492 'name' => strtolower($char),
493 'type' => self::ENDTAG
494 );
495
496 $state = 'tag name';
497
498 } elseif ('a' <= $char && $char <= 'z') {
499 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
500 Create a new end tag token, set its tag name to the
501 input character, then switch to the tag name state.
502 (Don't emit the token yet; further details will be
503 filled in before it is emitted.) */
504 $this->token = array(
505 'name' => $char,
506 'type' => self::ENDTAG
507 );
508
509 $state = 'tag name';
510
511 } elseif($char === '>') {
512 /* U+003E GREATER-THAN SIGN (>)
513 Parse error. Switch to the data state. */
514 $this->emitToken(array(
515 'type' => self::PARSEERROR,
516 'data' => 'expected-closing-tag-but-got-right-bracket'
517 ));
518 $state = 'data';
519
520 } elseif($char === false) {
521 /* EOF
522 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
523 SOLIDUS character token. Reconsume the EOF character in the data state. */
524 $this->emitToken(array(
525 'type' => self::PARSEERROR,
526 'data' => 'expected-closing-tag-but-got-eof'
527 ));
528 $this->emitToken(array(
529 'type' => self::CHARACTER,
530 'data' => '</'
531 ));
532
533 $this->stream->unget();
534 $state = 'data';
535
536 } else {
537 /* Parse error. Switch to the bogus comment state. */
538 $this->emitToken(array(
539 'type' => self::PARSEERROR,
540 'data' => 'expected-closing-tag-but-got-char'
541 ));
542 $this->token = array(
543 'data' => $char,
544 'type' => self::COMMENT
545 );
546 $state = 'bogus comment';
547 }
548 }
549 break;
550
551 case 'tag name':
552 /* Consume the next input character: */
553 $char = $this->stream->char();
554
555 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
556 /* U+0009 CHARACTER TABULATION
557 U+000A LINE FEED (LF)
558 U+000C FORM FEED (FF)
559 U+0020 SPACE
560 Switch to the before attribute name state. */
561 $state = 'before attribute name';
562
563 } elseif($char === '/') {
564 /* U+002F SOLIDUS (/)
565 Switch to the self-closing start tag state. */
566 $state = 'self-closing start tag';
567
568 } elseif($char === '>') {
569 /* U+003E GREATER-THAN SIGN (>)
570 Emit the current tag token. Switch to the data state. */
571 $this->emitToken($this->token);
572 $state = 'data';
573
574 } elseif('A' <= $char && $char <= 'Z') {
575 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
576 Append the lowercase version of the current input
577 character (add 0x0020 to the character's code point) to
578 the current tag token's tag name. Stay in the tag name state. */
579 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
580
581 $this->token['name'] .= strtolower($char . $chars);
582 $state = 'tag name';
583
584 } elseif($char === false) {
585 /* EOF
586 Parse error. Reconsume the EOF character in the data state. */
587 $this->emitToken(array(
588 'type' => self::PARSEERROR,
589 'data' => 'eof-in-tag-name'
590 ));
591
592 $this->stream->unget();
593 $state = 'data';
594
595 } else {
596 /* Anything else
597 Append the current input character to the current tag token's tag name.
598 Stay in the tag name state. */
599 $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
600
601 $this->token['name'] .= $char . $chars;
602 $state = 'tag name';
603 }
604 break;
605
606 case 'before attribute name':
607 /* Consume the next input character: */
608 $char = $this->stream->char();
609
610 // this conditional is optimized, check bottom
611 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
612 /* U+0009 CHARACTER TABULATION
613 U+000A LINE FEED (LF)
614 U+000C FORM FEED (FF)
615 U+0020 SPACE
616 Stay in the before attribute name state. */
617 $state = 'before attribute name';
618
619 } elseif($char === '/') {
620 /* U+002F SOLIDUS (/)
621 Switch to the self-closing start tag state. */
622 $state = 'self-closing start tag';
623
624 } elseif($char === '>') {
625 /* U+003E GREATER-THAN SIGN (>)
626 Emit the current tag token. Switch to the data state. */
627 $this->emitToken($this->token);
628 $state = 'data';
629
630 } elseif('A' <= $char && $char <= 'Z') {
631 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
632 Start a new attribute in the current tag token. Set that
633 attribute's name to the lowercase version of the current
634 input character (add 0x0020 to the character's code
635 point), and its value to the empty string. Switch to the
636 attribute name state.*/
637 $this->token['attr'][] = array(
638 'name' => strtolower($char),
639 'value' => ''
640 );
641
642 $state = 'attribute name';
643
644 } elseif($char === false) {
645 /* EOF
646 Parse error. Reconsume the EOF character in the data state. */
647 $this->emitToken(array(
648 'type' => self::PARSEERROR,
649 'data' => 'expected-attribute-name-but-got-eof'
650 ));
651
652 $this->stream->unget();
653 $state = 'data';
654
655 } else {
656 /* U+0022 QUOTATION MARK (")
657 U+0027 APOSTROPHE (')
658 U+003C LESS-THAN SIGN (<)
659 U+003D EQUALS SIGN (=)
660 Parse error. Treat it as per the "anything else" entry
661 below. */
662 if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
663 $this->emitToken(array(
664 'type' => self::PARSEERROR,
665 'data' => 'invalid-character-in-attribute-name'
666 ));
667 }
668
669 /* Anything else
670 Start a new attribute in the current tag token. Set that attribute's
671 name to the current input character, and its value to the empty string.
672 Switch to the attribute name state. */
673 $this->token['attr'][] = array(
674 'name' => $char,
675 'value' => ''
676 );
677
678 $state = 'attribute name';
679 }
680 break;
681
682 case 'attribute name':
683 // Consume the next input character:
684 $char = $this->stream->char();
685
686 // this conditional is optimized, check bottom
687 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
688 /* U+0009 CHARACTER TABULATION
689 U+000A LINE FEED (LF)
690 U+000C FORM FEED (FF)
691 U+0020 SPACE
692 Switch to the after attribute name state. */
693 $state = 'after attribute name';
694
695 } elseif($char === '/') {
696 /* U+002F SOLIDUS (/)
697 Switch to the self-closing start tag state. */
698 $state = 'self-closing start tag';
699
700 } elseif($char === '=') {
701 /* U+003D EQUALS SIGN (=)
702 Switch to the before attribute value state. */
703 $state = 'before attribute value';
704
705 } elseif($char === '>') {
706 /* U+003E GREATER-THAN SIGN (>)
707 Emit the current tag token. Switch to the data state. */
708 $this->emitToken($this->token);
709 $state = 'data';
710
711 } elseif('A' <= $char && $char <= 'Z') {
712 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
713 Append the lowercase version of the current input
714 character (add 0x0020 to the character's code point) to
715 the current attribute's name. Stay in the attribute name
716 state. */
717 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
718
719 $last = count($this->token['attr']) - 1;
720 $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
721
722 $state = 'attribute name';
723
724 } elseif($char === false) {
725 /* EOF
726 Parse error. Reconsume the EOF character in the data state. */
727 $this->emitToken(array(
728 'type' => self::PARSEERROR,
729 'data' => 'eof-in-attribute-name'
730 ));
731
732 $this->stream->unget();
733 $state = 'data';
734
735 } else {
736 /* U+0022 QUOTATION MARK (")
737 U+0027 APOSTROPHE (')
738 U+003C LESS-THAN SIGN (<)
739 Parse error. Treat it as per the "anything else"
740 entry below. */
741 if($char === '"' || $char === "'" || $char === '<') {
742 $this->emitToken(array(
743 'type' => self::PARSEERROR,
744 'data' => 'invalid-character-in-attribute-name'
745 ));
746 }
747
748 /* Anything else
749 Append the current input character to the current attribute's name.
750 Stay in the attribute name state. */
751 $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
752
753 $last = count($this->token['attr']) - 1;
754 $this->token['attr'][$last]['name'] .= $char . $chars;
755
756 $state = 'attribute name';
757 }
758
759 /* When the user agent leaves the attribute name state
760 (and before emitting the tag token, if appropriate), the
761 complete attribute's name must be compared to the other
762 attributes on the same token; if there is already an
763 attribute on the token with the exact same name, then this
764 is a parse error and the new attribute must be dropped, along
765 with the value that gets associated with it (if any). */
766 // this might be implemented in the emitToken method
767 break;
768
769 case 'after attribute name':
770 // Consume the next input character:
771 $char = $this->stream->char();
772
773 // this is an optimized conditional, check the bottom
774 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
775 /* U+0009 CHARACTER TABULATION
776 U+000A LINE FEED (LF)
777 U+000C FORM FEED (FF)
778 U+0020 SPACE
779 Stay in the after attribute name state. */
780 $state = 'after attribute name';
781
782 } elseif($char === '/') {
783 /* U+002F SOLIDUS (/)
784 Switch to the self-closing start tag state. */
785 $state = 'self-closing start tag';
786
787 } elseif($char === '=') {
788 /* U+003D EQUALS SIGN (=)
789 Switch to the before attribute value state. */
790 $state = 'before attribute value';
791
792 } elseif($char === '>') {
793 /* U+003E GREATER-THAN SIGN (>)
794 Emit the current tag token. Switch to the data state. */
795 $this->emitToken($this->token);
796 $state = 'data';
797
798 } elseif('A' <= $char && $char <= 'Z') {
799 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
800 Start a new attribute in the current tag token. Set that
801 attribute's name to the lowercase version of the current
802 input character (add 0x0020 to the character's code
803 point), and its value to the empty string. Switch to the
804 attribute name state. */
805 $this->token['attr'][] = array(
806 'name' => strtolower($char),
807 'value' => ''
808 );
809
810 $state = 'attribute name';
811
812 } elseif($char === false) {
813 /* EOF
814 Parse error. Reconsume the EOF character in the data state. */
815 $this->emitToken(array(
816 'type' => self::PARSEERROR,
817 'data' => 'expected-end-of-tag-but-got-eof'
818 ));
819
820 $this->stream->unget();
821 $state = 'data';
822
823 } else {
824 /* U+0022 QUOTATION MARK (")
825 U+0027 APOSTROPHE (')
826 U+003C LESS-THAN SIGN(<)
827 Parse error. Treat it as per the "anything else"
828 entry below. */
829 if($char === '"' || $char === "'" || $char === "<") {
830 $this->emitToken(array(
831 'type' => self::PARSEERROR,
832 'data' => 'invalid-character-after-attribute-name'
833 ));
834 }
835
836 /* Anything else
837 Start a new attribute in the current tag token. Set that attribute's
838 name to the current input character, and its value to the empty string.
839 Switch to the attribute name state. */
840 $this->token['attr'][] = array(
841 'name' => $char,
842 'value' => ''
843 );
844
845 $state = 'attribute name';
846 }
847 break;
848
849 case 'before attribute value':
850 // Consume the next input character:
851 $char = $this->stream->char();
852
853 // this is an optimized conditional
854 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
855 /* U+0009 CHARACTER TABULATION
856 U+000A LINE FEED (LF)
857 U+000C FORM FEED (FF)
858 U+0020 SPACE
859 Stay in the before attribute value state. */
860 $state = 'before attribute value';
861
862 } elseif($char === '"') {
863 /* U+0022 QUOTATION MARK (")
864 Switch to the attribute value (double-quoted) state. */
865 $state = 'attribute value (double-quoted)';
866
867 } elseif($char === '&') {
868 /* U+0026 AMPERSAND (&)
869 Switch to the attribute value (unquoted) state and reconsume
870 this input character. */
871 $this->stream->unget();
872 $state = 'attribute value (unquoted)';
873
874 } elseif($char === '\'') {
875 /* U+0027 APOSTROPHE (')
876 Switch to the attribute value (single-quoted) state. */
877 $state = 'attribute value (single-quoted)';
878
879 } elseif($char === '>') {
880 /* U+003E GREATER-THAN SIGN (>)
881 Parse error. Emit the current tag token. Switch to the data state. */
882 $this->emitToken(array(
883 'type' => self::PARSEERROR,
884 'data' => 'expected-attribute-value-but-got-right-bracket'
885 ));
886 $this->emitToken($this->token);
887 $state = 'data';
888
889 } elseif($char === false) {
890 /* EOF
891 Parse error. Reconsume the EOF character in the data state. */
892 $this->emitToken(array(
893 'type' => self::PARSEERROR,
894 'data' => 'expected-attribute-value-but-got-eof'
895 ));
896 $this->stream->unget();
897 $state = 'data';
898
899 } else {
900 /* U+003D EQUALS SIGN (=)
901 * U+003C LESS-THAN SIGN (<)
902 Parse error. Treat it as per the "anything else" entry below. */
903 if($char === '=' || $char === '<') {
904 $this->emitToken(array(
905 'type' => self::PARSEERROR,
906 'data' => 'equals-in-unquoted-attribute-value'
907 ));
908 }
909
910 /* Anything else
911 Append the current input character to the current attribute's value.
912 Switch to the attribute value (unquoted) state. */
913 $last = count($this->token['attr']) - 1;
914 $this->token['attr'][$last]['value'] .= $char;
915
916 $state = 'attribute value (unquoted)';
917 }
918 break;
919
920 case 'attribute value (double-quoted)':
921 // Consume the next input character:
922 $char = $this->stream->char();
923
924 if($char === '"') {
925 /* U+0022 QUOTATION MARK (")
926 Switch to the after attribute value (quoted) state. */
927 $state = 'after attribute value (quoted)';
928
929 } elseif($char === '&') {
930 /* U+0026 AMPERSAND (&)
931 Switch to the character reference in attribute value
932 state, with the additional allowed character
933 being U+0022 QUOTATION MARK ("). */
934 $this->characterReferenceInAttributeValue('"');
935
936 } elseif($char === false) {
937 /* EOF
938 Parse error. Reconsume the EOF character in the data state. */
939 $this->emitToken(array(
940 'type' => self::PARSEERROR,
941 'data' => 'eof-in-attribute-value-double-quote'
942 ));
943
944 $this->stream->unget();
945 $state = 'data';
946
947 } else {
948 /* Anything else
949 Append the current input character to the current attribute's value.
950 Stay in the attribute value (double-quoted) state. */
951 $chars = $this->stream->charsUntil('"&');
952
953 $last = count($this->token['attr']) - 1;
954 $this->token['attr'][$last]['value'] .= $char . $chars;
955
956 $state = 'attribute value (double-quoted)';
957 }
958 break;
959
960 case 'attribute value (single-quoted)':
961 // Consume the next input character:
962 $char = $this->stream->char();
963
964 if($char === "'") {
965 /* U+0022 QUOTATION MARK (')
966 Switch to the after attribute value state. */
967 $state = 'after attribute value (quoted)';
968
969 } elseif($char === '&') {
970 /* U+0026 AMPERSAND (&)
971 Switch to the entity in attribute value state. */
972 $this->characterReferenceInAttributeValue("'");
973
974 } elseif($char === false) {
975 /* EOF
976 Parse error. Reconsume the EOF character in the data state. */
977 $this->emitToken(array(
978 'type' => self::PARSEERROR,
979 'data' => 'eof-in-attribute-value-single-quote'
980 ));
981
982 $this->stream->unget();
983 $state = 'data';
984
985 } else {
986 /* Anything else
987 Append the current input character to the current attribute's value.
988 Stay in the attribute value (single-quoted) state. */
989 $chars = $this->stream->charsUntil("'&");
990
991 $last = count($this->token['attr']) - 1;
992 $this->token['attr'][$last]['value'] .= $char . $chars;
993
994 $state = 'attribute value (single-quoted)';
995 }
996 break;
997
998 case 'attribute value (unquoted)':
999 // Consume the next input character:
1000 $char = $this->stream->char();
1001
1002 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1003 /* U+0009 CHARACTER TABULATION
1004 U+000A LINE FEED (LF)
1005 U+000C FORM FEED (FF)
1006 U+0020 SPACE
1007 Switch to the before attribute name state. */
1008 $state = 'before attribute name';
1009
1010 } elseif($char === '&') {
1011 /* U+0026 AMPERSAND (&)
1012 Switch to the entity in attribute value state, with the
1013 additional allowed character being U+003E
1014 GREATER-THAN SIGN (>). */
1015 $this->characterReferenceInAttributeValue('>');
1016
1017 } elseif($char === '>') {
1018 /* U+003E GREATER-THAN SIGN (>)
1019 Emit the current tag token. Switch to the data state. */
1020 $this->emitToken($this->token);
1021 $state = 'data';
1022
1023 } elseif ($char === false) {
1024 /* EOF
1025 Parse error. Reconsume the EOF character in the data state. */
1026 $this->emitToken(array(
1027 'type' => self::PARSEERROR,
1028 'data' => 'eof-in-attribute-value-no-quotes'
1029 ));
1030 $this->stream->unget();
1031 $state = 'data';
1032
1033 } else {
1034 /* U+0022 QUOTATION MARK (")
1035 U+0027 APOSTROPHE (')
1036 U+003C LESS-THAN SIGN (<)
1037 U+003D EQUALS SIGN (=)
1038 Parse error. Treat it as per the "anything else"
1039 entry below. */
1040 if($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1041 $this->emitToken(array(
1042 'type' => self::PARSEERROR,
1043 'data' => 'unexpected-character-in-unquoted-attribute-value'
1044 ));
1045 }
1046
1047 /* Anything else
1048 Append the current input character to the current attribute's value.
1049 Stay in the attribute value (unquoted) state. */
1050 $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1051
1052 $last = count($this->token['attr']) - 1;
1053 $this->token['attr'][$last]['value'] .= $char . $chars;
1054
1055 $state = 'attribute value (unquoted)';
1056 }
1057 break;
1058
1059 case 'after attribute value (quoted)':
1060 /* Consume the next input character: */
1061 $char = $this->stream->char();
1062
1063 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1064 /* U+0009 CHARACTER TABULATION
1065 U+000A LINE FEED (LF)
1066 U+000C FORM FEED (FF)
1067 U+0020 SPACE
1068 Switch to the before attribute name state. */
1069 $state = 'before attribute name';
1070
1071 } elseif ($char === '/') {
1072 /* U+002F SOLIDUS (/)
1073 Switch to the self-closing start tag state. */
1074 $state = 'self-closing start tag';
1075
1076 } elseif ($char === '>') {
1077 /* U+003E GREATER-THAN SIGN (>)
1078 Emit the current tag token. Switch to the data state. */
1079 $this->emitToken($this->token);
1080 $state = 'data';
1081
1082 } elseif ($char === false) {
1083 /* EOF
1084 Parse error. Reconsume the EOF character in the data state. */
1085 $this->emitToken(array(
1086 'type' => self::PARSEERROR,
1087 'data' => 'unexpected-EOF-after-attribute-value'
1088 ));
1089 $this->stream->unget();
1090 $state = 'data';
1091
1092 } else {
1093 /* Anything else
1094 Parse error. Reconsume the character in the before attribute
1095 name state. */
1096 $this->emitToken(array(
1097 'type' => self::PARSEERROR,
1098 'data' => 'unexpected-character-after-attribute-value'
1099 ));
1100 $this->stream->unget();
1101 $state = 'before attribute name';
1102 }
1103 break;
1104
1105 case 'self-closing start tag':
1106 /* Consume the next input character: */
1107 $char = $this->stream->char();
1108
1109 if ($char === '>') {
1110 /* U+003E GREATER-THAN SIGN (>)
1111 Set the self-closing flag of the current tag token.
1112 Emit the current tag token. Switch to the data state. */
1113 // not sure if this is the name we want
1114 $this->token['self-closing'] = true;
1115 $this->emitToken($this->token);
1116 $state = 'data';
1117
1118 } elseif ($char === false) {
1119 /* EOF
1120 Parse error. Reconsume the EOF character in the data state. */
1121 $this->emitToken(array(
1122 'type' => self::PARSEERROR,
1123 'data' => 'unexpected-eof-after-self-closing'
1124 ));
1125 $this->stream->unget();
1126 $state = 'data';
1127
1128 } else {
1129 /* Anything else
1130 Parse error. Reconsume the character in the before attribute name state. */
1131 $this->emitToken(array(
1132 'type' => self::PARSEERROR,
1133 'data' => 'unexpected-character-after-self-closing'
1134 ));
1135 $this->stream->unget();
1136 $state = 'before attribute name';
1137 }
1138 break;
1139
1140 case 'bogus comment':
1141 /* (This can only happen if the content model flag is set to the PCDATA state.) */
1142 /* Consume every character up to the first U+003E GREATER-THAN SIGN
1143 character (>) or the end of the file (EOF), whichever comes first. Emit
1144 a comment token whose data is the concatenation of all the characters
1145 starting from and including the character that caused the state machine
1146 to switch into the bogus comment state, up to and including the last
1147 consumed character before the U+003E character, if any, or up to the
1148 end of the file otherwise. (If the comment was started by the end of
1149 the file (EOF), the token is empty.) */
1150 $this->token['data'] .= (string) $this->stream->charsUntil('>');
1151 $this->stream->char();
1152
1153 $this->emitToken($this->token);
1154
1155 /* Switch to the data state. */
1156 $state = 'data';
1157 break;
1158
1159 case 'markup declaration open':
1160 // Consume for below
1161 $hyphens = $this->stream->charsWhile('-', 2);
1162 if ($hyphens === '-') {
1163 $this->stream->unget();
1164 }
1165 if ($hyphens !== '--') {
1166 $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1167 }
1168
1169 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1170 characters, consume those two characters, create a comment token whose
1171 data is the empty string, and switch to the comment state. */
1172 if($hyphens === '--') {
1173 $state = 'comment start';
1174 $this->token = array(
1175 'data' => '',
1176 'type' => self::COMMENT
1177 );
1178
1179 /* Otherwise if the next seven characters are a case-insensitive match
1180 for the word "DOCTYPE", then consume those characters and switch to the
1181 DOCTYPE state. */
1182 } elseif(strtoupper($alpha) === 'DOCTYPE') {
1183 $state = 'DOCTYPE';
1184
1185 // XXX not implemented
1186 /* Otherwise, if the insertion mode is "in foreign content"
1187 and the current node is not an element in the HTML namespace
1188 and the next seven characters are an ASCII case-sensitive
1189 match for the string "[CDATA[" (the five uppercase letters
1190 "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1191 and after), then consume those characters and switch to the
1192 CDATA section state (which is unrelated to the content model
1193 flag's CDATA state). */
1194
1195 /* Otherwise, is is a parse error. Switch to the bogus comment state.
1196 The next character that is consumed, if any, is the first character
1197 that will be in the comment. */
1198 } else {
1199 $this->emitToken(array(
1200 'type' => self::PARSEERROR,
1201 'data' => 'expected-dashes-or-doctype'
1202 ));
1203 $this->token = array(
1204 'data' => (string) $alpha,
1205 'type' => self::COMMENT
1206 );
1207 $state = 'bogus comment';
1208 }
1209 break;
1210
1211 case 'comment start':
1212 /* Consume the next input character: */
1213 $char = $this->stream->char();
1214
1215 if ($char === '-') {
1216 /* U+002D HYPHEN-MINUS (-)
1217 Switch to the comment start dash state. */
1218 $state = 'comment start dash';
1219 } elseif ($char === '>') {
1220 /* U+003E GREATER-THAN SIGN (>)
1221 Parse error. Emit the comment token. Switch to the
1222 data state. */
1223 $this->emitToken(array(
1224 'type' => self::PARSEERROR,
1225 'data' => 'incorrect-comment'
1226 ));
1227 $this->emitToken($this->token);
1228 $state = 'data';
1229 } elseif ($char === false) {
1230 /* EOF
1231 Parse error. Emit the comment token. Reconsume the
1232 EOF character in the data state. */
1233 $this->emitToken(array(
1234 'type' => self::PARSEERROR,
1235 'data' => 'eof-in-comment'
1236 ));
1237 $this->emitToken($this->token);
1238 $this->stream->unget();
1239 $state = 'data';
1240 } else {
1241 /* Anything else
1242 Append the input character to the comment token's
1243 data. Switch to the comment state. */
1244 $this->token['data'] .= $char;
1245 $state = 'comment';
1246 }
1247 break;
1248
1249 case 'comment start dash':
1250 /* Consume the next input character: */
1251 $char = $this->stream->char();
1252 if ($char === '-') {
1253 /* U+002D HYPHEN-MINUS (-)
1254 Switch to the comment end state */
1255 $state = 'comment end';
1256 } elseif ($char === '>') {
1257 /* U+003E GREATER-THAN SIGN (>)
1258 Parse error. Emit the comment token. Switch to the
1259 data state. */
1260 $this->emitToken(array(
1261 'type' => self::PARSEERROR,
1262 'data' => 'incorrect-comment'
1263 ));
1264 $this->emitToken($this->token);
1265 $state = 'data';
1266 } elseif ($char === false) {
1267 /* Parse error. Emit the comment token. Reconsume the
1268 EOF character in the data state. */
1269 $this->emitToken(array(
1270 'type' => self::PARSEERROR,
1271 'data' => 'eof-in-comment'
1272 ));
1273 $this->emitToken($this->token);
1274 $this->stream->unget();
1275 $state = 'data';
1276 } else {
1277 $this->token['data'] .= '-' . $char;
1278 $state = 'comment';
1279 }
1280 break;
1281
1282 case 'comment':
1283 /* Consume the next input character: */
1284 $char = $this->stream->char();
1285
1286 if($char === '-') {
1287 /* U+002D HYPHEN-MINUS (-)
1288 Switch to the comment end dash state */
1289 $state = 'comment end dash';
1290
1291 } elseif($char === false) {
1292 /* EOF
1293 Parse error. Emit the comment token. Reconsume the EOF character
1294 in the data state. */
1295 $this->emitToken(array(
1296 'type' => self::PARSEERROR,
1297 'data' => 'eof-in-comment'
1298 ));
1299 $this->emitToken($this->token);
1300 $this->stream->unget();
1301 $state = 'data';
1302
1303 } else {
1304 /* Anything else
1305 Append the input character to the comment token's data. Stay in
1306 the comment state. */
1307 $chars = $this->stream->charsUntil('-');
1308
1309 $this->token['data'] .= $char . $chars;
1310 }
1311 break;
1312
1313 case 'comment end dash':
1314 /* Consume the next input character: */
1315 $char = $this->stream->char();
1316
1317 if($char === '-') {
1318 /* U+002D HYPHEN-MINUS (-)
1319 Switch to the comment end state */
1320 $state = 'comment end';
1321
1322 } elseif($char === false) {
1323 /* EOF
1324 Parse error. Emit the comment token. Reconsume the EOF character
1325 in the data state. */
1326 $this->emitToken(array(
1327 'type' => self::PARSEERROR,
1328 'data' => 'eof-in-comment-end-dash'
1329 ));
1330 $this->emitToken($this->token);
1331 $this->stream->unget();
1332 $state = 'data';
1333
1334 } else {
1335 /* Anything else
1336 Append a U+002D HYPHEN-MINUS (-) character and the input
1337 character to the comment token's data. Switch to the comment state. */
1338 $this->token['data'] .= '-'.$char;
1339 $state = 'comment';
1340 }
1341 break;
1342
1343 case 'comment end':
1344 /* Consume the next input character: */
1345 $char = $this->stream->char();
1346
1347 if($char === '>') {
1348 /* U+003E GREATER-THAN SIGN (>)
1349 Emit the comment token. Switch to the data state. */
1350 $this->emitToken($this->token);
1351 $state = 'data';
1352
1353 } elseif($char === '-') {
1354 /* U+002D HYPHEN-MINUS (-)
1355 Parse error. Append a U+002D HYPHEN-MINUS (-) character
1356 to the comment token's data. Stay in the comment end
1357 state. */
1358 $this->emitToken(array(
1359 'type' => self::PARSEERROR,
1360 'data' => 'unexpected-dash-after-double-dash-in-comment'
1361 ));
1362 $this->token['data'] .= '-';
1363
1364 } elseif($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') {
1365 $this->emitToken(array(
1366 'type' => self::PARSEERROR,
1367 'data' => 'unexpected-space-after-double-dash-in-comment'
1368 ));
1369 $this->token['data'] .= '--' . $char;
1370 $state = 'comment end space';
1371
1372 } elseif($char === '!') {
1373 $this->emitToken(array(
1374 'type' => self::PARSEERROR,
1375 'data' => 'unexpected-bang-after-double-dash-in-comment'
1376 ));
1377 $state = 'comment end bang';
1378
1379 } elseif($char === false) {
1380 /* EOF
1381 Parse error. Emit the comment token. Reconsume the
1382 EOF character in the data state. */
1383 $this->emitToken(array(
1384 'type' => self::PARSEERROR,
1385 'data' => 'eof-in-comment-double-dash'
1386 ));
1387 $this->emitToken($this->token);
1388 $this->stream->unget();
1389 $state = 'data';
1390
1391 } else {
1392 /* Anything else
1393 Parse error. Append two U+002D HYPHEN-MINUS (-)
1394 characters and the input character to the comment token's
1395 data. Switch to the comment state. */
1396 $this->emitToken(array(
1397 'type' => self::PARSEERROR,
1398 'data' => 'unexpected-char-in-comment'
1399 ));
1400 $this->token['data'] .= '--'.$char;
1401 $state = 'comment';
1402 }
1403 break;
1404
1405 case 'comment end bang':
1406 $char = $this->stream->char();
1407 if ($char === '>') {
1408 $this->emitToken($this->token);
1409 $state = 'data';
1410 } elseif ($char === "-") {
1411 $this->token['data'] .= '--!';
1412 $state = 'comment end dash';
1413 } elseif ($char === false) {
1414 $this->emitToken(array(
1415 'type' => self::PARSEERROR,
1416 'data' => 'eof-in-comment-end-bang'
1417 ));
1418 $this->emitToken($this->token);
1419 $this->stream->unget();
1420 $state = 'data';
1421 } else {
1422 $this->token['data'] .= '--!' . $char;
1423 $state = 'comment';
1424 }
1425 break;
1426
1427 case 'comment end space':
1428 $char = $this->stream->char();
1429 if ($char === '>') {
1430 $this->emitToken($this->token);
1431 $state = 'data';
1432 } elseif ($char === '-') {
1433 $state = 'comment end dash';
1434 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1435 $this->token['data'] .= $char;
1436 } elseif ($char === false) {
1437 $this->emitToken(array(
1438 'type' => self::PARSEERROR,
1439 'data' => 'unexpected-eof-in-comment-end-space',
1440 ));
1441 $this->emitToken($this->token);
1442 $this->stream->unget();
1443 $state = 'data';
1444 } else {
1445 $this->token['data'] .= $char;
1446 $state = 'comment';
1447 }
1448 break;
1449
1450 case 'DOCTYPE':
1451 /* Consume the next input character: */
1452 $char = $this->stream->char();
1453
1454 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1455 /* U+0009 CHARACTER TABULATION
1456 U+000A LINE FEED (LF)
1457 U+000C FORM FEED (FF)
1458 U+0020 SPACE
1459 Switch to the before DOCTYPE name state. */
1460 $state = 'before DOCTYPE name';
1461
1462 } elseif($char === false) {
1463 /* EOF
1464 Parse error. Create a new DOCTYPE token. Set its
1465 force-quirks flag to on. Emit the token. Reconsume the
1466 EOF character in the data state. */
1467 $this->emitToken(array(
1468 'type' => self::PARSEERROR,
1469 'data' => 'need-space-after-doctype-but-got-eof'
1470 ));
1471 $this->emitToken(array(
1472 'name' => '',
1473 'type' => self::DOCTYPE,
1474 'force-quirks' => true,
1475 'error' => true
1476 ));
1477 $this->stream->unget();
1478 $state = 'data';
1479
1480 } else {
1481 /* Anything else
1482 Parse error. Reconsume the current character in the
1483 before DOCTYPE name state. */
1484 $this->emitToken(array(
1485 'type' => self::PARSEERROR,
1486 'data' => 'need-space-after-doctype'
1487 ));
1488 $this->stream->unget();
1489 $state = 'before DOCTYPE name';
1490 }
1491 break;
1492
1493 case 'before DOCTYPE name':
1494 /* Consume the next input character: */
1495 $char = $this->stream->char();
1496
1497 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1498 /* U+0009 CHARACTER TABULATION
1499 U+000A LINE FEED (LF)
1500 U+000C FORM FEED (FF)
1501 U+0020 SPACE
1502 Stay in the before DOCTYPE name state. */
1503
1504 } elseif($char === '>') {
1505 /* U+003E GREATER-THAN SIGN (>)
1506 Parse error. Create a new DOCTYPE token. Set its
1507 force-quirks flag to on. Emit the token. Switch to the
1508 data state. */
1509 $this->emitToken(array(
1510 'type' => self::PARSEERROR,
1511 'data' => 'expected-doctype-name-but-got-right-bracket'
1512 ));
1513 $this->emitToken(array(
1514 'name' => '',
1515 'type' => self::DOCTYPE,
1516 'force-quirks' => true,
1517 'error' => true
1518 ));
1519
1520 $state = 'data';
1521
1522 } elseif('A' <= $char && $char <= 'Z') {
1523 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1524 Create a new DOCTYPE token. Set the token's name to the
1525 lowercase version of the input character (add 0x0020 to
1526 the character's code point). Switch to the DOCTYPE name
1527 state. */
1528 $this->token = array(
1529 'name' => strtolower($char),
1530 'type' => self::DOCTYPE,
1531 'error' => true
1532 );
1533
1534 $state = 'DOCTYPE name';
1535
1536 } elseif($char === false) {
1537 /* EOF
1538 Parse error. Create a new DOCTYPE token. Set its
1539 force-quirks flag to on. Emit the token. Reconsume the
1540 EOF character in the data state. */
1541 $this->emitToken(array(
1542 'type' => self::PARSEERROR,
1543 'data' => 'expected-doctype-name-but-got-eof'
1544 ));
1545 $this->emitToken(array(
1546 'name' => '',
1547 'type' => self::DOCTYPE,
1548 'force-quirks' => true,
1549 'error' => true
1550 ));
1551
1552 $this->stream->unget();
1553 $state = 'data';
1554
1555 } else {
1556 /* Anything else
1557 Create a new DOCTYPE token. Set the token's name to the
1558 current input character. Switch to the DOCTYPE name state. */
1559 $this->token = array(
1560 'name' => $char,
1561 'type' => self::DOCTYPE,
1562 'error' => true
1563 );
1564
1565 $state = 'DOCTYPE name';
1566 }
1567 break;
1568
1569 case 'DOCTYPE name':
1570 /* Consume the next input character: */
1571 $char = $this->stream->char();
1572
1573 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1574 /* U+0009 CHARACTER TABULATION
1575 U+000A LINE FEED (LF)
1576 U+000C FORM FEED (FF)
1577 U+0020 SPACE
1578 Switch to the after DOCTYPE name state. */
1579 $state = 'after DOCTYPE name';
1580
1581 } elseif($char === '>') {
1582 /* U+003E GREATER-THAN SIGN (>)
1583 Emit the current DOCTYPE token. Switch to the data state. */
1584 $this->emitToken($this->token);
1585 $state = 'data';
1586
1587 } elseif('A' <= $char && $char <= 'Z') {
1588 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1589 Append the lowercase version of the input character
1590 (add 0x0020 to the character's code point) to the current
1591 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1592 $this->token['name'] .= strtolower($char);
1593
1594 } elseif($char === false) {
1595 /* EOF
1596 Parse error. Set the DOCTYPE token's force-quirks flag
1597 to on. Emit that DOCTYPE token. Reconsume the EOF
1598 character in the data state. */
1599 $this->emitToken(array(
1600 'type' => self::PARSEERROR,
1601 'data' => 'eof-in-doctype-name'
1602 ));
1603 $this->token['force-quirks'] = true;
1604 $this->emitToken($this->token);
1605 $this->stream->unget();
1606 $state = 'data';
1607
1608 } else {
1609 /* Anything else
1610 Append the current input character to the current
1611 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1612 $this->token['name'] .= $char;
1613 }
1614
1615 // XXX this is probably some sort of quirks mode designation,
1616 // check tree-builder to be sure. In general 'error' needs
1617 // to be specc'ified, this probably means removing it at the end
1618 $this->token['error'] = ($this->token['name'] === 'HTML')
1619 ? false
1620 : true;
1621 break;
1622
1623 case 'after DOCTYPE name':
1624 /* Consume the next input character: */
1625 $char = $this->stream->char();
1626
1627 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1628 /* U+0009 CHARACTER TABULATION
1629 U+000A LINE FEED (LF)
1630 U+000C FORM FEED (FF)
1631 U+0020 SPACE
1632 Stay in the after DOCTYPE name state. */
1633
1634 } elseif($char === '>') {
1635 /* U+003E GREATER-THAN SIGN (>)
1636 Emit the current DOCTYPE token. Switch to the data state. */
1637 $this->emitToken($this->token);
1638 $state = 'data';
1639
1640 } elseif($char === false) {
1641 /* EOF
1642 Parse error. Set the DOCTYPE token's force-quirks flag
1643 to on. Emit that DOCTYPE token. Reconsume the EOF
1644 character in the data state. */
1645 $this->emitToken(array(
1646 'type' => self::PARSEERROR,
1647 'data' => 'eof-in-doctype'
1648 ));
1649 $this->token['force-quirks'] = true;
1650 $this->emitToken($this->token);
1651 $this->stream->unget();
1652 $state = 'data';
1653
1654 } else {
1655 /* Anything else */
1656
1657 $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1658 if ($nextSix === 'PUBLIC') {
1659 /* If the next six characters are an ASCII
1660 case-insensitive match for the word "PUBLIC", then
1661 consume those characters and switch to the before
1662 DOCTYPE public identifier state. */
1663 $state = 'before DOCTYPE public identifier';
1664
1665 } elseif ($nextSix === 'SYSTEM') {
1666 /* Otherwise, if the next six characters are an ASCII
1667 case-insensitive match for the word "SYSTEM", then
1668 consume those characters and switch to the before
1669 DOCTYPE system identifier state. */
1670 $state = 'before DOCTYPE system identifier';
1671
1672 } else {
1673 /* Otherwise, this is the parse error. Set the DOCTYPE
1674 token's force-quirks flag to on. Switch to the bogus
1675 DOCTYPE state. */
1676 $this->emitToken(array(
1677 'type' => self::PARSEERROR,
1678 'data' => 'expected-space-or-right-bracket-in-doctype'
1679 ));
1680 $this->token['force-quirks'] = true;
1681 $this->token['error'] = true;
1682 $state = 'bogus DOCTYPE';
1683 }
1684 }
1685 break;
1686
1687 case 'before DOCTYPE public identifier':
1688 /* Consume the next input character: */
1689 $char = $this->stream->char();
1690
1691 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1692 /* U+0009 CHARACTER TABULATION
1693 U+000A LINE FEED (LF)
1694 U+000C FORM FEED (FF)
1695 U+0020 SPACE
1696 Stay in the before DOCTYPE public identifier state. */
1697 } elseif ($char === '"') {
1698 /* U+0022 QUOTATION MARK (")
1699 Set the DOCTYPE token's public identifier to the empty
1700 string (not missing), then switch to the DOCTYPE public
1701 identifier (double-quoted) state. */
1702 $this->token['public'] = '';
1703 $state = 'DOCTYPE public identifier (double-quoted)';
1704 } elseif ($char === "'") {
1705 /* U+0027 APOSTROPHE (')
1706 Set the DOCTYPE token's public identifier to the empty
1707 string (not missing), then switch to the DOCTYPE public
1708 identifier (single-quoted) state. */
1709 $this->token['public'] = '';
1710 $state = 'DOCTYPE public identifier (single-quoted)';
1711 } elseif ($char === '>') {
1712 /* Parse error. Set the DOCTYPE token's force-quirks flag
1713 to on. Emit that DOCTYPE token. Switch to the data state. */
1714 $this->emitToken(array(
1715 'type' => self::PARSEERROR,
1716 'data' => 'unexpected-end-of-doctype'
1717 ));
1718 $this->token['force-quirks'] = true;
1719 $this->emitToken($this->token);
1720 $state = 'data';
1721 } elseif ($char === false) {
1722 /* Parse error. Set the DOCTYPE token's force-quirks
1723 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1724 character in the data state. */
1725 $this->emitToken(array(
1726 'type' => self::PARSEERROR,
1727 'data' => 'eof-in-doctype'
1728 ));
1729 $this->token['force-quirks'] = true;
1730 $this->emitToken($this->token);
1731 $this->stream->unget();
1732 $state = 'data';
1733 } else {
1734 /* Parse error. Set the DOCTYPE token's force-quirks flag
1735 to on. Switch to the bogus DOCTYPE state. */
1736 $this->emitToken(array(
1737 'type' => self::PARSEERROR,
1738 'data' => 'unexpected-char-in-doctype'
1739 ));
1740 $this->token['force-quirks'] = true;
1741 $state = 'bogus DOCTYPE';
1742 }
1743 break;
1744
1745 case 'DOCTYPE public identifier (double-quoted)':
1746 /* Consume the next input character: */
1747 $char = $this->stream->char();
1748
1749 if ($char === '"') {
1750 /* U+0022 QUOTATION MARK (")
1751 Switch to the after DOCTYPE public identifier state. */
1752 $state = 'after DOCTYPE public identifier';
1753 } elseif ($char === '>') {
1754 /* U+003E GREATER-THAN SIGN (>)
1755 Parse error. Set the DOCTYPE token's force-quirks flag
1756 to on. Emit that DOCTYPE token. Switch to the data state. */
1757 $this->emitToken(array(
1758 'type' => self::PARSEERROR,
1759 'data' => 'unexpected-end-of-doctype'
1760 ));
1761 $this->token['force-quirks'] = true;
1762 $this->emitToken($this->token);
1763 $state = 'data';
1764 } elseif ($char === false) {
1765 /* EOF
1766 Parse error. Set the DOCTYPE token's force-quirks flag
1767 to on. Emit that DOCTYPE token. Reconsume the EOF
1768 character in the data state. */
1769 $this->emitToken(array(
1770 'type' => self::PARSEERROR,
1771 'data' => 'eof-in-doctype'
1772 ));
1773 $this->token['force-quirks'] = true;
1774 $this->emitToken($this->token);
1775 $this->stream->unget();
1776 $state = 'data';
1777 } else {
1778 /* Anything else
1779 Append the current input character to the current
1780 DOCTYPE token's public identifier. Stay in the DOCTYPE
1781 public identifier (double-quoted) state. */
1782 $this->token['public'] .= $char;
1783 }
1784 break;
1785
1786 case 'DOCTYPE public identifier (single-quoted)':
1787 /* Consume the next input character: */
1788 $char = $this->stream->char();
1789
1790 if ($char === "'") {
1791 /* U+0027 APOSTROPHE (')
1792 Switch to the after DOCTYPE public identifier state. */
1793 $state = 'after DOCTYPE public identifier';
1794 } elseif ($char === '>') {
1795 /* U+003E GREATER-THAN SIGN (>)
1796 Parse error. Set the DOCTYPE token's force-quirks flag
1797 to on. Emit that DOCTYPE token. Switch to the data state. */
1798 $this->emitToken(array(
1799 'type' => self::PARSEERROR,
1800 'data' => 'unexpected-end-of-doctype'
1801 ));
1802 $this->token['force-quirks'] = true;
1803 $this->emitToken($this->token);
1804 $state = 'data';
1805 } elseif ($char === false) {
1806 /* EOF
1807 Parse error. Set the DOCTYPE token's force-quirks flag
1808 to on. Emit that DOCTYPE token. Reconsume the EOF
1809 character in the data state. */
1810 $this->emitToken(array(
1811 'type' => self::PARSEERROR,
1812 'data' => 'eof-in-doctype'
1813 ));
1814 $this->token['force-quirks'] = true;
1815 $this->emitToken($this->token);
1816 $this->stream->unget();
1817 $state = 'data';
1818 } else {
1819 /* Anything else
1820 Append the current input character to the current
1821 DOCTYPE token's public identifier. Stay in the DOCTYPE
1822 public identifier (double-quoted) state. */
1823 $this->token['public'] .= $char;
1824 }
1825 break;
1826
1827 case 'after DOCTYPE public identifier':
1828 /* Consume the next input character: */
1829 $char = $this->stream->char();
1830
1831 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1832 /* U+0009 CHARACTER TABULATION
1833 U+000A LINE FEED (LF)
1834 U+000C FORM FEED (FF)
1835 U+0020 SPACE
1836 Stay in the after DOCTYPE public identifier state. */
1837 } elseif ($char === '"') {
1838 /* U+0022 QUOTATION MARK (")
1839 Set the DOCTYPE token's system identifier to the
1840 empty string (not missing), then switch to the DOCTYPE
1841 system identifier (double-quoted) state. */
1842 $this->token['system'] = '';
1843 $state = 'DOCTYPE system identifier (double-quoted)';
1844 } elseif ($char === "'") {
1845 /* U+0027 APOSTROPHE (')
1846 Set the DOCTYPE token's system identifier to the
1847 empty string (not missing), then switch to the DOCTYPE
1848 system identifier (single-quoted) state. */
1849 $this->token['system'] = '';
1850 $state = 'DOCTYPE system identifier (single-quoted)';
1851 } elseif ($char === '>') {
1852 /* U+003E GREATER-THAN SIGN (>)
1853 Emit the current DOCTYPE token. Switch to the data state. */
1854 $this->emitToken($this->token);
1855 $state = 'data';
1856 } elseif ($char === false) {
1857 /* Parse error. Set the DOCTYPE token's force-quirks
1858 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1859 character in the data state. */
1860 $this->emitToken(array(
1861 'type' => self::PARSEERROR,
1862 'data' => 'eof-in-doctype'
1863 ));
1864 $this->token['force-quirks'] = true;
1865 $this->emitToken($this->token);
1866 $this->stream->unget();
1867 $state = 'data';
1868 } else {
1869 /* Anything else
1870 Parse error. Set the DOCTYPE token's force-quirks flag
1871 to on. Switch to the bogus DOCTYPE state. */
1872 $this->emitToken(array(
1873 'type' => self::PARSEERROR,
1874 'data' => 'unexpected-char-in-doctype'
1875 ));
1876 $this->token['force-quirks'] = true;
1877 $state = 'bogus DOCTYPE';
1878 }
1879 break;
1880
1881 case 'before DOCTYPE system identifier':
1882 /* Consume the next input character: */
1883 $char = $this->stream->char();
1884
1885 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1886 /* U+0009 CHARACTER TABULATION
1887 U+000A LINE FEED (LF)
1888 U+000C FORM FEED (FF)
1889 U+0020 SPACE
1890 Stay in the before DOCTYPE system identifier state. */
1891 } elseif ($char === '"') {
1892 /* U+0022 QUOTATION MARK (")
1893 Set the DOCTYPE token's system identifier to the empty
1894 string (not missing), then switch to the DOCTYPE system
1895 identifier (double-quoted) state. */
1896 $this->token['system'] = '';
1897 $state = 'DOCTYPE system identifier (double-quoted)';
1898 } elseif ($char === "'") {
1899 /* U+0027 APOSTROPHE (')
1900 Set the DOCTYPE token's system identifier to the empty
1901 string (not missing), then switch to the DOCTYPE system
1902 identifier (single-quoted) state. */
1903 $this->token['system'] = '';
1904 $state = 'DOCTYPE system identifier (single-quoted)';
1905 } elseif ($char === '>') {
1906 /* Parse error. Set the DOCTYPE token's force-quirks flag
1907 to on. Emit that DOCTYPE token. Switch to the data state. */
1908 $this->emitToken(array(
1909 'type' => self::PARSEERROR,
1910 'data' => 'unexpected-char-in-doctype'
1911 ));
1912 $this->token['force-quirks'] = true;
1913 $this->emitToken($this->token);
1914 $state = 'data';
1915 } elseif ($char === false) {
1916 /* Parse error. Set the DOCTYPE token's force-quirks
1917 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1918 character in the data state. */
1919 $this->emitToken(array(
1920 'type' => self::PARSEERROR,
1921 'data' => 'eof-in-doctype'
1922 ));
1923 $this->token['force-quirks'] = true;
1924 $this->emitToken($this->token);
1925 $this->stream->unget();
1926 $state = 'data';
1927 } else {
1928 /* Parse error. Set the DOCTYPE token's force-quirks flag
1929 to on. Switch to the bogus DOCTYPE state. */
1930 $this->emitToken(array(
1931 'type' => self::PARSEERROR,
1932 'data' => 'unexpected-char-in-doctype'
1933 ));
1934 $this->token['force-quirks'] = true;
1935 $state = 'bogus DOCTYPE';
1936 }
1937 break;
1938
1939 case 'DOCTYPE system identifier (double-quoted)':
1940 /* Consume the next input character: */
1941 $char = $this->stream->char();
1942
1943 if ($char === '"') {
1944 /* U+0022 QUOTATION MARK (")
1945 Switch to the after DOCTYPE system identifier state. */
1946 $state = 'after DOCTYPE system identifier';
1947 } elseif ($char === '>') {
1948 /* U+003E GREATER-THAN SIGN (>)
1949 Parse error. Set the DOCTYPE token's force-quirks flag
1950 to on. Emit that DOCTYPE token. Switch to the data state. */
1951 $this->emitToken(array(
1952 'type' => self::PARSEERROR,
1953 'data' => 'unexpected-end-of-doctype'
1954 ));
1955 $this->token['force-quirks'] = true;
1956 $this->emitToken($this->token);
1957 $state = 'data';
1958 } elseif ($char === false) {
1959 /* EOF
1960 Parse error. Set the DOCTYPE token's force-quirks flag
1961 to on. Emit that DOCTYPE token. Reconsume the EOF
1962 character in the data state. */
1963 $this->emitToken(array(
1964 'type' => self::PARSEERROR,
1965 'data' => 'eof-in-doctype'
1966 ));
1967 $this->token['force-quirks'] = true;
1968 $this->emitToken($this->token);
1969 $this->stream->unget();
1970 $state = 'data';
1971 } else {
1972 /* Anything else
1973 Append the current input character to the current
1974 DOCTYPE token's system identifier. Stay in the DOCTYPE
1975 system identifier (double-quoted) state. */
1976 $this->token['system'] .= $char;
1977 }
1978 break;
1979
1980 case 'DOCTYPE system identifier (single-quoted)':
1981 /* Consume the next input character: */
1982 $char = $this->stream->char();
1983
1984 if ($char === "'") {
1985 /* U+0027 APOSTROPHE (')
1986 Switch to the after DOCTYPE system identifier state. */
1987 $state = 'after DOCTYPE system identifier';
1988 } elseif ($char === '>') {
1989 /* U+003E GREATER-THAN SIGN (>)
1990 Parse error. Set the DOCTYPE token's force-quirks flag
1991 to on. Emit that DOCTYPE token. Switch to the data state. */
1992 $this->emitToken(array(
1993 'type' => self::PARSEERROR,
1994 'data' => 'unexpected-end-of-doctype'
1995 ));
1996 $this->token['force-quirks'] = true;
1997 $this->emitToken($this->token);
1998 $state = 'data';
1999 } elseif ($char === false) {
2000 /* EOF
2001 Parse error. Set the DOCTYPE token's force-quirks flag
2002 to on. Emit that DOCTYPE token. Reconsume the EOF
2003 character in the data state. */
2004 $this->emitToken(array(
2005 'type' => self::PARSEERROR,
2006 'data' => 'eof-in-doctype'
2007 ));
2008 $this->token['force-quirks'] = true;
2009 $this->emitToken($this->token);
2010 $this->stream->unget();
2011 $state = 'data';
2012 } else {
2013 /* Anything else
2014 Append the current input character to the current
2015 DOCTYPE token's system identifier. Stay in the DOCTYPE
2016 system identifier (double-quoted) state. */
2017 $this->token['system'] .= $char;
2018 }
2019 break;
2020
2021 case 'after DOCTYPE system identifier':
2022 /* Consume the next input character: */
2023 $char = $this->stream->char();
2024
2025 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2026 /* U+0009 CHARACTER TABULATION
2027 U+000A LINE FEED (LF)
2028 U+000C FORM FEED (FF)
2029 U+0020 SPACE
2030 Stay in the after DOCTYPE system identifier state. */
2031 } elseif ($char === '>') {
2032 /* U+003E GREATER-THAN SIGN (>)
2033 Emit the current DOCTYPE token. Switch to the data state. */
2034 $this->emitToken($this->token);
2035 $state = 'data';
2036 } elseif ($char === false) {
2037 /* Parse error. Set the DOCTYPE token's force-quirks
2038 flag to on. Emit that DOCTYPE token. Reconsume the EOF
2039 character in the data state. */
2040 $this->emitToken(array(
2041 'type' => self::PARSEERROR,
2042 'data' => 'eof-in-doctype'
2043 ));
2044 $this->token['force-quirks'] = true;
2045 $this->emitToken($this->token);
2046 $this->stream->unget();
2047 $state = 'data';
2048 } else {
2049 /* Anything else
2050 Parse error. Switch to the bogus DOCTYPE state.
2051 (This does not set the DOCTYPE token's force-quirks
2052 flag to on.) */
2053 $this->emitToken(array(
2054 'type' => self::PARSEERROR,
2055 'data' => 'unexpected-char-in-doctype'
2056 ));
2057 $state = 'bogus DOCTYPE';
2058 }
2059 break;
2060
2061 case 'bogus DOCTYPE':
2062 /* Consume the next input character: */
2063 $char = $this->stream->char();
2064
2065 if ($char === '>') {
2066 /* U+003E GREATER-THAN SIGN (>)
2067 Emit the DOCTYPE token. Switch to the data state. */
2068 $this->emitToken($this->token);
2069 $state = 'data';
2070
2071 } elseif($char === false) {
2072 /* EOF
2073 Emit the DOCTYPE token. Reconsume the EOF character in
2074 the data state. */
2075 $this->emitToken($this->token);
2076 $this->stream->unget();
2077 $state = 'data';
2078
2079 } else {
2080 /* Anything else
2081 Stay in the bogus DOCTYPE state. */
2082 }
2083 break;
2084
2085 // case 'cdataSection':
2086
2087 }
2088 }
2089 }
2090
2091 /**
2092 * Returns a serialized representation of the tree.
2093 */
2094 public function save() {
2095 return $this->tree->save();
2096 }
2097
2098 /**
2099 * Returns the input stream.
2100 */
2101 public function stream() {
2102 return $this->stream;
2103 }
2104
2105 private function consumeCharacterReference($allowed = false, $inattr = false) {
2106 // This goes quite far against spec, and is far closer to the Python
2107 // impl., mainly because we don't do the large unconsuming the spec
2108 // requires.
2109
2110 // All consumed characters.
2111 $chars = $this->stream->char();
2112
2113 /* This section defines how to consume a character
2114 reference. This definition is used when parsing character
2115 references in text and in attributes.
2116
2117 The behavior depends on the identity of the next character
2118 (the one immediately after the U+0026 AMPERSAND character): */
2119
2120 if (
2121 $chars[0] === "\x09" ||
2122 $chars[0] === "\x0A" ||
2123 $chars[0] === "\x0C" ||
2124 $chars[0] === "\x20" ||
2125 $chars[0] === '<' ||
2126 $chars[0] === '&' ||
2127 $chars === false ||
2128 $chars[0] === $allowed
2129 ) {
2130 /* U+0009 CHARACTER TABULATION
2131 U+000A LINE FEED (LF)
2132 U+000C FORM FEED (FF)
2133 U+0020 SPACE
2134 U+003C LESS-THAN SIGN
2135 U+0026 AMPERSAND
2136 EOF
2137 The additional allowed character, if there is one
2138 Not a character reference. No characters are consumed,
2139 and nothing is returned. (This is not an error, either.) */
2140 // We already consumed, so unconsume.
2141 $this->stream->unget();
2142 return '&';
2143 } elseif ($chars[0] === '#') {
2144 /* Consume the U+0023 NUMBER SIGN. */
2145 // Um, yeah, we already did that.
2146 /* The behavior further depends on the character after
2147 the U+0023 NUMBER SIGN: */
2148 $chars .= $this->stream->char();
2149 if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2150 /* U+0078 LATIN SMALL LETTER X
2151 U+0058 LATIN CAPITAL LETTER X */
2152 /* Consume the X. */
2153 // Um, yeah, we already did that.
2154 /* Follow the steps below, but using the range of
2155 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2156 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2157 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2158 A, through to U+0046 LATIN CAPITAL LETTER F (in other
2159 words, 0123456789, ABCDEF, abcdef). */
2160 $char_class = self::HEX;
2161 /* When it comes to interpreting the
2162 number, interpret it as a hexadecimal number. */
2163 $hex = true;
2164 } else {
2165 /* Anything else */
2166 // Unconsume because we shouldn't have consumed this.
2167 $chars = $chars[0];
2168 $this->stream->unget();
2169 /* Follow the steps below, but using the range of
2170 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2171 NINE (i.e. just 0123456789). */
2172 $char_class = self::DIGIT;
2173 /* When it comes to interpreting the number,
2174 interpret it as a decimal number. */
2175 $hex = false;
2176 }
2177
2178 /* Consume as many characters as match the range of characters given above. */
2179 $consumed = $this->stream->charsWhile($char_class);
2180 if ($consumed === '' || $consumed === false) {
2181 /* If no characters match the range, then don't consume
2182 any characters (and unconsume the U+0023 NUMBER SIGN
2183 character and, if appropriate, the X character). This
2184 is a parse error; nothing is returned. */
2185 $this->emitToken(array(
2186 'type' => self::PARSEERROR,
2187 'data' => 'expected-numeric-entity'
2188 ));
2189 return '&' . $chars;
2190 } else {
2191 /* Otherwise, if the next character is a U+003B SEMICOLON,
2192 consume that too. If it isn't, there is a parse error. */
2193 if ($this->stream->char() !== ';') {
2194 $this->stream->unget();
2195 $this->emitToken(array(
2196 'type' => self::PARSEERROR,
2197 'data' => 'numeric-entity-without-semicolon'
2198 ));
2199 }
2200
2201 /* If one or more characters match the range, then take
2202 them all and interpret the string of characters as a number
2203 (either hexadecimal or decimal as appropriate). */
2204 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2205
2206 /* If that number is one of the numbers in the first column
2207 of the following table, then this is a parse error. Find the
2208 row with that number in the first column, and return a
2209 character token for the Unicode character given in the
2210 second column of that row. */
2211 $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2212 if ($new_codepoint) {
2213 $this->emitToken(array(
2214 'type' => self::PARSEERROR,
2215 'data' => 'illegal-windows-1252-entity'
2216 ));
2217 return HTML5_Data::utf8chr($new_codepoint);
2218 } else {
2219 /* Otherwise, if the number is greater than 0x10FFFF, then
2220 * this is a parse error. Return a U+FFFD REPLACEMENT
2221 * CHARACTER. */
2222 if ($codepoint > 0x10FFFF) {
2223 $this->emitToken(array(
2224 'type' => self::PARSEERROR,
2225 'data' => 'overlong-character-entity' // XXX probably not correct
2226 ));
2227 return "\xEF\xBF\xBD";
2228 }
2229 /* Otherwise, return a character token for the Unicode
2230 * character whose code point is that number. If the
2231 * number is in the range 0x0001 to 0x0008, 0x000E to
2232 * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2233 * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2234 * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2235 * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2236 * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2237 * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2238 * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2239 * or 0x10FFFF, then this is a parse error. */
2240 // && has higher precedence than ||
2241 if (
2242 $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2243 $codepoint === 0x000B ||
2244 $codepoint >= 0x000E && $codepoint <= 0x001F ||
2245 $codepoint >= 0x007F && $codepoint <= 0x009F ||
2246 $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2247 $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2248 ($codepoint & 0xFFFE) === 0xFFFE ||
2249 $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2250 ) {
2251 $this->emitToken(array(
2252 'type' => self::PARSEERROR,
2253 'data' => 'illegal-codepoint-for-numeric-entity'
2254 ));
2255 }
2256 return HTML5_Data::utf8chr($codepoint);
2257 }
2258 }
2259
2260 } else {
2261 /* Anything else */
2262
2263 /* Consume the maximum number of characters possible,
2264 with the consumed characters matching one of the
2265 identifiers in the first column of the named character
2266 references table (in a case-sensitive manner). */
2267 // What we actually do here is consume as much as we can while it
2268 // matches the start of one of the identifiers in the first column.
2269
2270 $refs = HTML5_Data::getNamedCharacterReferences();
2271
2272 // Get the longest string which is the start of an identifier
2273 // ($chars) as well as the longest identifier which matches ($id)
2274 // and its codepoint ($codepoint).
2275 $codepoint = false;
2276 $char = $chars;
2277 while ($char !== false && isset($refs[$char])) {
2278 $refs = $refs[$char];
2279 if (isset($refs['codepoint'])) {
2280 $id = $chars;
2281 $codepoint = $refs['codepoint'];
2282 }
2283 $chars .= $char = $this->stream->char();
2284 }
2285
2286 // Unconsume the one character we just took which caused the while
2287 // statement to fail. This could be anything and could cause state
2288 // changes (as if it matches the while loop it must be
2289 // alphanumeric so we can just concat it to whatever we get later).
2290 $this->stream->unget();
2291 if ($char !== false) {
2292 $chars = substr($chars, 0, -1);
2293 }
2294
2295 /* If no match can be made, then this is a parse error.
2296 No characters are consumed, and nothing is returned. */
2297 if (!$codepoint) {
2298 $this->emitToken(array(
2299 'type' => self::PARSEERROR,
2300 'data' => 'expected-named-entity'
2301 ));
2302 return '&' . $chars;
2303 }
2304
2305 /* If the last character matched is not a U+003B SEMICOLON
2306 (;), there is a parse error. */
2307 $semicolon = true;
2308 if (substr($id, -1) !== ';') {
2309 $this->emitToken(array(
2310 'type' => self::PARSEERROR,
2311 'data' => 'named-entity-without-semicolon'
2312 ));
2313 $semicolon = false;
2314 }
2315
2316 /* If the character reference is being consumed as part of
2317 an attribute, and the last character matched is not a
2318 U+003B SEMICOLON (;), and the next character is in the
2319 range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2320 LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2321 or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2322 then, for historical reasons, all the characters that were
2323 matched after the U+0026 AMPERSAND (&) must be unconsumed,
2324 and nothing is returned. */
2325 if ($inattr && !$semicolon) {
2326 // The next character is either the next character in $chars or in the stream.
2327 if (strlen($chars) > strlen($id)) {
2328 $next = substr($chars, strlen($id), 1);
2329 } else {
2330 $next = $this->stream->char();
2331 $this->stream->unget();
2332 }
2333 if (
2334 '0' <= $next && $next <= '9' ||
2335 'A' <= $next && $next <= 'Z' ||
2336 'a' <= $next && $next <= 'z'
2337 ) {
2338 return '&' . $chars;
2339 }
2340 }
2341
2342 /* Otherwise, return a character token for the character
2343 corresponding to the character reference name (as given
2344 by the second column of the named character references table). */
2345 return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
2346 }
2347 }
2348
2349 private function characterReferenceInAttributeValue($allowed = false) {
2350 /* Attempt to consume a character reference. */
2351 $entity = $this->consumeCharacterReference($allowed, true);
2352
2353 /* If nothing is returned, append a U+0026 AMPERSAND
2354 character to the current attribute's value.
2355
2356 Otherwise, append the returned character token to the
2357 current attribute's value. */
2358 $char = (!$entity)
2359 ? '&'
2360 : $entity;
2361
2362 $last = count($this->token['attr']) - 1;
2363 $this->token['attr'][$last]['value'] .= $char;
2364
2365 /* Finally, switch back to the attribute value state that you
2366 were in when were switched into this state. */
2367 }
2368
2369 /**
2370 * Emits a token, passing it on to the tree builder.
2371 */
2372 protected function emitToken($token, $checkStream = true, $dry = false) {
2373 if ($checkStream) {
2374 // Emit errors from input stream.
2375 while ($this->stream->errors) {
2376 $this->emitToken(array_shift($this->stream->errors), false);
2377 }
2378 }
2379 if($token['type'] === self::ENDTAG && !empty($token['attr'])) {
2380 for ($i = 0; $i < count($token['attr']); $i++) {
2381 $this->emitToken(array(
2382 'type' => self::PARSEERROR,
2383 'data' => 'attributes-in-end-tag'
2384 ));
2385 }
2386 }
2387 if($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
2388 $this->emitToken(array(
2389 'type' => self::PARSEERROR,
2390 'data' => 'self-closing-flag-on-end-tag',
2391 ));
2392 }
2393 if($token['type'] === self::STARTTAG) {
2394 // This could be changed to actually pass the tree-builder a hash
2395 $hash = array();
2396 foreach ($token['attr'] as $keypair) {
2397 if (isset($hash[$keypair['name']])) {
2398 $this->emitToken(array(
2399 'type' => self::PARSEERROR,
2400 'data' => 'duplicate-attribute',
2401 ));
2402 } else {
2403 $hash[$keypair['name']] = $keypair['value'];
2404 }
2405 }
2406 }
2407
2408 if(!$dry) {
2409 // the current structure of attributes is not a terribly good one
2410 $this->tree->emitToken($token);
2411 }
2412
2413 if(!$dry && is_int($this->tree->content_model)) {
2414 $this->content_model = $this->tree->content_model;
2415 $this->tree->content_model = null;
2416
2417 } elseif($token['type'] === self::ENDTAG) {
2418 $this->content_model = self::PCDATA;
2419 }
2420 }
2421}
2422