]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/libraries/html5/Tokenizer.php
5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 // /* */ indicates verbatim text from the HTML 5 specification
32 // // indicates regular comments
34 // all flags are in hyphenated form
36 class HTML5_Tokenizer
{
38 * Points to an InputStream object.
43 * Tree builder that the tokenizer emits tokens to.
48 * Current content model we are parsing as.
50 protected $content_model;
53 * Current token that is being built, but not yet emitted. Also
54 * is the last token emitted, if applicable.
58 // These are constants describing the content model
64 // These are constants describing tokens
65 // XXX should probably be moved somewhere else, probably the
72 const SPACECHARACTER
= 5;
76 // These are constants representing bunches of characters.
77 const ALPHA
= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
78 const UPPER_ALPHA
= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
79 const LOWER_ALPHA
= 'abcdefghijklmnopqrstuvwxyz';
80 const DIGIT
= '0123456789';
81 const HEX
= '0123456789ABCDEFabcdef';
82 const WHITESPACE
= "\t\n\x0c ";
85 * @param $data Data to parse
87 public function __construct($data, $builder = null) {
88 $this->stream
= new HTML5_InputStream($data);
89 if (!$builder) $this->tree
= new HTML5_TreeBuilder
;
90 else $this->tree
= $builder;
91 $this->content_model
= self
::PCDATA
;
94 public function parseFragment($context = null) {
95 $this->tree
->setupContext($context);
96 if ($this->tree
->content_model
) {
97 $this->content_model
= $this->tree
->content_model
;
98 $this->tree
->content_model
= null;
103 // XXX maybe convert this into an iterator? regardless, this function
104 // and the save function should go into a Parser facade of some sort
106 * Performs the actual parsing of the document.
108 public function parse() {
111 // This is used to avoid having to have look-behind in the data state.
114 * Escape flag as specified by the HTML5 specification: "used to
115 * control the behavior of the tokeniser. It is either true or
116 * false, and initially must be set to the false state."
120 while($state !== null) {
123 switch ($this->content_model) {
124 case self::PCDATA: echo 'PCDATA'; break;
125 case self::RCDATA: echo 'RCDATA'; break;
126 case self::CDATA: echo 'CDATA'; break;
127 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
129 if ($escape) echo " escape";
135 /* Consume the next input character */
136 $char = $this->stream
->char();
137 $lastFourChars .= $char;
138 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
140 // see below for meaning
144 $this->content_model
=== self
::RCDATA
||
145 $this->content_model
=== self
::CDATA
150 $this->content_model
=== self
::PCDATA
||
151 $this->content_model
=== self
::RCDATA
154 $this->content_model
=== self
::PCDATA
||
157 $this->content_model
=== self
::RCDATA
||
158 $this->content_model
=== self
::CDATA
165 $this->content_model
=== self
::RCDATA
||
166 $this->content_model
=== self
::CDATA
169 if($char === '&' && $amp_cond) {
170 /* U+0026 AMPERSAND (&)
171 When the content model flag is set to one of the PCDATA or RCDATA
172 states and the escape flag is false: switch to the
173 character reference data state. Otherwise: treat it as per
174 the "anything else" entry below. */
175 $state = 'character reference data';
180 $lastFourChars === '<!--'
183 U+002D HYPHEN-MINUS (-)
184 If the content model flag is set to either the RCDATA state or
185 the CDATA state, and the escape flag is false, and there are at
186 least three characters before this one in the input stream, and the
187 last four characters in the input stream, including this one, are
188 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
189 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
192 /* In any case, emit the input character as a character token. Stay
193 in the data state. */
194 $this->emitToken(array(
195 'type' => self
::CHARACTER
,
198 // We do the "any case" part as part of "anything else".
200 /* U+003C LESS-THAN SIGN (<) */
201 } elseif($char === '<' && $lt_cond) {
202 /* When the content model flag is set to the PCDATA state: switch
203 to the tag open state.
205 When the content model flag is set to either the RCDATA state or
206 the CDATA state and the escape flag is false: switch to the tag
209 Otherwise: treat it as per the "anything else" entry below. */
212 /* U+003E GREATER-THAN SIGN (>) */
216 substr($lastFourChars, 1) === '-->'
218 /* If the content model flag is set to either the RCDATA state or
219 the CDATA state, and the escape flag is true, and the last three
220 characters in the input stream including this one are U+002D
221 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
222 set the escape flag to false. */
225 /* In any case, emit the input character as a character token.
226 Stay in the data state. */
227 $this->emitToken(array(
228 'type' => self
::CHARACTER
,
231 // We do the "any case" part as part of "anything else".
233 } elseif($char === false) {
235 Emit an end-of-file token. */
237 $this->tree
->emitToken(array(
241 } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
242 // Directly after emitting a token you switch back to the "data
243 // state". At that point spaceCharacters are important so they are
244 // emitted separately.
245 $chars = $this->stream
->charsWhile(self
::WHITESPACE
);
246 $this->emitToken(array(
247 'type' => self
::SPACECHARACTER
,
248 'data' => $char . $chars
250 $lastFourChars .= $chars;
251 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
255 THIS IS AN OPTIMIZATION: Get as many characters as would
256 otherwise also be treated as character tokens and emit them
257 as a single character token. Stay in the data state. */
260 if ($hyp_cond) $mask .= '-';
261 if ($amp_cond) $mask .= '&';
262 if ($lt_cond) $mask .= '<';
263 if ($gt_cond) $mask .= '>';
266 $chars = $this->stream
->remainingChars();
268 $chars = $this->stream
->charsUntil($mask);
271 $this->emitToken(array(
272 'type' => self
::CHARACTER
,
273 'data' => $char . $chars
276 $lastFourChars .= $chars;
277 if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
283 case 'character reference data':
284 /* (This cannot happen if the content model flag
285 is set to the CDATA state.) */
287 /* Attempt to consume a character reference, with no
288 additional allowed character. */
289 $entity = $this->consumeCharacterReference();
291 /* If nothing is returned, emit a U+0026 AMPERSAND
292 character token. Otherwise, emit the character token that
294 // This is all done when consuming the character reference.
295 $this->emitToken(array(
296 'type' => self
::CHARACTER
,
300 /* Finally, switch to the data state. */
305 $char = $this->stream
->char();
307 switch($this->content_model
) {
310 /* Consume the next input character. If it is a
311 U+002F SOLIDUS (/) character, switch to the close
312 tag open state. Otherwise, emit a U+003C LESS-THAN
313 SIGN character token and reconsume the current input
314 character in the data state. */
315 // We consumed above.
318 $state = 'close tag open';
321 $this->emitToken(array(
322 'type' => self
::CHARACTER
,
326 $this->stream
->unget();
333 /* If the content model flag is set to the PCDATA state
334 Consume the next input character: */
335 // We consumed above.
338 /* U+0021 EXCLAMATION MARK (!)
339 Switch to the markup declaration open state. */
340 $state = 'markup declaration open';
342 } elseif($char === '/') {
343 /* U+002F SOLIDUS (/)
344 Switch to the close tag open state. */
345 $state = 'close tag open';
347 } elseif('A' <= $char && $char <= 'Z') {
348 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
349 Create a new start tag token, set its tag name to the lowercase
350 version of the input character (add 0x0020 to the character's code
351 point), then switch to the tag name state. (Don't emit the token
352 yet; further details will be filled in before it is emitted.) */
353 $this->token
= array(
354 'name' => strtolower($char),
355 'type' => self
::STARTTAG
,
361 } elseif('a' <= $char && $char <= 'z') {
362 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
363 Create a new start tag token, set its tag name to the input
364 character, then switch to the tag name state. (Don't emit
365 the token yet; further details will be filled in before it
367 $this->token
= array(
369 'type' => self
::STARTTAG
,
375 } elseif($char === '>') {
376 /* U+003E GREATER-THAN SIGN (>)
377 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
378 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
379 $this->emitToken(array(
380 'type' => self
::PARSEERROR
,
381 'data' => 'expected-tag-name-but-got-right-bracket'
383 $this->emitToken(array(
384 'type' => self
::CHARACTER
,
390 } elseif($char === '?') {
391 /* U+003F QUESTION MARK (?)
392 Parse error. Switch to the bogus comment state. */
393 $this->emitToken(array(
394 'type' => self
::PARSEERROR
,
395 'data' => 'expected-tag-name-but-got-question-mark'
397 $this->token
= array(
399 'type' => self
::COMMENT
401 $state = 'bogus comment';
405 Parse error. Emit a U+003C LESS-THAN SIGN character token and
406 reconsume the current input character in the data state. */
407 $this->emitToken(array(
408 'type' => self
::PARSEERROR
,
409 'data' => 'expected-tag-name'
411 $this->emitToken(array(
412 'type' => self
::CHARACTER
,
417 $this->stream
->unget();
423 case 'close tag open':
425 $this->content_model
=== self
::RCDATA
||
426 $this->content_model
=== self
::CDATA
428 /* If the content model flag is set to the RCDATA or CDATA
430 $name = strtolower($this->stream
->charsWhile(self
::ALPHA
));
431 $following = $this->stream
->char();
432 $this->stream
->unget();
435 $this->token
['name'] !== $name ||
436 $this->token
['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
438 /* if no start tag token has ever been emitted by this instance
439 of the tokenizer (fragment case), or, if the next few
440 characters do not match the tag name of the last start tag
441 token emitted (compared in an ASCII case-insensitive manner),
442 or if they do but they are not immediately followed by one of
443 the following characters:
445 * U+0009 CHARACTER TABULATION
446 * U+000A LINE FEED (LF)
447 * U+000C FORM FEED (FF)
449 * U+003E GREATER-THAN SIGN (>)
453 ...then emit a U+003C LESS-THAN SIGN character token, a
454 U+002F SOLIDUS character token, and switch to the data
455 state to process the next input character. */
456 // XXX: Probably ought to replace in_array with $following === x ||...
458 // We also need to emit $name now we've consumed that, as we
459 // know it'll just be emitted as a character token.
460 $this->emitToken(array(
461 'type' => self
::CHARACTER
,
462 'data' => '</' . $name
467 // This matches what would happen if we actually did the
468 // otherwise below (but we can't because we've consumed too
471 // Start the end tag token with the name we already have.
472 $this->token
= array(
474 'type' => self
::ENDTAG
477 // Change to tag name state.
480 } elseif ($this->content_model
=== self
::PCDATA
) {
481 /* Otherwise, if the content model flag is set to the PCDATA
483 $char = $this->stream
->char();
485 if ('A' <= $char && $char <= 'Z') {
486 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
487 Create a new end tag token, set its tag name to the lowercase version
488 of the input character (add 0x0020 to the character's code point), then
489 switch to the tag name state. (Don't emit the token yet; further details
490 will be filled in before it is emitted.) */
491 $this->token
= array(
492 'name' => strtolower($char),
493 'type' => self
::ENDTAG
498 } elseif ('a' <= $char && $char <= 'z') {
499 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
500 Create a new end tag token, set its tag name to the
501 input character, then switch to the tag name state.
502 (Don't emit the token yet; further details will be
503 filled in before it is emitted.) */
504 $this->token
= array(
506 'type' => self
::ENDTAG
511 } elseif($char === '>') {
512 /* U+003E GREATER-THAN SIGN (>)
513 Parse error. Switch to the data state. */
514 $this->emitToken(array(
515 'type' => self
::PARSEERROR
,
516 'data' => 'expected-closing-tag-but-got-right-bracket'
520 } elseif($char === false) {
522 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
523 SOLIDUS character token. Reconsume the EOF character in the data state. */
524 $this->emitToken(array(
525 'type' => self
::PARSEERROR
,
526 'data' => 'expected-closing-tag-but-got-eof'
528 $this->emitToken(array(
529 'type' => self
::CHARACTER
,
533 $this->stream
->unget();
537 /* Parse error. Switch to the bogus comment state. */
538 $this->emitToken(array(
539 'type' => self
::PARSEERROR
,
540 'data' => 'expected-closing-tag-but-got-char'
542 $this->token
= array(
544 'type' => self
::COMMENT
546 $state = 'bogus comment';
552 /* Consume the next input character: */
553 $char = $this->stream
->char();
555 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
556 /* U+0009 CHARACTER TABULATION
557 U+000A LINE FEED (LF)
558 U+000C FORM FEED (FF)
560 Switch to the before attribute name state. */
561 $state = 'before attribute name';
563 } elseif($char === '/') {
564 /* U+002F SOLIDUS (/)
565 Switch to the self-closing start tag state. */
566 $state = 'self-closing start tag';
568 } elseif($char === '>') {
569 /* U+003E GREATER-THAN SIGN (>)
570 Emit the current tag token. Switch to the data state. */
571 $this->emitToken($this->token
);
574 } elseif('A' <= $char && $char <= 'Z') {
575 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
576 Append the lowercase version of the current input
577 character (add 0x0020 to the character's code point) to
578 the current tag token's tag name. Stay in the tag name state. */
579 $chars = $this->stream
->charsWhile(self
::UPPER_ALPHA
);
581 $this->token
['name'] .= strtolower($char . $chars);
584 } elseif($char === false) {
586 Parse error. Reconsume the EOF character in the data state. */
587 $this->emitToken(array(
588 'type' => self
::PARSEERROR
,
589 'data' => 'eof-in-tag-name'
592 $this->stream
->unget();
597 Append the current input character to the current tag token's tag name.
598 Stay in the tag name state. */
599 $chars = $this->stream
->charsUntil("\t\n\x0C />" . self
::UPPER_ALPHA
);
601 $this->token
['name'] .= $char . $chars;
606 case 'before attribute name':
607 /* Consume the next input character: */
608 $char = $this->stream
->char();
610 // this conditional is optimized, check bottom
611 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
612 /* U+0009 CHARACTER TABULATION
613 U+000A LINE FEED (LF)
614 U+000C FORM FEED (FF)
616 Stay in the before attribute name state. */
617 $state = 'before attribute name';
619 } elseif($char === '/') {
620 /* U+002F SOLIDUS (/)
621 Switch to the self-closing start tag state. */
622 $state = 'self-closing start tag';
624 } elseif($char === '>') {
625 /* U+003E GREATER-THAN SIGN (>)
626 Emit the current tag token. Switch to the data state. */
627 $this->emitToken($this->token
);
630 } elseif('A' <= $char && $char <= 'Z') {
631 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
632 Start a new attribute in the current tag token. Set that
633 attribute's name to the lowercase version of the current
634 input character (add 0x0020 to the character's code
635 point), and its value to the empty string. Switch to the
636 attribute name state.*/
637 $this->token
['attr'][] = array(
638 'name' => strtolower($char),
642 $state = 'attribute name';
644 } elseif($char === false) {
646 Parse error. Reconsume the EOF character in the data state. */
647 $this->emitToken(array(
648 'type' => self
::PARSEERROR
,
649 'data' => 'expected-attribute-name-but-got-eof'
652 $this->stream
->unget();
656 /* U+0022 QUOTATION MARK (")
657 U+0027 APOSTROPHE (')
658 U+003C LESS-THAN SIGN (<)
659 U+003D EQUALS SIGN (=)
660 Parse error. Treat it as per the "anything else" entry
662 if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
663 $this->emitToken(array(
664 'type' => self
::PARSEERROR
,
665 'data' => 'invalid-character-in-attribute-name'
670 Start a new attribute in the current tag token. Set that attribute's
671 name to the current input character, and its value to the empty string.
672 Switch to the attribute name state. */
673 $this->token
['attr'][] = array(
678 $state = 'attribute name';
682 case 'attribute name':
683 // Consume the next input character:
684 $char = $this->stream
->char();
686 // this conditional is optimized, check bottom
687 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
688 /* U+0009 CHARACTER TABULATION
689 U+000A LINE FEED (LF)
690 U+000C FORM FEED (FF)
692 Switch to the after attribute name state. */
693 $state = 'after attribute name';
695 } elseif($char === '/') {
696 /* U+002F SOLIDUS (/)
697 Switch to the self-closing start tag state. */
698 $state = 'self-closing start tag';
700 } elseif($char === '=') {
701 /* U+003D EQUALS SIGN (=)
702 Switch to the before attribute value state. */
703 $state = 'before attribute value';
705 } elseif($char === '>') {
706 /* U+003E GREATER-THAN SIGN (>)
707 Emit the current tag token. Switch to the data state. */
708 $this->emitToken($this->token
);
711 } elseif('A' <= $char && $char <= 'Z') {
712 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
713 Append the lowercase version of the current input
714 character (add 0x0020 to the character's code point) to
715 the current attribute's name. Stay in the attribute name
717 $chars = $this->stream
->charsWhile(self
::UPPER_ALPHA
);
719 $last = count($this->token
['attr']) - 1;
720 $this->token
['attr'][$last]['name'] .= strtolower($char . $chars);
722 $state = 'attribute name';
724 } elseif($char === false) {
726 Parse error. Reconsume the EOF character in the data state. */
727 $this->emitToken(array(
728 'type' => self
::PARSEERROR
,
729 'data' => 'eof-in-attribute-name'
732 $this->stream
->unget();
736 /* U+0022 QUOTATION MARK (")
737 U+0027 APOSTROPHE (')
738 U+003C LESS-THAN SIGN (<)
739 Parse error. Treat it as per the "anything else"
741 if($char === '"' || $char === "'" || $char === '<') {
742 $this->emitToken(array(
743 'type' => self
::PARSEERROR
,
744 'data' => 'invalid-character-in-attribute-name'
749 Append the current input character to the current attribute's name.
750 Stay in the attribute name state. */
751 $chars = $this->stream
->charsUntil("\t\n\x0C /=>\"'" . self
::UPPER_ALPHA
);
753 $last = count($this->token
['attr']) - 1;
754 $this->token
['attr'][$last]['name'] .= $char . $chars;
756 $state = 'attribute name';
759 /* When the user agent leaves the attribute name state
760 (and before emitting the tag token, if appropriate), the
761 complete attribute's name must be compared to the other
762 attributes on the same token; if there is already an
763 attribute on the token with the exact same name, then this
764 is a parse error and the new attribute must be dropped, along
765 with the value that gets associated with it (if any). */
766 // this might be implemented in the emitToken method
769 case 'after attribute name':
770 // Consume the next input character:
771 $char = $this->stream
->char();
773 // this is an optimized conditional, check the bottom
774 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
775 /* U+0009 CHARACTER TABULATION
776 U+000A LINE FEED (LF)
777 U+000C FORM FEED (FF)
779 Stay in the after attribute name state. */
780 $state = 'after attribute name';
782 } elseif($char === '/') {
783 /* U+002F SOLIDUS (/)
784 Switch to the self-closing start tag state. */
785 $state = 'self-closing start tag';
787 } elseif($char === '=') {
788 /* U+003D EQUALS SIGN (=)
789 Switch to the before attribute value state. */
790 $state = 'before attribute value';
792 } elseif($char === '>') {
793 /* U+003E GREATER-THAN SIGN (>)
794 Emit the current tag token. Switch to the data state. */
795 $this->emitToken($this->token
);
798 } elseif('A' <= $char && $char <= 'Z') {
799 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
800 Start a new attribute in the current tag token. Set that
801 attribute's name to the lowercase version of the current
802 input character (add 0x0020 to the character's code
803 point), and its value to the empty string. Switch to the
804 attribute name state. */
805 $this->token
['attr'][] = array(
806 'name' => strtolower($char),
810 $state = 'attribute name';
812 } elseif($char === false) {
814 Parse error. Reconsume the EOF character in the data state. */
815 $this->emitToken(array(
816 'type' => self
::PARSEERROR
,
817 'data' => 'expected-end-of-tag-but-got-eof'
820 $this->stream
->unget();
824 /* U+0022 QUOTATION MARK (")
825 U+0027 APOSTROPHE (')
826 U+003C LESS-THAN SIGN(<)
827 Parse error. Treat it as per the "anything else"
829 if($char === '"' || $char === "'" || $char === "<") {
830 $this->emitToken(array(
831 'type' => self
::PARSEERROR
,
832 'data' => 'invalid-character-after-attribute-name'
837 Start a new attribute in the current tag token. Set that attribute's
838 name to the current input character, and its value to the empty string.
839 Switch to the attribute name state. */
840 $this->token
['attr'][] = array(
845 $state = 'attribute name';
849 case 'before attribute value':
850 // Consume the next input character:
851 $char = $this->stream
->char();
853 // this is an optimized conditional
854 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
855 /* U+0009 CHARACTER TABULATION
856 U+000A LINE FEED (LF)
857 U+000C FORM FEED (FF)
859 Stay in the before attribute value state. */
860 $state = 'before attribute value';
862 } elseif($char === '"') {
863 /* U+0022 QUOTATION MARK (")
864 Switch to the attribute value (double-quoted) state. */
865 $state = 'attribute value (double-quoted)';
867 } elseif($char === '&') {
868 /* U+0026 AMPERSAND (&)
869 Switch to the attribute value (unquoted) state and reconsume
870 this input character. */
871 $this->stream
->unget();
872 $state = 'attribute value (unquoted)';
874 } elseif($char === '\'') {
875 /* U+0027 APOSTROPHE (')
876 Switch to the attribute value (single-quoted) state. */
877 $state = 'attribute value (single-quoted)';
879 } elseif($char === '>') {
880 /* U+003E GREATER-THAN SIGN (>)
881 Parse error. Emit the current tag token. Switch to the data state. */
882 $this->emitToken(array(
883 'type' => self
::PARSEERROR
,
884 'data' => 'expected-attribute-value-but-got-right-bracket'
886 $this->emitToken($this->token
);
889 } elseif($char === false) {
891 Parse error. Reconsume the EOF character in the data state. */
892 $this->emitToken(array(
893 'type' => self
::PARSEERROR
,
894 'data' => 'expected-attribute-value-but-got-eof'
896 $this->stream
->unget();
900 /* U+003D EQUALS SIGN (=)
901 * U+003C LESS-THAN SIGN (<)
902 Parse error. Treat it as per the "anything else" entry below. */
903 if($char === '=' || $char === '<') {
904 $this->emitToken(array(
905 'type' => self
::PARSEERROR
,
906 'data' => 'equals-in-unquoted-attribute-value'
911 Append the current input character to the current attribute's value.
912 Switch to the attribute value (unquoted) state. */
913 $last = count($this->token
['attr']) - 1;
914 $this->token
['attr'][$last]['value'] .= $char;
916 $state = 'attribute value (unquoted)';
920 case 'attribute value (double-quoted)':
921 // Consume the next input character:
922 $char = $this->stream
->char();
925 /* U+0022 QUOTATION MARK (")
926 Switch to the after attribute value (quoted) state. */
927 $state = 'after attribute value (quoted)';
929 } elseif($char === '&') {
930 /* U+0026 AMPERSAND (&)
931 Switch to the character reference in attribute value
932 state, with the additional allowed character
933 being U+0022 QUOTATION MARK ("). */
934 $this->characterReferenceInAttributeValue('"');
936 } elseif($char === false) {
938 Parse error. Reconsume the EOF character in the data state. */
939 $this->emitToken(array(
940 'type' => self
::PARSEERROR
,
941 'data' => 'eof-in-attribute-value-double-quote'
944 $this->stream
->unget();
949 Append the current input character to the current attribute's value.
950 Stay in the attribute value (double-quoted) state. */
951 $chars = $this->stream
->charsUntil('"&');
953 $last = count($this->token
['attr']) - 1;
954 $this->token
['attr'][$last]['value'] .= $char . $chars;
956 $state = 'attribute value (double-quoted)';
960 case 'attribute value (single-quoted)':
961 // Consume the next input character:
962 $char = $this->stream
->char();
965 /* U+0027 APOSTROPHE (')
966 Switch to the after attribute value (quoted) state. */
967 $state = 'after attribute value (quoted)';
969 } elseif($char === '&') {
970 /* U+0026 AMPERSAND (&)
971 Switch to the entity in attribute value state. */
972 $this->characterReferenceInAttributeValue("'");
974 } elseif($char === false) {
976 Parse error. Reconsume the EOF character in the data state. */
977 $this->emitToken(array(
978 'type' => self
::PARSEERROR
,
979 'data' => 'eof-in-attribute-value-single-quote'
982 $this->stream
->unget();
987 Append the current input character to the current attribute's value.
988 Stay in the attribute value (single-quoted) state. */
989 $chars = $this->stream
->charsUntil("'&");
991 $last = count($this->token
['attr']) - 1;
992 $this->token
['attr'][$last]['value'] .= $char . $chars;
994 $state = 'attribute value (single-quoted)';
998 case 'attribute value (unquoted)':
999 // Consume the next input character:
1000 $char = $this->stream
->char();
1002 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1003 /* U+0009 CHARACTER TABULATION
1004 U+000A LINE FEED (LF)
1005 U+000C FORM FEED (FF)
1007 Switch to the before attribute name state. */
1008 $state = 'before attribute name';
1010 } elseif($char === '&') {
1011 /* U+0026 AMPERSAND (&)
1012 Switch to the entity in attribute value state, with the
1013 additional allowed character being U+003E
1014 GREATER-THAN SIGN (>). */
1015 $this->characterReferenceInAttributeValue('>');
1017 } elseif($char === '>') {
1018 /* U+003E GREATER-THAN SIGN (>)
1019 Emit the current tag token. Switch to the data state. */
1020 $this->emitToken($this->token
);
1023 } elseif ($char === false) {
1025 Parse error. Reconsume the EOF character in the data state. */
1026 $this->emitToken(array(
1027 'type' => self
::PARSEERROR
,
1028 'data' => 'eof-in-attribute-value-no-quotes'
1030 $this->stream
->unget();
1034 /* U+0022 QUOTATION MARK (")
1035 U+0027 APOSTROPHE (')
1036 U+003C LESS-THAN SIGN (<)
1037 U+003D EQUALS SIGN (=)
1038 Parse error. Treat it as per the "anything else"
1040 if($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1041 $this->emitToken(array(
1042 'type' => self
::PARSEERROR
,
1043 'data' => 'unexpected-character-in-unquoted-attribute-value'
1048 Append the current input character to the current attribute's value.
1049 Stay in the attribute value (unquoted) state. */
1050 $chars = $this->stream
->charsUntil("\t\n\x0c &>\"'=");
1052 $last = count($this->token
['attr']) - 1;
1053 $this->token
['attr'][$last]['value'] .= $char . $chars;
1055 $state = 'attribute value (unquoted)';
1059 case 'after attribute value (quoted)':
1060 /* Consume the next input character: */
1061 $char = $this->stream
->char();
1063 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1064 /* U+0009 CHARACTER TABULATION
1065 U+000A LINE FEED (LF)
1066 U+000C FORM FEED (FF)
1068 Switch to the before attribute name state. */
1069 $state = 'before attribute name';
1071 } elseif ($char === '/') {
1072 /* U+002F SOLIDUS (/)
1073 Switch to the self-closing start tag state. */
1074 $state = 'self-closing start tag';
1076 } elseif ($char === '>') {
1077 /* U+003E GREATER-THAN SIGN (>)
1078 Emit the current tag token. Switch to the data state. */
1079 $this->emitToken($this->token
);
1082 } elseif ($char === false) {
1084 Parse error. Reconsume the EOF character in the data state. */
1085 $this->emitToken(array(
1086 'type' => self
::PARSEERROR
,
1087 'data' => 'unexpected-EOF-after-attribute-value'
1089 $this->stream
->unget();
1094 Parse error. Reconsume the character in the before attribute
1096 $this->emitToken(array(
1097 'type' => self
::PARSEERROR
,
1098 'data' => 'unexpected-character-after-attribute-value'
1100 $this->stream
->unget();
1101 $state = 'before attribute name';
1105 case 'self-closing start tag':
1106 /* Consume the next input character: */
1107 $char = $this->stream
->char();
1109 if ($char === '>') {
1110 /* U+003E GREATER-THAN SIGN (>)
1111 Set the self-closing flag of the current tag token.
1112 Emit the current tag token. Switch to the data state. */
1113 // not sure if this is the name we want
1114 $this->token
['self-closing'] = true;
1115 $this->emitToken($this->token
);
1118 } elseif ($char === false) {
1120 Parse error. Reconsume the EOF character in the data state. */
1121 $this->emitToken(array(
1122 'type' => self
::PARSEERROR
,
1123 'data' => 'unexpected-eof-after-self-closing'
1125 $this->stream
->unget();
1130 Parse error. Reconsume the character in the before attribute name state. */
1131 $this->emitToken(array(
1132 'type' => self
::PARSEERROR
,
1133 'data' => 'unexpected-character-after-self-closing'
1135 $this->stream
->unget();
1136 $state = 'before attribute name';
1140 case 'bogus comment':
1141 /* (This can only happen if the content model flag is set to the PCDATA state.) */
1142 /* Consume every character up to the first U+003E GREATER-THAN SIGN
1143 character (>) or the end of the file (EOF), whichever comes first. Emit
1144 a comment token whose data is the concatenation of all the characters
1145 starting from and including the character that caused the state machine
1146 to switch into the bogus comment state, up to and including the last
1147 consumed character before the U+003E character, if any, or up to the
1148 end of the file otherwise. (If the comment was started by the end of
1149 the file (EOF), the token is empty.) */
1150 $this->token
['data'] .= (string) $this->stream
->charsUntil('>');
1151 $this->stream
->char();
1153 $this->emitToken($this->token
);
1155 /* Switch to the data state. */
1159 case 'markup declaration open':
1160 // Consume for below
1161 $hyphens = $this->stream
->charsWhile('-', 2);
1162 if ($hyphens === '-') {
1163 $this->stream
->unget();
1165 if ($hyphens !== '--') {
1166 $alpha = $this->stream
->charsWhile(self
::ALPHA
, 7);
1169 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1170 characters, consume those two characters, create a comment token whose
1171 data is the empty string, and switch to the comment state. */
1172 if($hyphens === '--') {
1173 $state = 'comment start';
1174 $this->token
= array(
1176 'type' => self
::COMMENT
1179 /* Otherwise if the next seven characters are a case-insensitive match
1180 for the word "DOCTYPE", then consume those characters and switch to the
1182 } elseif(strtoupper($alpha) === 'DOCTYPE') {
1185 // XXX not implemented
1186 /* Otherwise, if the insertion mode is "in foreign content"
1187 and the current node is not an element in the HTML namespace
1188 and the next seven characters are an ASCII case-sensitive
1189 match for the string "[CDATA[" (the five uppercase letters
1190 "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1191 and after), then consume those characters and switch to the
1192 CDATA section state (which is unrelated to the content model
1193 flag's CDATA state). */
1195 /* Otherwise, it is a parse error. Switch to the bogus comment state.
1196 The next character that is consumed, if any, is the first character
1197 that will be in the comment. */
1199 $this->emitToken(array(
1200 'type' => self
::PARSEERROR
,
1201 'data' => 'expected-dashes-or-doctype'
1203 $this->token
= array(
1204 'data' => (string) $alpha,
1205 'type' => self
::COMMENT
1207 $state = 'bogus comment';
1211 case 'comment start':
1212 /* Consume the next input character: */
1213 $char = $this->stream
->char();
1215 if ($char === '-') {
1216 /* U+002D HYPHEN-MINUS (-)
1217 Switch to the comment start dash state. */
1218 $state = 'comment start dash';
1219 } elseif ($char === '>') {
1220 /* U+003E GREATER-THAN SIGN (>)
1221 Parse error. Emit the comment token. Switch to the
1223 $this->emitToken(array(
1224 'type' => self
::PARSEERROR
,
1225 'data' => 'incorrect-comment'
1227 $this->emitToken($this->token
);
1229 } elseif ($char === false) {
1231 Parse error. Emit the comment token. Reconsume the
1232 EOF character in the data state. */
1233 $this->emitToken(array(
1234 'type' => self
::PARSEERROR
,
1235 'data' => 'eof-in-comment'
1237 $this->emitToken($this->token
);
1238 $this->stream
->unget();
1242 Append the input character to the comment token's
1243 data. Switch to the comment state. */
1244 $this->token
['data'] .= $char;
1249 case 'comment start dash':
1250 /* Consume the next input character: */
1251 $char = $this->stream
->char();
1252 if ($char === '-') {
1253 /* U+002D HYPHEN-MINUS (-)
1254 Switch to the comment end state */
1255 $state = 'comment end';
1256 } elseif ($char === '>') {
1257 /* U+003E GREATER-THAN SIGN (>)
1258 Parse error. Emit the comment token. Switch to the
1260 $this->emitToken(array(
1261 'type' => self
::PARSEERROR
,
1262 'data' => 'incorrect-comment'
1264 $this->emitToken($this->token
);
1266 } elseif ($char === false) {
1267 /* Parse error. Emit the comment token. Reconsume the
1268 EOF character in the data state. */
1269 $this->emitToken(array(
1270 'type' => self
::PARSEERROR
,
1271 'data' => 'eof-in-comment'
1273 $this->emitToken($this->token
);
1274 $this->stream
->unget();
1277 $this->token
['data'] .= '-' . $char;
1283 /* Consume the next input character: */
1284 $char = $this->stream
->char();
1287 /* U+002D HYPHEN-MINUS (-)
1288 Switch to the comment end dash state */
1289 $state = 'comment end dash';
1291 } elseif($char === false) {
1293 Parse error. Emit the comment token. Reconsume the EOF character
1294 in the data state. */
1295 $this->emitToken(array(
1296 'type' => self
::PARSEERROR
,
1297 'data' => 'eof-in-comment'
1299 $this->emitToken($this->token
);
1300 $this->stream
->unget();
1305 Append the input character to the comment token's data. Stay in
1306 the comment state. */
1307 $chars = $this->stream
->charsUntil('-');
1309 $this->token
['data'] .= $char . $chars;
1313 case 'comment end dash':
1314 /* Consume the next input character: */
1315 $char = $this->stream
->char();
1318 /* U+002D HYPHEN-MINUS (-)
1319 Switch to the comment end state */
1320 $state = 'comment end';
1322 } elseif($char === false) {
1324 Parse error. Emit the comment token. Reconsume the EOF character
1325 in the data state. */
1326 $this->emitToken(array(
1327 'type' => self
::PARSEERROR
,
1328 'data' => 'eof-in-comment-end-dash'
1330 $this->emitToken($this->token
);
1331 $this->stream
->unget();
1336 Append a U+002D HYPHEN-MINUS (-) character and the input
1337 character to the comment token's data. Switch to the comment state. */
1338 $this->token
['data'] .= '-'.$char;
1344 /* Consume the next input character: */
1345 $char = $this->stream
->char();
1348 /* U+003E GREATER-THAN SIGN (>)
1349 Emit the comment token. Switch to the data state. */
1350 $this->emitToken($this->token
);
1353 } elseif($char === '-') {
1354 /* U+002D HYPHEN-MINUS (-)
1355 Parse error. Append a U+002D HYPHEN-MINUS (-) character
1356 to the comment token's data. Stay in the comment end
1358 $this->emitToken(array(
1359 'type' => self
::PARSEERROR
,
1360 'data' => 'unexpected-dash-after-double-dash-in-comment'
1362 $this->token
['data'] .= '-';
1364 } elseif($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') {
1365 $this->emitToken(array(
1366 'type' => self
::PARSEERROR
,
1367 'data' => 'unexpected-space-after-double-dash-in-comment'
1369 $this->token
['data'] .= '--' . $char;
1370 $state = 'comment end space';
1372 } elseif($char === '!') {
1373 $this->emitToken(array(
1374 'type' => self
::PARSEERROR
,
1375 'data' => 'unexpected-bang-after-double-dash-in-comment'
1377 $state = 'comment end bang';
1379 } elseif($char === false) {
1381 Parse error. Emit the comment token. Reconsume the
1382 EOF character in the data state. */
1383 $this->emitToken(array(
1384 'type' => self
::PARSEERROR
,
1385 'data' => 'eof-in-comment-double-dash'
1387 $this->emitToken($this->token
);
1388 $this->stream
->unget();
1393 Parse error. Append two U+002D HYPHEN-MINUS (-)
1394 characters and the input character to the comment token's
1395 data. Switch to the comment state. */
1396 $this->emitToken(array(
1397 'type' => self
::PARSEERROR
,
1398 'data' => 'unexpected-char-in-comment'
1400 $this->token
['data'] .= '--'.$char;
1405 case 'comment end bang':
1406 $char = $this->stream
->char();
1407 if ($char === '>') {
1408 $this->emitToken($this->token
);
1410 } elseif ($char === "-") {
1411 $this->token
['data'] .= '--!';
1412 $state = 'comment end dash';
1413 } elseif ($char === false) {
1414 $this->emitToken(array(
1415 'type' => self
::PARSEERROR
,
1416 'data' => 'eof-in-comment-end-bang'
1418 $this->emitToken($this->token
);
1419 $this->stream
->unget();
1422 $this->token
['data'] .= '--!' . $char;
1427 case 'comment end space':
1428 $char = $this->stream
->char();
1429 if ($char === '>') {
1430 $this->emitToken($this->token
);
1432 } elseif ($char === '-') {
1433 $state = 'comment end dash';
1434 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1435 $this->token
['data'] .= $char;
1436 } elseif ($char === false) {
1437 $this->emitToken(array(
1438 'type' => self
::PARSEERROR
,
1439 'data' => 'unexpected-eof-in-comment-end-space',
1441 $this->emitToken($this->token
);
1442 $this->stream
->unget();
1445 $this->token
['data'] .= $char;
1451 /* Consume the next input character: */
1452 $char = $this->stream
->char();
1454 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1455 /* U+0009 CHARACTER TABULATION
1456 U+000A LINE FEED (LF)
1457 U+000C FORM FEED (FF)
1459 Switch to the before DOCTYPE name state. */
1460 $state = 'before DOCTYPE name';
1462 } elseif($char === false) {
1464 Parse error. Create a new DOCTYPE token. Set its
1465 force-quirks flag to on. Emit the token. Reconsume the
1466 EOF character in the data state. */
1467 $this->emitToken(array(
1468 'type' => self
::PARSEERROR
,
1469 'data' => 'need-space-after-doctype-but-got-eof'
1471 $this->emitToken(array(
1473 'type' => self
::DOCTYPE
,
1474 'force-quirks' => true,
1477 $this->stream
->unget();
1482 Parse error. Reconsume the current character in the
1483 before DOCTYPE name state. */
1484 $this->emitToken(array(
1485 'type' => self
::PARSEERROR
,
1486 'data' => 'need-space-after-doctype'
1488 $this->stream
->unget();
1489 $state = 'before DOCTYPE name';
1493 case 'before DOCTYPE name':
1494 /* Consume the next input character: */
1495 $char = $this->stream
->char();
1497 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1498 /* U+0009 CHARACTER TABULATION
1499 U+000A LINE FEED (LF)
1500 U+000C FORM FEED (FF)
1502 Stay in the before DOCTYPE name state. */
1504 } elseif($char === '>') {
1505 /* U+003E GREATER-THAN SIGN (>)
1506 Parse error. Create a new DOCTYPE token. Set its
1507 force-quirks flag to on. Emit the token. Switch to the
1509 $this->emitToken(array(
1510 'type' => self
::PARSEERROR
,
1511 'data' => 'expected-doctype-name-but-got-right-bracket'
1513 $this->emitToken(array(
1515 'type' => self
::DOCTYPE
,
1516 'force-quirks' => true,
1522 } elseif('A' <= $char && $char <= 'Z') {
1523 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1524 Create a new DOCTYPE token. Set the token's name to the
1525 lowercase version of the input character (add 0x0020 to
1526 the character's code point). Switch to the DOCTYPE name
1528 $this->token
= array(
1529 'name' => strtolower($char),
1530 'type' => self
::DOCTYPE
,
1534 $state = 'DOCTYPE name';
1536 } elseif($char === false) {
1538 Parse error. Create a new DOCTYPE token. Set its
1539 force-quirks flag to on. Emit the token. Reconsume the
1540 EOF character in the data state. */
1541 $this->emitToken(array(
1542 'type' => self
::PARSEERROR
,
1543 'data' => 'expected-doctype-name-but-got-eof'
1545 $this->emitToken(array(
1547 'type' => self
::DOCTYPE
,
1548 'force-quirks' => true,
1552 $this->stream
->unget();
1557 Create a new DOCTYPE token. Set the token's name to the
1558 current input character. Switch to the DOCTYPE name state. */
1559 $this->token
= array(
1561 'type' => self
::DOCTYPE
,
1565 $state = 'DOCTYPE name';
1569 case 'DOCTYPE name':
1570 /* Consume the next input character: */
1571 $char = $this->stream
->char();
1573 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1574 /* U+0009 CHARACTER TABULATION
1575 U+000A LINE FEED (LF)
1576 U+000C FORM FEED (FF)
1578 Switch to the after DOCTYPE name state. */
1579 $state = 'after DOCTYPE name';
1581 } elseif($char === '>') {
1582 /* U+003E GREATER-THAN SIGN (>)
1583 Emit the current DOCTYPE token. Switch to the data state. */
1584 $this->emitToken($this->token
);
1587 } elseif('A' <= $char && $char <= 'Z') {
1588 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1589 Append the lowercase version of the input character
1590 (add 0x0020 to the character's code point) to the current
1591 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1592 $this->token
['name'] .= strtolower($char);
1594 } elseif($char === false) {
1596 Parse error. Set the DOCTYPE token's force-quirks flag
1597 to on. Emit that DOCTYPE token. Reconsume the EOF
1598 character in the data state. */
1599 $this->emitToken(array(
1600 'type' => self
::PARSEERROR
,
1601 'data' => 'eof-in-doctype-name'
1603 $this->token
['force-quirks'] = true;
1604 $this->emitToken($this->token
);
1605 $this->stream
->unget();
1610 Append the current input character to the current
1611 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1612 $this->token
['name'] .= $char;
1615 // XXX this is probably some sort of quirks mode designation,
1616 // check tree-builder to be sure. In general 'error' needs
1617 // to be specc'ified, this probably means removing it at the end
1618 $this->token
['error'] = ($this->token
['name'] === 'HTML')
1623 case 'after DOCTYPE name':
1624 /* Consume the next input character: */
1625 $char = $this->stream
->char();
1627 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1628 /* U+0009 CHARACTER TABULATION
1629 U+000A LINE FEED (LF)
1630 U+000C FORM FEED (FF)
1632 Stay in the after DOCTYPE name state. */
1634 } elseif($char === '>') {
1635 /* U+003E GREATER-THAN SIGN (>)
1636 Emit the current DOCTYPE token. Switch to the data state. */
1637 $this->emitToken($this->token
);
1640 } elseif($char === false) {
1642 Parse error. Set the DOCTYPE token's force-quirks flag
1643 to on. Emit that DOCTYPE token. Reconsume the EOF
1644 character in the data state. */
1645 $this->emitToken(array(
1646 'type' => self
::PARSEERROR
,
1647 'data' => 'eof-in-doctype'
1649 $this->token
['force-quirks'] = true;
1650 $this->emitToken($this->token
);
1651 $this->stream
->unget();
1657 $nextSix = strtoupper($char . $this->stream
->charsWhile(self
::ALPHA
, 5));
1658 if ($nextSix === 'PUBLIC') {
1659 /* If the next six characters are an ASCII
1660 case-insensitive match for the word "PUBLIC", then
1661 consume those characters and switch to the before
1662 DOCTYPE public identifier state. */
1663 $state = 'before DOCTYPE public identifier';
1665 } elseif ($nextSix === 'SYSTEM') {
1666 /* Otherwise, if the next six characters are an ASCII
1667 case-insensitive match for the word "SYSTEM", then
1668 consume those characters and switch to the before
1669 DOCTYPE system identifier state. */
1670 $state = 'before DOCTYPE system identifier';
1673 /* Otherwise, this is the parse error. Set the DOCTYPE
1674 token's force-quirks flag to on. Switch to the bogus
1676 $this->emitToken(array(
1677 'type' => self
::PARSEERROR
,
1678 'data' => 'expected-space-or-right-bracket-in-doctype'
1680 $this->token
['force-quirks'] = true;
1681 $this->token
['error'] = true;
1682 $state = 'bogus DOCTYPE';
1687 case 'before DOCTYPE public identifier':
1688 /* Consume the next input character: */
1689 $char = $this->stream
->char();
1691 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1692 /* U+0009 CHARACTER TABULATION
1693 U+000A LINE FEED (LF)
1694 U+000C FORM FEED (FF)
1696 Stay in the before DOCTYPE public identifier state. */
1697 } elseif ($char === '"') {
1698 /* U+0022 QUOTATION MARK (")
1699 Set the DOCTYPE token's public identifier to the empty
1700 string (not missing), then switch to the DOCTYPE public
1701 identifier (double-quoted) state. */
1702 $this->token
['public'] = '';
1703 $state = 'DOCTYPE public identifier (double-quoted)';
1704 } elseif ($char === "'") {
1705 /* U+0027 APOSTROPHE (')
1706 Set the DOCTYPE token's public identifier to the empty
1707 string (not missing), then switch to the DOCTYPE public
1708 identifier (single-quoted) state. */
1709 $this->token
['public'] = '';
1710 $state = 'DOCTYPE public identifier (single-quoted)';
1711 } elseif ($char === '>') {
1712 /* Parse error. Set the DOCTYPE token's force-quirks flag
1713 to on. Emit that DOCTYPE token. Switch to the data state. */
1714 $this->emitToken(array(
1715 'type' => self
::PARSEERROR
,
1716 'data' => 'unexpected-end-of-doctype'
1718 $this->token
['force-quirks'] = true;
1719 $this->emitToken($this->token
);
1721 } elseif ($char === false) {
1722 /* Parse error. Set the DOCTYPE token's force-quirks
1723 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1724 character in the data state. */
1725 $this->emitToken(array(
1726 'type' => self
::PARSEERROR
,
1727 'data' => 'eof-in-doctype'
1729 $this->token
['force-quirks'] = true;
1730 $this->emitToken($this->token
);
1731 $this->stream
->unget();
1734 /* Parse error. Set the DOCTYPE token's force-quirks flag
1735 to on. Switch to the bogus DOCTYPE state. */
1736 $this->emitToken(array(
1737 'type' => self
::PARSEERROR
,
1738 'data' => 'unexpected-char-in-doctype'
1740 $this->token
['force-quirks'] = true;
1741 $state = 'bogus DOCTYPE';
1745 case 'DOCTYPE public identifier (double-quoted)':
1746 /* Consume the next input character: */
1747 $char = $this->stream
->char();
1749 if ($char === '"') {
1750 /* U+0022 QUOTATION MARK (")
1751 Switch to the after DOCTYPE public identifier state. */
1752 $state = 'after DOCTYPE public identifier';
1753 } elseif ($char === '>') {
1754 /* U+003E GREATER-THAN SIGN (>)
1755 Parse error. Set the DOCTYPE token's force-quirks flag
1756 to on. Emit that DOCTYPE token. Switch to the data state. */
1757 $this->emitToken(array(
1758 'type' => self
::PARSEERROR
,
1759 'data' => 'unexpected-end-of-doctype'
1761 $this->token
['force-quirks'] = true;
1762 $this->emitToken($this->token
);
1764 } elseif ($char === false) {
1766 Parse error. Set the DOCTYPE token's force-quirks flag
1767 to on. Emit that DOCTYPE token. Reconsume the EOF
1768 character in the data state. */
1769 $this->emitToken(array(
1770 'type' => self
::PARSEERROR
,
1771 'data' => 'eof-in-doctype'
1773 $this->token
['force-quirks'] = true;
1774 $this->emitToken($this->token
);
1775 $this->stream
->unget();
1779 Append the current input character to the current
1780 DOCTYPE token's public identifier. Stay in the DOCTYPE
1781 public identifier (double-quoted) state. */
1782 $this->token
['public'] .= $char;
1786 case 'DOCTYPE public identifier (single-quoted)':
1787 /* Consume the next input character: */
1788 $char = $this->stream
->char();
1790 if ($char === "'") {
1791 /* U+0027 APOSTROPHE (')
1792 Switch to the after DOCTYPE public identifier state. */
1793 $state = 'after DOCTYPE public identifier';
1794 } elseif ($char === '>') {
1795 /* U+003E GREATER-THAN SIGN (>)
1796 Parse error. Set the DOCTYPE token's force-quirks flag
1797 to on. Emit that DOCTYPE token. Switch to the data state. */
1798 $this->emitToken(array(
1799 'type' => self
::PARSEERROR
,
1800 'data' => 'unexpected-end-of-doctype'
1802 $this->token
['force-quirks'] = true;
1803 $this->emitToken($this->token
);
1805 } elseif ($char === false) {
1807 Parse error. Set the DOCTYPE token's force-quirks flag
1808 to on. Emit that DOCTYPE token. Reconsume the EOF
1809 character in the data state. */
1810 $this->emitToken(array(
1811 'type' => self
::PARSEERROR
,
1812 'data' => 'eof-in-doctype'
1814 $this->token
['force-quirks'] = true;
1815 $this->emitToken($this->token
);
1816 $this->stream
->unget();
1820 Append the current input character to the current
1821 DOCTYPE token's public identifier. Stay in the DOCTYPE
1822 public identifier (single-quoted) state. */
1823 $this->token
['public'] .= $char;
1827 case 'after DOCTYPE public identifier':
1828 /* Consume the next input character: */
1829 $char = $this->stream
->char();
1831 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1832 /* U+0009 CHARACTER TABULATION
1833 U+000A LINE FEED (LF)
1834 U+000C FORM FEED (FF)
1836 Stay in the after DOCTYPE public identifier state. */
1837 } elseif ($char === '"') {
1838 /* U+0022 QUOTATION MARK (")
1839 Set the DOCTYPE token's system identifier to the
1840 empty string (not missing), then switch to the DOCTYPE
1841 system identifier (double-quoted) state. */
1842 $this->token
['system'] = '';
1843 $state = 'DOCTYPE system identifier (double-quoted)';
1844 } elseif ($char === "'") {
1845 /* U+0027 APOSTROPHE (')
1846 Set the DOCTYPE token's system identifier to the
1847 empty string (not missing), then switch to the DOCTYPE
1848 system identifier (single-quoted) state. */
1849 $this->token
['system'] = '';
1850 $state = 'DOCTYPE system identifier (single-quoted)';
1851 } elseif ($char === '>') {
1852 /* U+003E GREATER-THAN SIGN (>)
1853 Emit the current DOCTYPE token. Switch to the data state. */
1854 $this->emitToken($this->token
);
1856 } elseif ($char === false) {
1857 /* Parse error. Set the DOCTYPE token's force-quirks
1858 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1859 character in the data state. */
1860 $this->emitToken(array(
1861 'type' => self
::PARSEERROR
,
1862 'data' => 'eof-in-doctype'
1864 $this->token
['force-quirks'] = true;
1865 $this->emitToken($this->token
);
1866 $this->stream
->unget();
1870 Parse error. Set the DOCTYPE token's force-quirks flag
1871 to on. Switch to the bogus DOCTYPE state. */
1872 $this->emitToken(array(
1873 'type' => self
::PARSEERROR
,
1874 'data' => 'unexpected-char-in-doctype'
1876 $this->token
['force-quirks'] = true;
1877 $state = 'bogus DOCTYPE';
1881 case 'before DOCTYPE system identifier':
1882 /* Consume the next input character: */
1883 $char = $this->stream
->char();
1885 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1886 /* U+0009 CHARACTER TABULATION
1887 U+000A LINE FEED (LF)
1888 U+000C FORM FEED (FF)
1890 Stay in the before DOCTYPE system identifier state. */
1891 } elseif ($char === '"') {
1892 /* U+0022 QUOTATION MARK (")
1893 Set the DOCTYPE token's system identifier to the empty
1894 string (not missing), then switch to the DOCTYPE system
1895 identifier (double-quoted) state. */
1896 $this->token
['system'] = '';
1897 $state = 'DOCTYPE system identifier (double-quoted)';
1898 } elseif ($char === "'") {
1899 /* U+0027 APOSTROPHE (')
1900 Set the DOCTYPE token's system identifier to the empty
1901 string (not missing), then switch to the DOCTYPE system
1902 identifier (single-quoted) state. */
1903 $this->token
['system'] = '';
1904 $state = 'DOCTYPE system identifier (single-quoted)';
1905 } elseif ($char === '>') {
1906 /* Parse error. Set the DOCTYPE token's force-quirks flag
1907 to on. Emit that DOCTYPE token. Switch to the data state. */
1908 $this->emitToken(array(
1909 'type' => self
::PARSEERROR
,
1910 'data' => 'unexpected-char-in-doctype'
1912 $this->token
['force-quirks'] = true;
1913 $this->emitToken($this->token
);
1915 } elseif ($char === false) {
1916 /* Parse error. Set the DOCTYPE token's force-quirks
1917 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1918 character in the data state. */
1919 $this->emitToken(array(
1920 'type' => self
::PARSEERROR
,
1921 'data' => 'eof-in-doctype'
1923 $this->token
['force-quirks'] = true;
1924 $this->emitToken($this->token
);
1925 $this->stream
->unget();
1928 /* Parse error. Set the DOCTYPE token's force-quirks flag
1929 to on. Switch to the bogus DOCTYPE state. */
1930 $this->emitToken(array(
1931 'type' => self
::PARSEERROR
,
1932 'data' => 'unexpected-char-in-doctype'
1934 $this->token
['force-quirks'] = true;
1935 $state = 'bogus DOCTYPE';
1939 case 'DOCTYPE system identifier (double-quoted)':
1940 /* Consume the next input character: */
1941 $char = $this->stream
->char();
1943 if ($char === '"') {
1944 /* U+0022 QUOTATION MARK (")
1945 Switch to the after DOCTYPE system identifier state. */
1946 $state = 'after DOCTYPE system identifier';
1947 } elseif ($char === '>') {
1948 /* U+003E GREATER-THAN SIGN (>)
1949 Parse error. Set the DOCTYPE token's force-quirks flag
1950 to on. Emit that DOCTYPE token. Switch to the data state. */
1951 $this->emitToken(array(
1952 'type' => self
::PARSEERROR
,
1953 'data' => 'unexpected-end-of-doctype'
1955 $this->token
['force-quirks'] = true;
1956 $this->emitToken($this->token
);
1958 } elseif ($char === false) {
1960 Parse error. Set the DOCTYPE token's force-quirks flag
1961 to on. Emit that DOCTYPE token. Reconsume the EOF
1962 character in the data state. */
1963 $this->emitToken(array(
1964 'type' => self
::PARSEERROR
,
1965 'data' => 'eof-in-doctype'
1967 $this->token
['force-quirks'] = true;
1968 $this->emitToken($this->token
);
1969 $this->stream
->unget();
1973 Append the current input character to the current
1974 DOCTYPE token's system identifier. Stay in the DOCTYPE
1975 system identifier (double-quoted) state. */
1976 $this->token
['system'] .= $char;
1980 case 'DOCTYPE system identifier (single-quoted)':
1981 /* Consume the next input character: */
1982 $char = $this->stream
->char();
1984 if ($char === "'") {
1985 /* U+0027 APOSTROPHE (')
1986 Switch to the after DOCTYPE system identifier state. */
1987 $state = 'after DOCTYPE system identifier';
1988 } elseif ($char === '>') {
1989 /* U+003E GREATER-THAN SIGN (>)
1990 Parse error. Set the DOCTYPE token's force-quirks flag
1991 to on. Emit that DOCTYPE token. Switch to the data state. */
1992 $this->emitToken(array(
1993 'type' => self
::PARSEERROR
,
1994 'data' => 'unexpected-end-of-doctype'
1996 $this->token
['force-quirks'] = true;
1997 $this->emitToken($this->token
);
1999 } elseif ($char === false) {
2001 Parse error. Set the DOCTYPE token's force-quirks flag
2002 to on. Emit that DOCTYPE token. Reconsume the EOF
2003 character in the data state. */
2004 $this->emitToken(array(
2005 'type' => self
::PARSEERROR
,
2006 'data' => 'eof-in-doctype'
2008 $this->token
['force-quirks'] = true;
2009 $this->emitToken($this->token
);
2010 $this->stream
->unget();
2014 Append the current input character to the current
2015 DOCTYPE token's system identifier. Stay in the DOCTYPE
2016 system identifier (single-quoted) state. */
2017 $this->token
['system'] .= $char;
2021 case 'after DOCTYPE system identifier':
2022 /* Consume the next input character: */
2023 $char = $this->stream
->char();
2025 if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2026 /* U+0009 CHARACTER TABULATION
2027 U+000A LINE FEED (LF)
2028 U+000C FORM FEED (FF)
2030 Stay in the after DOCTYPE system identifier state. */
2031 } elseif ($char === '>') {
2032 /* U+003E GREATER-THAN SIGN (>)
2033 Emit the current DOCTYPE token. Switch to the data state. */
2034 $this->emitToken($this->token
);
2036 } elseif ($char === false) {
2037 /* Parse error. Set the DOCTYPE token's force-quirks
2038 flag to on. Emit that DOCTYPE token. Reconsume the EOF
2039 character in the data state. */
2040 $this->emitToken(array(
2041 'type' => self
::PARSEERROR
,
2042 'data' => 'eof-in-doctype'
2044 $this->token
['force-quirks'] = true;
2045 $this->emitToken($this->token
);
2046 $this->stream
->unget();
2050 Parse error. Switch to the bogus DOCTYPE state.
2051 (This does not set the DOCTYPE token's force-quirks
2053 $this->emitToken(array(
2054 'type' => self
::PARSEERROR
,
2055 'data' => 'unexpected-char-in-doctype'
2057 $state = 'bogus DOCTYPE';
2061 case 'bogus DOCTYPE':
2062 /* Consume the next input character: */
2063 $char = $this->stream
->char();
2065 if ($char === '>') {
2066 /* U+003E GREATER-THAN SIGN (>)
2067 Emit the DOCTYPE token. Switch to the data state. */
2068 $this->emitToken($this->token
);
2071 } elseif($char === false) {
2073 Emit the DOCTYPE token. Reconsume the EOF character in
2075 $this->emitToken($this->token
);
2076 $this->stream
->unget();
2081 Stay in the bogus DOCTYPE state. */
2085 // case 'cdataSection':
/**
 * Hands back the result of the tree builder's own save() call, i.e. the
 * serialized representation of the tree that this tokenizer feeds.
 */
public function save() {
    $builder = $this->tree;
    return $builder->save();
}
/**
 * Accessor for the input stream object held by this tokenizer.
 */
public function stream() {
    $input = $this->stream;
    return $input;
}
2105 private function consumeCharacterReference($allowed = false, $inattr = false) {
2106 // This goes quite far against spec, and is far closer to the Python
2107 // impl., mainly because we don't do the large unconsuming the spec
2110 // All consumed characters.
2111 $chars = $this->stream
->char();
2113 /* This section defines how to consume a character
2114 reference. This definition is used when parsing character
2115 references in text and in attributes.
2117 The behavior depends on the identity of the next character
2118 (the one immediately after the U+0026 AMPERSAND character): */
2121 $chars[0] === "\x09" ||
2122 $chars[0] === "\x0A" ||
2123 $chars[0] === "\x0C" ||
2124 $chars[0] === "\x20" ||
2125 $chars[0] === '<' ||
2126 $chars[0] === '&' ||
2128 $chars[0] === $allowed
2130 /* U+0009 CHARACTER TABULATION
2131 U+000A LINE FEED (LF)
2132 U+000C FORM FEED (FF)
2134 U+003C LESS-THAN SIGN
2137 The additional allowed character, if there is one
2138 Not a character reference. No characters are consumed,
2139 and nothing is returned. (This is not an error, either.) */
2140 // We already consumed, so unconsume.
2141 $this->stream
->unget();
2143 } elseif ($chars[0] === '#') {
2144 /* Consume the U+0023 NUMBER SIGN. */
2145 // Um, yeah, we already did that.
2146 /* The behavior further depends on the character after
2147 the U+0023 NUMBER SIGN: */
2148 $chars .= $this->stream
->char();
2149 if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2150 /* U+0078 LATIN SMALL LETTER X
2151 U+0058 LATIN CAPITAL LETTER X */
2152 /* Consume the X. */
2153 // Um, yeah, we already did that.
2154 /* Follow the steps below, but using the range of
2155 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2156 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2157 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2158 A, through to U+0046 LATIN CAPITAL LETTER F (in other
2159 words, 0123456789, ABCDEF, abcdef). */
2160 $char_class = self
::HEX
;
2161 /* When it comes to interpreting the
2162 number, interpret it as a hexadecimal number. */
2166 // Unconsume because we shouldn't have consumed this.
2168 $this->stream
->unget();
2169 /* Follow the steps below, but using the range of
2170 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2171 NINE (i.e. just 0123456789). */
2172 $char_class = self
::DIGIT
;
2173 /* When it comes to interpreting the number,
2174 interpret it as a decimal number. */
2178 /* Consume as many characters as match the range of characters given above. */
2179 $consumed = $this->stream
->charsWhile($char_class);
2180 if ($consumed === '' || $consumed === false) {
2181 /* If no characters match the range, then don't consume
2182 any characters (and unconsume the U+0023 NUMBER SIGN
2183 character and, if appropriate, the X character). This
2184 is a parse error; nothing is returned. */
2185 $this->emitToken(array(
2186 'type' => self
::PARSEERROR
,
2187 'data' => 'expected-numeric-entity'
2189 return '&' . $chars;
2191 /* Otherwise, if the next character is a U+003B SEMICOLON,
2192 consume that too. If it isn't, there is a parse error. */
2193 if ($this->stream
->char() !== ';') {
2194 $this->stream
->unget();
2195 $this->emitToken(array(
2196 'type' => self
::PARSEERROR
,
2197 'data' => 'numeric-entity-without-semicolon'
2201 /* If one or more characters match the range, then take
2202 them all and interpret the string of characters as a number
2203 (either hexadecimal or decimal as appropriate). */
2204 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2206 /* If that number is one of the numbers in the first column
2207 of the following table, then this is a parse error. Find the
2208 row with that number in the first column, and return a
2209 character token for the Unicode character given in the
2210 second column of that row. */
2211 $new_codepoint = HTML5_Data
::getRealCodepoint($codepoint);
2212 if ($new_codepoint) {
2213 $this->emitToken(array(
2214 'type' => self
::PARSEERROR
,
2215 'data' => 'illegal-windows-1252-entity'
2217 return HTML5_Data
::utf8chr($new_codepoint);
2219 /* Otherwise, if the number is greater than 0x10FFFF, then
2220 * this is a parse error. Return a U+FFFD REPLACEMENT
2222 if ($codepoint > 0x10FFFF) {
2223 $this->emitToken(array(
2224 'type' => self
::PARSEERROR
,
2225 'data' => 'overlong-character-entity' // XXX probably not correct
2227 return "\xEF\xBF\xBD";
2229 /* Otherwise, return a character token for the Unicode
2230 * character whose code point is that number. If the
2231 * number is in the range 0x0001 to 0x0008, 0x000E to
2232 * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2233 * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2234 * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2235 * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2236 * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2237 * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2238 * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2239 * or 0x10FFFF, then this is a parse error. */
2240 // && has higher precedence than ||
2242 $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2243 $codepoint === 0x000B ||
2244 $codepoint >= 0x000E && $codepoint <= 0x001F ||
2245 $codepoint >= 0x007F && $codepoint <= 0x009F ||
2246 $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2247 $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2248 ($codepoint & 0xFFFE) === 0xFFFE ||
2249 $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2251 $this->emitToken(array(
2252 'type' => self
::PARSEERROR
,
2253 'data' => 'illegal-codepoint-for-numeric-entity'
2256 return HTML5_Data
::utf8chr($codepoint);
2263 /* Consume the maximum number of characters possible,
2264 with the consumed characters matching one of the
2265 identifiers in the first column of the named character
2266 references table (in a case-sensitive manner). */
2267 // What we actually do here is consume as much as we can while it
2268 // matches the start of one of the identifiers in the first column.
2270 $refs = HTML5_Data
::getNamedCharacterReferences();
2272 // Get the longest string which is the start of an identifier
2273 // ($chars) as well as the longest identifier which matches ($id)
2274 // and its codepoint ($codepoint).
2277 while ($char !== false && isset($refs[$char])) {
2278 $refs = $refs[$char];
2279 if (isset($refs['codepoint'])) {
2281 $codepoint = $refs['codepoint'];
2283 $chars .= $char = $this->stream
->char();
2286 // Unconsume the one character we just took which caused the while
2287 // statement to fail. This could be anything and could cause state
2288 // changes (as if it matches the while loop it must be
2289 // alphanumeric so we can just concat it to whatever we get later).
2290 $this->stream
->unget();
2291 if ($char !== false) {
2292 $chars = substr($chars, 0, -1);
2295 /* If no match can be made, then this is a parse error.
2296 No characters are consumed, and nothing is returned. */
2298 $this->emitToken(array(
2299 'type' => self
::PARSEERROR
,
2300 'data' => 'expected-named-entity'
2302 return '&' . $chars;
2305 /* If the last character matched is not a U+003B SEMICOLON
2306 (;), there is a parse error. */
2308 if (substr($id, -1) !== ';') {
2309 $this->emitToken(array(
2310 'type' => self
::PARSEERROR
,
2311 'data' => 'named-entity-without-semicolon'
2316 /* If the character reference is being consumed as part of
2317 an attribute, and the last character matched is not a
2318 U+003B SEMICOLON (;), and the next character is in the
2319 range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2320 LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2321 or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2322 then, for historical reasons, all the characters that were
2323 matched after the U+0026 AMPERSAND (&) must be unconsumed,
2324 and nothing is returned. */
2325 if ($inattr && !$semicolon) {
2326 // The next character is either the next character in $chars or in the stream.
2327 if (strlen($chars) > strlen($id)) {
2328 $next = substr($chars, strlen($id), 1);
2330 $next = $this->stream
->char();
2331 $this->stream
->unget();
2334 '0' <= $next && $next <= '9' ||
2335 'A' <= $next && $next <= 'Z' ||
2336 'a' <= $next && $next <= 'z'
2338 return '&' . $chars;
2342 /* Otherwise, return a character token for the character
2343 corresponding to the character reference name (as given
2344 by the second column of the named character references table). */
2345 return HTML5_Data
::utf8chr($codepoint) . substr($chars, strlen($id));
/**
 * Handles a character reference met inside an attribute value and
 * appends the outcome to the value of the attribute currently being
 * built on $this->token.
 *
 * @param mixed $allowed Forwarded unchanged as the first argument of
 *                       consumeCharacterReference(); presumably the
 *                       "additional allowed character" -- confirm
 *                       against that method.
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // Only false, null and the empty string are treated as "nothing
    // returned". A plain truthiness test would be wrong here: the
    // one-character string "0" is falsy in PHP, yet it is a legitimate
    // character to append (presumably reachable via a numeric
    // reference such as &#48; -- confirm against
    // consumeCharacterReference()).
    $char = ($entity === false || $entity === null || $entity === '')
        ? '&'
        : $entity;

    // The attribute under construction is the last entry in the list.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when were switched into this state. */
    // This method performs no state change itself; it simply returns.
}
2370 * Emits a token, passing it on to the tree builder.
2372 protected function emitToken($token, $checkStream = true, $dry = false) {
2374 // Emit errors from input stream.
2375 while ($this->stream
->errors
) {
2376 $this->emitToken(array_shift($this->stream
->errors
), false);
2379 if($token['type'] === self
::ENDTAG
&& !empty($token['attr'])) {
2380 for ($i = 0; $i < count($token['attr']); $i++
) {
2381 $this->emitToken(array(
2382 'type' => self
::PARSEERROR
,
2383 'data' => 'attributes-in-end-tag'
2387 if($token['type'] === self
::ENDTAG
&& !empty($token['self-closing'])) {
2388 $this->emitToken(array(
2389 'type' => self
::PARSEERROR
,
2390 'data' => 'self-closing-flag-on-end-tag',
2393 if($token['type'] === self
::STARTTAG
) {
2394 // This could be changed to actually pass the tree-builder a hash
2396 foreach ($token['attr'] as $keypair) {
2397 if (isset($hash[$keypair['name']])) {
2398 $this->emitToken(array(
2399 'type' => self
::PARSEERROR
,
2400 'data' => 'duplicate-attribute',
2403 $hash[$keypair['name']] = $keypair['value'];
2409 // the current structure of attributes is not a terribly good one
2410 $this->tree
->emitToken($token);
2413 if(!$dry && is_int($this->tree
->content_model
)) {
2414 $this->content_model
= $this->tree
->content_model
;
2415 $this->tree
->content_model
= null;
2417 } elseif($token['type'] === self
::ENDTAG
) {
2418 $this->content_model
= self
::PCDATA
;