inc/3rdparty/libraries/html5/Tokenizer.php

   1 <?php
   2
   3 /*
   4
   5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
   6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
   7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a
  10 copy of this software and associated documentation files (the
  11 "Software"), to deal in the Software without restriction, including
  12 without limitation the rights to use, copy, modify, merge, publish,
  13 distribute, sublicense, and/or sell copies of the Software, and to
  14 permit persons to whom the Software is furnished to do so, subject to
  15 the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included
  18 in all copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27
  28 */
  29
  30 // Some conventions:
  31 // /* */ indicates verbatim text from the HTML 5 specification
  32 // // indicates regular comments
  33
  34 // all flags are in hyphenated form
  35
  36 class HTML5_Tokenizer {
   37     /**
   38      * HTML5_InputStream wrapping the data given to the constructor.
   39      */
   40     protected $stream;
   41
   42     /**
   43      * Tree builder that the tokenizer emits tokens to.
   44      */
   45     private $tree;
   46
   47     /**
   48      * Current content model we are parsing as (PCDATA, RCDATA, CDATA or PLAINTEXT).
   49      */
   50     protected $content_model;
   51
   52     /**
   53      * Current token that is being built, but not yet emitted. Also
   54      * is the last token emitted, if applicable.
   55      */
   56     protected $token;
   57
   58     // These are constants describing the content model ($content_model)
   59     const PCDATA    = 0;
   60     const RCDATA    = 1;
   61     const CDATA     = 2;
   62     const PLAINTEXT = 3;
   63
   64     // These are constants describing tokens (the 'type' key of a token array)
   65     // XXX should probably be moved somewhere else, probably the
   66     // HTML5 class.
   67     const DOCTYPE        = 0;
   68     const STARTTAG       = 1;
   69     const ENDTAG         = 2;
   70     const COMMENT        = 3;
   71     const CHARACTER      = 4;
   72     const SPACECHARACTER = 5;
   73     const EOF            = 6;
   74     const PARSEERROR     = 7;
   75
   76     // These are constants listing sets of characters, e.g. for charsWhile().
   77     const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
   78     const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
   79     const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
   80     const DIGIT       = '0123456789';
   81     const HEX         = '0123456789ABCDEFabcdef';
   82     const WHITESPACE  = "\t\n\x0c ";
  83
  84     /**
  85      * @param $data Data to parse
  86      */
  87     public function __construct($data, $builder = null) {
  88         $this->stream = new HTML5_InputStream($data);
  89         if (!$builder) $this->tree = new HTML5_TreeBuilder;
  90         else $this->tree = $builder;
  91         $this->content_model = self::PCDATA;
  92     }
  93
  94     public function parseFragment($context = null) {
  95         $this->tree->setupContext($context);
  96         if ($this->tree->content_model) {
  97             $this->content_model = $this->tree->content_model;
  98             $this->tree->content_model = null;
  99         }
 100         $this->parse();
 101     }
 102
 103     // XXX maybe convert this into an iterator? regardless, this function
 104     // and the save function should go into a Parser facade of some sort
 105     /**
 106      * Performs the actual parsing of the document.
 107      */
 108     public function parse() {
 109         // Current state
 110         $state = 'data';
 111         // This is used to avoid having to have look-behind in the data state.
 112         $lastFourChars = '';
 113         /**
 114          * Escape flag as specified by the HTML5 specification: "used to
 115          * control the behavior of the tokeniser. It is either true or
 116          * false, and initially must be set to the false state."
 117          */
 118         $escape = false;
 119         //echo "\n\n";
 120         while($state !== null) {
 121
 122             /*echo $state . ' ';
 123             switch ($this->content_model) {
 124                 case self::PCDATA: echo 'PCDATA'; break;
 125                 case self::RCDATA: echo 'RCDATA'; break;
 126                 case self::CDATA: echo 'CDATA'; break;
 127                 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
 128             }
 129             if ($escape) echo " escape";
 130             echo "\n";*/
 131
 132             switch($state) {
 133                 case 'data':
 134
 135                     /* Consume the next input character */
 136                     $char = $this->stream->char();
 137                     $lastFourChars .= $char;
 138                     if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 139
 140                     // see below for meaning
 141                     $hyp_cond =
 142                         !$escape &&
 143                         (
 144                             $this->content_model === self::RCDATA ||
 145                             $this->content_model === self::CDATA
 146                         );
 147                     $amp_cond =
 148                         !$escape &&
 149                         (
 150                             $this->content_model === self::PCDATA ||
 151                             $this->content_model === self::RCDATA
 152                         );
 153                     $lt_cond =
 154                         $this->content_model === self::PCDATA ||
 155                         (
 156                             (
 157                                 $this->content_model === self::RCDATA ||
 158                                 $this->content_model === self::CDATA
 159                              ) &&
 160                              !$escape
 161                         );
 162                     $gt_cond =
 163                         $escape &&
 164                         (
 165                             $this->content_model === self::RCDATA ||
 166                             $this->content_model === self::CDATA
 167                         );
 168
 169                     if($char === '&' && $amp_cond) {
 170                         /* U+0026 AMPERSAND (&)
 171                         When the content model flag is set to one of the PCDATA or RCDATA
 172                         states and the escape flag is false: switch to the
 173                         character reference data state. Otherwise: treat it as per
 174                         the "anything else" entry below. */
 175                         $state = 'character reference data';
 176
 177                     } elseif(
 178                         $char === '-' &&
 179                         $hyp_cond &&
 180                         $lastFourChars === '<!--'
 181                     ) {
 182                         /*
 183                         U+002D HYPHEN-MINUS (-)
 184                         If the content model flag is set to either the RCDATA state or
 185                         the CDATA state, and the escape flag is false, and there are at
 186                         least three characters before this one in the input stream, and the
 187                         last four characters in the input stream, including this one, are
 188                         U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 189                         and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 190                         $escape = true;
 191
 192                         /* In any case, emit the input character as a character token. Stay
 193                         in the data state. */
 194                         $this->emitToken(array(
 195                             'type' => self::CHARACTER,
 196                             'data' => '-'
 197                         ));
 198                         // We do the "any case" part as part of "anything else".
 199
 200                     /* U+003C LESS-THAN SIGN (<) */
 201                     } elseif($char === '<' && $lt_cond) {
 202                         /* When the content model flag is set to the PCDATA state: switch
 203                         to the tag open state.
 204
 205                         When the content model flag is set to either the RCDATA state or
 206                         the CDATA state and the escape flag is false: switch to the tag
 207                         open state.
 208
 209                         Otherwise: treat it as per the "anything else" entry below. */
 210                         $state = 'tag open';
 211
 212                     /* U+003E GREATER-THAN SIGN (>) */
 213                     } elseif(
 214                         $char === '>' &&
 215                         $gt_cond &&
 216                         substr($lastFourChars, 1) === '-->'
 217                     ) {
 218                         /* If the content model flag is set to either the RCDATA state or
 219                         the CDATA state, and the escape flag is true, and the last three
 220                         characters in the input stream including this one are U+002D
 221                         HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 222                         set the escape flag to false. */
 223                         $escape = false;
 224
 225                         /* In any case, emit the input character as a character token.
 226                         Stay in the data state. */
 227                         $this->emitToken(array(
 228                             'type' => self::CHARACTER,
 229                             'data' => '>'
 230                         ));
 231                         // We do the "any case" part as part of "anything else".
 232
 233                     } elseif($char === false) {
 234                         /* EOF
 235                         Emit an end-of-file token. */
 236                         $state = null;
 237                         $this->tree->emitToken(array(
 238                             'type' => self::EOF
 239                         ));
 240
 241                     } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 242                         // Directly after emitting a token you switch back to the "data
 243                         // state". At that point spaceCharacters are important so they are
 244                         // emitted separately.
 245                         $chars = $this->stream->charsWhile(self::WHITESPACE);
 246                         $this->emitToken(array(
 247                             'type' => self::SPACECHARACTER,
 248                             'data' => $char . $chars
 249                         ));
 250                         $lastFourChars .= $chars;
 251                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 252
 253                     } else {
 254                         /* Anything else
  255                         THIS IS AN OPTIMIZATION: Get as many characters as possible that
  256                         would otherwise also be treated as character tokens and emit them
  257                         as a single character token. Stay in the data state. */
 258
 259                         $mask = '';
 260                         if ($hyp_cond) $mask .= '-';
 261                         if ($amp_cond) $mask .= '&';
 262                         if ($lt_cond)  $mask .= '<';
 263                         if ($gt_cond)  $mask .= '>';
 264
 265                         if ($mask === '') {
 266                             $chars = $this->stream->remainingChars();
 267                         } else {
 268                             $chars = $this->stream->charsUntil($mask);
 269                         }
 270
 271                         $this->emitToken(array(
 272                             'type' => self::CHARACTER,
 273                             'data' => $char . $chars
 274                         ));
 275
 276                         $lastFourChars .= $chars;
 277                         if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
 278
 279                         $state = 'data';
 280                     }
 281                 break;
 282
 283                 case 'character reference data':
 284                     /* (This cannot happen if the content model flag
 285                     is set to the CDATA state.) */
 286
 287                     /* Attempt to consume a character reference, with no
 288                     additional allowed character. */
 289                     $entity = $this->consumeCharacterReference();
 290
 291                     /* If nothing is returned, emit a U+0026 AMPERSAND
 292                     character token. Otherwise, emit the character token that
 293                     was returned. */
 294                     // This is all done when consuming the character reference.
 295                     $this->emitToken(array(
 296                         'type' => self::CHARACTER,
 297                         'data' => $entity
 298                     ));
 299
 300                     /* Finally, switch to the data state. */
 301                     $state = 'data';
 302                 break;
 303
 304                 case 'tag open':
 305                     $char = $this->stream->char();
 306
 307                     switch($this->content_model) {
 308                         case self::RCDATA:
 309                         case self::CDATA:
 310                             /* Consume the next input character. If it is a
 311                             U+002F SOLIDUS (/) character, switch to the close
 312                             tag open state. Otherwise, emit a U+003C LESS-THAN
 313                             SIGN character token and reconsume the current input
 314                             character in the data state. */
 315                             // We consumed above.
 316
 317                             if($char === '/') {
 318                                 $state = 'close tag open';
 319
 320                             } else {
 321                                 $this->emitToken(array(
 322                                     'type' => self::CHARACTER,
 323                                     'data' => '<'
 324                                 ));
 325
 326                                 $this->stream->unget();
 327
 328                                 $state = 'data';
 329                             }
 330                         break;
 331
 332                         case self::PCDATA:
 333                             /* If the content model flag is set to the PCDATA state
 334                             Consume the next input character: */
 335                             // We consumed above.
 336
 337                             if($char === '!') {
 338                                 /* U+0021 EXCLAMATION MARK (!)
 339                                 Switch to the markup declaration open state. */
 340                                 $state = 'markup declaration open';
 341
 342                             } elseif($char === '/') {
 343                                 /* U+002F SOLIDUS (/)
 344                                 Switch to the close tag open state. */
 345                                 $state = 'close tag open';
 346
 347                             } elseif('A' <= $char && $char <= 'Z') {
 348                                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 349                                 Create a new start tag token, set its tag name to the lowercase
 350                                 version of the input character (add 0x0020 to the character's code
 351                                 point), then switch to the tag name state. (Don't emit the token
 352                                 yet; further details will be filled in before it is emitted.) */
 353                                 $this->token = array(
 354                                     'name'  => strtolower($char),
 355                                     'type'  => self::STARTTAG,
 356                                     'attr'  => array()
 357                                 );
 358
 359                                 $state = 'tag name';
 360
 361                             } elseif('a' <= $char && $char <= 'z') {
 362                                 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 363                                 Create a new start tag token, set its tag name to the input
 364                                 character, then switch to the tag name state. (Don't emit
 365                                 the token yet; further details will be filled in before it
 366                                 is emitted.) */
 367                                 $this->token = array(
 368                                     'name'  => $char,
 369                                     'type'  => self::STARTTAG,
 370                                     'attr'  => array()
 371                                 );
 372
 373                                 $state = 'tag name';
 374
 375                             } elseif($char === '>') {
 376                                 /* U+003E GREATER-THAN SIGN (>)
 377                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 378                                 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 379                                 $this->emitToken(array(
 380                                     'type' => self::PARSEERROR,
 381                                     'data' => 'expected-tag-name-but-got-right-bracket'
 382                                 ));
 383                                 $this->emitToken(array(
 384                                     'type' => self::CHARACTER,
 385                                     'data' => '<>'
 386                                 ));
 387
 388                                 $state = 'data';
 389
 390                             } elseif($char === '?') {
 391                                 /* U+003F QUESTION MARK (?)
 392                                 Parse error. Switch to the bogus comment state. */
 393                                 $this->emitToken(array(
 394                                     'type' => self::PARSEERROR,
 395                                     'data' => 'expected-tag-name-but-got-question-mark'
 396                                 ));
 397                                 $this->token = array(
 398                                     'data' => '?',
 399                                     'type' => self::COMMENT
 400                                 );
 401                                 $state = 'bogus comment';
 402
 403                             } else {
 404                                 /* Anything else
 405                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and
 406                                 reconsume the current input character in the data state. */
 407                                 $this->emitToken(array(
 408                                     'type' => self::PARSEERROR,
 409                                     'data' => 'expected-tag-name'
 410                                 ));
 411                                 $this->emitToken(array(
 412                                     'type' => self::CHARACTER,
 413                                     'data' => '<'
 414                                 ));
 415
 416                                 $state = 'data';
 417                                 $this->stream->unget();
 418                             }
 419                         break;
 420                     }
 421                 break;
 422
 423                 case 'close tag open':
 424                     if (
 425                         $this->content_model === self::RCDATA ||
 426                         $this->content_model === self::CDATA
 427                     ) {
 428                         /* If the content model flag is set to the RCDATA or CDATA
 429                         states... */
 430                         $name = strtolower($this->stream->charsWhile(self::ALPHA));
 431                         $following = $this->stream->char();
 432                         $this->stream->unget();
 433                         if (
 434                             !$this->token ||
 435                             $this->token['name'] !== $name ||
 436                             $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
 437                         ) {
 438                             /* if no start tag token has ever been emitted by this instance
 439                             of the tokenizer (fragment case), or, if the next few
 440                             characters do not match the tag name of the last start tag
 441                             token emitted (compared in an ASCII case-insensitive manner),
 442                             or if they do but they are not immediately followed by one of
 443                             the following characters:
 444
 445                                 * U+0009 CHARACTER TABULATION
 446                                 * U+000A LINE FEED (LF)
 447                                 * U+000C FORM FEED (FF)
 448                                 * U+0020 SPACE
 449                                 * U+003E GREATER-THAN SIGN (>)
 450                                 * U+002F SOLIDUS (/)
 451                                 * EOF
 452
 453                             ...then emit a U+003C LESS-THAN SIGN character token, a
 454                             U+002F SOLIDUS character token, and switch to the data
 455                             state to process the next input character. */
 456                             // XXX: Probably ought to replace in_array with $following === x ||...
 457
 458                             // We also need to emit $name now we've consumed that, as we
 459                             // know it'll just be emitted as a character token.
 460                             $this->emitToken(array(
 461                                 'type' => self::CHARACTER,
 462                                 'data' => '</' . $name
 463                             ));
 464
 465                             $state = 'data';
 466                         } else {
 467                             // This matches what would happen if we actually did the
 468                             // otherwise below (but we can't because we've consumed too
 469                             // much).
 470
 471                             // Start the end tag token with the name we already have.
 472                             $this->token = array(
 473                                 'name'  => $name,
 474                                 'type'  => self::ENDTAG
 475                             );
 476
 477                             // Change to tag name state.
 478                             $state = 'tag name';
 479                         }
 480                     } elseif ($this->content_model === self::PCDATA) {
 481                         /* Otherwise, if the content model flag is set to the PCDATA
 482                         state [...]: */
 483                         $char = $this->stream->char();
 484
 485                         if ('A' <= $char && $char <= 'Z') {
 486                             /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 487                             Create a new end tag token, set its tag name to the lowercase version
 488                             of the input character (add 0x0020 to the character's code point), then
 489                             switch to the tag name state. (Don't emit the token yet; further details
 490                             will be filled in before it is emitted.) */
 491                             $this->token = array(
 492                                 'name'  => strtolower($char),
 493                                 'type'  => self::ENDTAG
 494                             );
 495
 496                             $state = 'tag name';
 497
 498                         } elseif ('a' <= $char && $char <= 'z') {
 499                             /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 500                             Create a new end tag token, set its tag name to the
 501                             input character, then switch to the tag name state.
 502                             (Don't emit the token yet; further details will be
 503                             filled in before it is emitted.) */
 504                             $this->token = array(
 505                                 'name'  => $char,
 506                                 'type'  => self::ENDTAG
 507                             );
 508
 509                             $state = 'tag name';
 510
 511                         } elseif($char === '>') {
 512                             /* U+003E GREATER-THAN SIGN (>)
 513                             Parse error. Switch to the data state. */
 514                             $this->emitToken(array(
 515                                 'type' => self::PARSEERROR,
 516                                 'data' => 'expected-closing-tag-but-got-right-bracket'
 517                             ));
 518                             $state = 'data';
 519
 520                         } elseif($char === false) {
 521                             /* EOF
 522                             Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 523                             SOLIDUS character token. Reconsume the EOF character in the data state. */
 524                             $this->emitToken(array(
 525                                 'type' => self::PARSEERROR,
 526                                 'data' => 'expected-closing-tag-but-got-eof'
 527                             ));
 528                             $this->emitToken(array(
 529                                 'type' => self::CHARACTER,
 530                                 'data' => '</'
 531                             ));
 532
 533                             $this->stream->unget();
 534                             $state = 'data';
 535
 536                         } else {
 537                             /* Parse error. Switch to the bogus comment state. */
 538                             $this->emitToken(array(
 539                                 'type' => self::PARSEERROR,
 540                                 'data' => 'expected-closing-tag-but-got-char'
 541                             ));
 542                             $this->token = array(
 543                                 'data' => $char,
 544                                 'type' => self::COMMENT
 545                             );
 546                             $state = 'bogus comment';
 547                         }
 548                     }
 549                 break;
 550
 551                 case 'tag name':
 552                     /* Consume the next input character: */
 553                     $char = $this->stream->char();
 554
 555                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 556                         /* U+0009 CHARACTER TABULATION
 557                         U+000A LINE FEED (LF)
 558                         U+000C FORM FEED (FF)
 559                         U+0020 SPACE
 560                         Switch to the before attribute name state. */
 561                         $state = 'before attribute name';
 562
 563                     } elseif($char === '/') {
 564                         /* U+002F SOLIDUS (/)
 565                         Switch to the self-closing start tag state. */
 566                         $state = 'self-closing start tag';
 567
 568                     } elseif($char === '>') {
 569                         /* U+003E GREATER-THAN SIGN (>)
 570                         Emit the current tag token. Switch to the data state. */
 571                         $this->emitToken($this->token);
 572                         $state = 'data';
 573
 574                     } elseif('A' <= $char && $char <= 'Z') {
 575                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 576                         Append the lowercase version of the current input
 577                         character (add 0x0020 to the character's code point) to
 578                         the current tag token's tag name. Stay in the tag name state. */
 579                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 580
 581                         $this->token['name'] .= strtolower($char . $chars);
 582                         $state = 'tag name';
 583
 584                     } elseif($char === false) {
 585                         /* EOF
 586                         Parse error. Reconsume the EOF character in the data state. */
 587                         $this->emitToken(array(
 588                             'type' => self::PARSEERROR,
 589                             'data' => 'eof-in-tag-name'
 590                         ));
 591
 592                         $this->stream->unget();
 593                         $state = 'data';
 594
 595                     } else {
 596                         /* Anything else
 597                         Append the current input character to the current tag token's tag name.
 598                         Stay in the tag name state. */
 599                         $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
 600
 601                         $this->token['name'] .= $char . $chars;
 602                         $state = 'tag name';
 603                     }
 604                 break;
 605
 606                 case 'before attribute name':
 607                     // Decides whether another attribute starts at the current position.
 608                     // Transitions (one character is consumed per pass):
 609                     //   EOF                 -> parse error, EOF is reconsumed in 'data'
 610                     //   space, LF, tab, FF  -> skipped, the state stays the same
 611                     //   "/"                 -> 'self-closing start tag'
 612                     //   ">"                 -> the tag token is emitted, then 'data'
 613                     //   anything else       -> first character of a new attribute,
 614                     //                          then 'attribute name'
 615                     $char = $this->stream->char();
 616
 617                     if($char === false) {
 618                         // Input ended inside the tag. Report it, then push the EOF
 619                         // back so that the data state reads it again.
 620                         $this->emitToken(array(
 621                             'type' => self::PARSEERROR,
 622                             'data' => 'expected-attribute-name-but-got-eof'
 623                         ));
 624
 625                         $this->stream->unget();
 626                         $state = 'data';
 627                         break;
 628                     }
 629
 630                     if($char === ' ' || $char === "\n" || $char === "\t" || $char === "\x0c") {
 631                         // Whitespace in front of an attribute is simply skipped.
 632                         $state = 'before attribute name';
 633                         break;
 634                     }
 635
 636                     if($char === '/') {
 637                         // A solidus may be the start of the "/>" terminator.
 638                         $state = 'self-closing start tag';
 639                         break;
 640                     }
 641
 642                     if($char === '>') {
 643                         // The tag is complete: emit it and resume in the data state.
 644                         $this->emitToken($this->token);
 645                         $state = 'data';
 646                         break;
 647                     }
 648
 649                     if('A' <= $char && $char <= 'Z') {
 650                         // An uppercase ASCII letter opens a new attribute; the name
 651                         // is stored in lowercase and the value starts out empty.
 652                         $this->token['attr'][] = array(
 653                             'name'  => strtolower($char),
 654                             'value' => ''
 655                         );
 656
 657                         $state = 'attribute name';
 658                         break;
 659                     }
 660
 661                     // Any remaining character also opens a new attribute and is kept
 662                     // exactly as read. A double quote, apostrophe, less-than sign or
 663                     // equals sign is accepted as well, but a parse error is reported
 664                     // for it before the attribute is created.
 665                     if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
 666                         $this->emitToken(array(
 667                             'type' => self::PARSEERROR,
 668                             'data' => 'invalid-character-in-attribute-name'
 669                         ));
 670                     }
 671
 672                     // The value starts out empty; the attribute value states append
 673                     // to it later on.
 674                     $this->token['attr'][] = array(
 675                         'name'  => $char,
 676                         'value' => ''
 677                     );
 678
 679                     $state = 'attribute name';
 680                 break;
 681
 682                 case 'attribute name':
 683                     // Collects the attribute's name. Consume the next input character:
 684                     $char = $this->stream->char();
 685
 686                     // this conditional is optimized, check bottom
 687                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 688                         /* U+0009 CHARACTER TABULATION
 689                         U+000A LINE FEED (LF)
 690                         U+000C FORM FEED (FF)
 691                         U+0020 SPACE
 692                         Switch to the after attribute name state. */
 693                         $state = 'after attribute name';
 694
 695                     } elseif($char === '/') {
 696                         /* U+002F SOLIDUS (/)
 697                         Switch to the self-closing start tag state. */
 698                         $state = 'self-closing start tag';
 699
 700                     } elseif($char === '=') {
 701                         /* U+003D EQUALS SIGN (=)
 702                         Switch to the before attribute value state. */
 703                         $state = 'before attribute value';
 704
 705                     } elseif($char === '>') {
 706                         /* U+003E GREATER-THAN SIGN (>)
 707                         Emit the current tag token. Switch to the data state. */
 708                         $this->emitToken($this->token);
 709                         $state = 'data';
 710
 711                     } elseif('A' <= $char && $char <= 'Z') {
 712                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 713                         Append the lowercase version of the current input
 714                         character (add 0x0020 to the character's code point) to
 715                         the current attribute's name. Stay in the attribute name
 716                         state. */
 717                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 718
 719                         $last = count($this->token['attr']) - 1;
 720                         $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
 721
 722                         $state = 'attribute name';
 723
 724                     } elseif($char === false) {
 725                         /* EOF
 726                         Parse error. Reconsume the EOF character in the data state. */
 727                         $this->emitToken(array(
 728                             'type' => self::PARSEERROR,
 729                             'data' => 'eof-in-attribute-name'
 730                         ));
 731
 732                         $this->stream->unget();
 733                         $state = 'data';
 734
 735                     } else {
 736                         /* U+0022 QUOTATION MARK (")
 737                            U+0027 APOSTROPHE (')
 738                            U+003C LESS-THAN SIGN (<)
 739                         Parse error. Treat it as per the "anything else"
 740                         entry below. */
 741                         if($char === '"' || $char === "'" || $char === '<') {
 742                             $this->emitToken(array(
 743                                 'type' => self::PARSEERROR,
 744                                 'data' => 'invalid-character-in-attribute-name'
 745                             ));
 746                         }
 747
 748                         /* Anything else
 749                         Append the current input character to the current attribute's name.
 750                         Stay in the attribute name state. */
 751                         // "<" also ends the run, so each "<" is consumed alone and reported.
 752                         $chars = $this->stream->charsUntil("\t\n\x0C /=<>\"'" . self::UPPER_ALPHA);
 753                         $last = count($this->token['attr']) - 1;
 754                         $this->token['attr'][$last]['name'] .= $char . $chars;
 755
 756                         $state = 'attribute name';
 757                     }
 758
 759                     /* When the user agent leaves the attribute name state
 760                     (and before emitting the tag token, if appropriate), the
 761                     complete attribute's name must be compared to the other
 762                     attributes on the same token; if there is already an
 763                     attribute on the token with the exact same name, then this
 764                     is a parse error and the new attribute must be dropped, along
 765                     with the value that gets associated with it (if any). */
 766                     // this might be implemented in the emitToken method
 767                 break;
 768
 769                 case 'after attribute name':
 770                     // Entered from 'attribute name' when whitespace follows the name;
 771                     // decides what comes next. Transitions (one character per pass):
 772                     //   EOF                 -> parse error, EOF is reconsumed in 'data'
 773                     //   space, LF, tab, FF  -> skipped, the state stays the same
 774                     //   "/"                 -> 'self-closing start tag'
 775                     //   "="                 -> 'before attribute value'
 776                     //   ">"                 -> the tag token is emitted, then 'data'
 777                     //   anything else       -> first character of a new attribute,
 778                     //                          then 'attribute name'
 779                     $char = $this->stream->char();
 780
 781                     // The first matching case wins, just as in an if/elseif chain.
 782                     switch(true) {
 783                         case $char === false:
 784                             // Input ended inside the tag. Report it, then push the
 785                             // EOF back so that the data state reads it again.
 786                             $this->emitToken(array(
 787                                 'type' => self::PARSEERROR,
 788                                 'data' => 'expected-end-of-tag-but-got-eof'
 789                             ));
 790
 791                             $this->stream->unget();
 792                             $state = 'data';
 793                         break;
 794
 795                         case $char === ' ' || $char === "\n" || $char === "\t" || $char === "\x0c":
 796                             // Further whitespace after the name is skipped.
 797                             $state = 'after attribute name';
 798                         break;
 799
 800                         case $char === '/':
 801                             // A solidus may be the start of the "/>" terminator.
 802                             $state = 'self-closing start tag';
 803                         break;
 804
 805                         case $char === '=':
 806                             // The name is complete and a value is expected next.
 807                             $state = 'before attribute value';
 808                         break;
 809
 810                         case $char === '>':
 811                             // The tag is complete: emit it and resume in 'data'.
 812                             $this->emitToken($this->token);
 813                             $state = 'data';
 814                         break;
 815
 816                         case 'A' <= $char && $char <= 'Z':
 817                             // An uppercase ASCII letter opens a new attribute; the
 818                             // name is stored in lowercase, the value starts empty.
 819                             $this->token['attr'][] = array(
 820                                 'name'  => strtolower($char),
 821                                 'value' => ''
 822                             );
 823
 824                             $state = 'attribute name';
 825                         break;
 826
 827                         default:
 828                             // Any remaining character also opens a new attribute and
 829                             // is kept exactly as read. A double quote, apostrophe or
 830                             // less-than sign is accepted as well, but a parse error
 831                             // is reported for it before the attribute is created.
 832                             if($char === '"' || $char === "'" || $char === '<') {
 833                                 $this->emitToken(array(
 834                                     'type' => self::PARSEERROR,
 835                                     'data' => 'invalid-character-after-attribute-name'
 836                                 ));
 837                             }
 838
 839                             $this->token['attr'][] = array(
 840                                 'name'  => $char,
 841                                 'value' => ''
 842                             );
 843
 844                             $state = 'attribute name';
 845                         break;
 846                     }
 847                 break;
 848
 849                 case 'before attribute value':
 850                     // Runs after the "=" of an attribute and picks the value syntax:
 851                     //   EOF                 -> parse error, EOF is reconsumed in 'data'
 852                     //   space, LF, tab, FF  -> skipped, the state stays the same
 853                     //   quote or apostrophe -> the matching quoted value state
 854                     //   "&"                 -> reconsumed in 'attribute value (unquoted)'
 855                     //   ">"                 -> parse error, tag token emitted, then 'data'
 856                     //   anything else       -> first character of an unquoted value
 857                     $char = $this->stream->char();
 858
 859                     if($char === false) {
 860                         // Input ended where a value was expected.
 861                         $this->emitToken(array(
 862                             'type' => self::PARSEERROR,
 863                             'data' => 'expected-attribute-value-but-got-eof'
 864                         ));
 865                         $this->stream->unget();
 866                         $state = 'data';
 867                         break;
 868                     }
 869
 870                     if($char === ' ' || $char === "\n" || $char === "\t" || $char === "\x0c") {
 871                         // Whitespace between "=" and the value is skipped.
 872                         $state = 'before attribute value';
 873                         break;
 874                     }
 875
 876                     if($char === '"') {
 877                         // Opening double quote.
 878                         $state = 'attribute value (double-quoted)';
 879                         break;
 880                     }
 881
 882                     if($char === "'") {
 883                         // Opening apostrophe.
 884                         $state = 'attribute value (single-quoted)';
 885                         break;
 886                     }
 887
 888                     if($char === '&') {
 889                         // Pushed back so that the unquoted value state handles it.
 890                         $this->stream->unget();
 891                         $state = 'attribute value (unquoted)';
 892                         break;
 893                     }
 894
 895                     if($char === '>') {
 896                         // The tag ends without a value: report it, emit the tag anyway.
 897                         $this->emitToken(array(
 898                             'type' => self::PARSEERROR,
 899                             'data' => 'expected-attribute-value-but-got-right-bracket'
 900                         ));
 901                         $this->emitToken($this->token);
 902                         $state = 'data';
 903                         break;
 904                     }
 905
 906                     // Everything else starts an unquoted value. "=" and "<" are kept
 907                     // as part of the value but are reported as a parse error first.
 908                     if($char === '=' || $char === '<') {
 909                         $this->emitToken(array(
 910                             'type' => self::PARSEERROR,
 911                             'data' => 'equals-in-unquoted-attribute-value'
 912                         ));
 913                     }
 914
 915                     $last = count($this->token['attr']) - 1;
 916                     $this->token['attr'][$last]['value'] .= $char;
 917                     $state = 'attribute value (unquoted)';
 918                 break;
 919
 920                 case 'attribute value (double-quoted)':
 921                     // Collects the value of an attribute written in double quotes:
 922                     //   EOF           -> parse error, EOF is reconsumed in 'data'
 923                     //   double quote  -> 'after attribute value (quoted)'
 924                     //   "&"           -> handed to characterReferenceInAttributeValue()
 925                     //   anything else -> appended to the current attribute's value
 926                     $char = $this->stream->char();
 927
 928                     switch(true) {
 929                         case $char === false:
 930                             // Input ended before the closing quote was seen.
 931                             $this->emitToken(array(
 932                                 'type' => self::PARSEERROR,
 933                                 'data' => 'eof-in-attribute-value-double-quote'
 934                             ));
 935                             $this->stream->unget();
 936                             $state = 'data';
 937                         break;
 938
 939                         case $char === '"':
 940                             // Closing quote: the value is complete.
 941                             $state = 'after attribute value (quoted)';
 942                         break;
 943
 944                         case $char === '&':
 945                             // The delimiting quote is passed along as the additional
 946                             // allowed character; $state is not assigned here.
 947                             $this->characterReferenceInAttributeValue('"');
 948                         break;
 949
 950                         default:
 951                             // Take the whole run up to the next quote or "&" at once.
 952                             $chars = $this->stream->charsUntil('"&');
 953                             $last = count($this->token['attr']) - 1;
 954                             $this->token['attr'][$last]['value'] .= $char . $chars;
 955                             $state = 'attribute value (double-quoted)';
 956                         break;
 957                     }
 958                 break;
 959
 960                 case 'attribute value (single-quoted)':
 961                     // Collects the value of an attribute written in apostrophes:
 962                     //   EOF           -> parse error, EOF is reconsumed in 'data'
 963                     //   apostrophe    -> 'after attribute value (quoted)'
 964                     //   "&"           -> handed to characterReferenceInAttributeValue()
 965                     //   anything else -> appended to the current attribute's value
 966                     $char = $this->stream->char();
 967
 968                     if($char === false) {
 969                         // Input ended before the closing apostrophe was seen.
 970                         $this->emitToken(array(
 971                             'type' => self::PARSEERROR,
 972                             'data' => 'eof-in-attribute-value-single-quote'
 973                         ));
 974                         $this->stream->unget();
 975                         $state = 'data';
 976                         break;
 977                     }
 978
 979                     if($char === "'") {
 980                         // Closing apostrophe (U+0027): the value is complete.
 981                         $state = 'after attribute value (quoted)';
 982                         break;
 983                     }
 984
 985                     if($char === '&') {
 986                         // The apostrophe delimits this value; $state is not assigned here.
 987                         $this->characterReferenceInAttributeValue("'");
 988                         break;
 989                     }
 990
 991                     // Take the whole run up to the next apostrophe or "&" at once.
 992                     $chars = $this->stream->charsUntil("'&");
 993                     $last = count($this->token['attr']) - 1;
 994                     $this->token['attr'][$last]['value'] .= $char . $chars;
 995                     $state = 'attribute value (single-quoted)';
 996                 break;
 997
 998                 case 'attribute value (unquoted)':
 999                     // Collects an unquoted attribute value. Consume the next input character:
1000                     $char = $this->stream->char();
1001
1002                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1003                         /* U+0009 CHARACTER TABULATION
1004                         U+000A LINE FEED (LF)
1005                         U+000C FORM FEED (FF)
1006                         U+0020 SPACE
1007                         Switch to the before attribute name state. */
1008                         $state = 'before attribute name';
1009
1010                     } elseif($char === '&') {
1011                         /* U+0026 AMPERSAND (&)
1012                         Switch to the entity in attribute value state, with the
1013                         additional allowed character  being U+003E
1014                         GREATER-THAN SIGN (>). */
1015                         $this->characterReferenceInAttributeValue('>');
1016
1017                     } elseif($char === '>') {
1018                         /* U+003E GREATER-THAN SIGN (>)
1019                         Emit the current tag token. Switch to the data state. */
1020                         $this->emitToken($this->token);
1021                         $state = 'data';
1022
1023                     } elseif ($char === false) {
1024                         /* EOF
1025                         Parse error. Reconsume the EOF character in the data state. */
1026                         $this->emitToken(array(
1027                             'type' => self::PARSEERROR,
1028                             'data' => 'eof-in-attribute-value-no-quotes'
1029                         ));
1030                         $this->stream->unget();
1031                         $state = 'data';
1032
1033                     } else {
1034                         /* U+0022 QUOTATION MARK (")
1035                            U+0027 APOSTROPHE (')
1036                            U+003C LESS-THAN SIGN (<)
1037                            U+003D EQUALS SIGN (=)
1038                         Parse error. Treat it as per the "anything else"
1039                         entry below. */
1040                         if($char === '"' || $char === "'" || $char === '=' || $char === '<') {
1041                             $this->emitToken(array(
1042                                 'type' => self::PARSEERROR,
1043                                 'data' => 'unexpected-character-in-unquoted-attribute-value'
1044                             ));
1045                         }
1046
1047                         /* Anything else
1048                         Append the current input character to the current attribute's value.
1049                         Stay in the attribute value (unquoted) state. */
1050                         // "<" also ends the run, so each "<" is consumed alone and reported.
1051                         $chars = $this->stream->charsUntil("\t\n\x0c &<>\"'=");
1052                         $last = count($this->token['attr']) - 1;
1053                         $this->token['attr'][$last]['value'] .= $char . $chars;
1054
1055                         $state = 'attribute value (unquoted)';
1056                     }
1057                 break;
1058
1059                 case 'after attribute value (quoted)':
1060                     // Runs right after the closing quote of an attribute value (both
1061                     // quoted value states switch here when they see their delimiter):
1062                     //   space, LF, tab, FF  -> 'before attribute name'
1063                     //   "/"                 -> 'self-closing start tag'
1064                     //   ">"                 -> the tag token is emitted, then 'data'
1065                     //   EOF                 -> parse error, EOF is reconsumed in 'data'
1066                     //   anything else       -> parse error, the character is reconsumed
1067                     //                          in 'before attribute name'
1068                     $char = $this->stream->char();
1069
1070                     switch(true) {
1071                         case $char === ' ' || $char === "\n" || $char === "\t" || $char === "\x0c":
1072                             // Whitespace separates this attribute from the next one.
1073                             $state = 'before attribute name';
1074                         break;
1075
1076                         case $char === '/':
1077                             // A solidus may be the start of the "/>" terminator.
1078                             $state = 'self-closing start tag';
1079                         break;
1080
1081                         case $char === '>':
1082                             // The tag is complete: emit it and resume in 'data'.
1083                             $this->emitToken($this->token);
1084                             $state = 'data';
1085                         break;
1086
1087                         default:
1088                             // EOF and every other character are both parse errors.
1089                             // Only the message and the state that gets to reconsume
1090                             // the input differ between the two.
1091                             $this->emitToken(array(
1092                                 'type' => self::PARSEERROR,
1093                                 'data' => ($char === false)
1094                                     ? 'unexpected-EOF-after-attribute-value'
1095                                     : 'unexpected-character-after-attribute-value'
1096                             ));
1097
1098                             // Either way the input is handed back for another look.
1099                             $this->stream->unget();
1100                             $state = ($char === false) ? 'data' : 'before attribute name';
1101                         break;
1102                     }
1103                 break;
1104
1105                 case 'self-closing start tag':
1106                     // Entered after a "/" inside a tag; only ">" completes the tag here:
1107                     //   ">"           -> self-closing flag set, tag token emitted, 'data'
1108                     //   EOF           -> parse error, EOF is reconsumed in 'data'
1109                     //   anything else -> parse error, the character is reconsumed in
1110                     //                    'before attribute name'
1111                     $char = $this->stream->char();
1112
1113                     if($char !== '>') {
1114                         // The solidus was not followed by ">". Both error paths push
1115                         // the input back; they differ in message and target state.
1116                         if($char === false) {
1117                             $this->emitToken(array(
1118                                 'type' => self::PARSEERROR,
1119                                 'data' => 'unexpected-eof-after-self-closing'
1120                             ));
1121                             $this->stream->unget();
1122                             $state = 'data';
1123                         } else {
1124                             $this->emitToken(array(
1125                                 'type' => self::PARSEERROR,
1126                                 'data' => 'unexpected-character-after-self-closing'
1127                             ));
1128                             $this->stream->unget();
1129                             $state = 'before attribute name';
1130                         }
1131                         break;
1132                     }
1133
1134                     // "/>" seen: flag the token as self-closing before handing it over.
1135                     $this->token['self-closing'] = true;
1136                     $this->emitToken($this->token);
1137                     $state = 'data';
1138                 break;
1139
1140                 case 'bogus comment':
1141                     // Turns everything up to the next ">" (or up to the end of the
1142                     // input when there is none) into the data of a comment token.
1143                     // The token itself was set up by the state that switched here;
1144                     // 'markup declaration open', for instance, seeds it with the
1145                     // characters it had already consumed. The specification notes
1146                     // that this state is only reachable while the content model
1147                     // flag is set to PCDATA.
1148                     $this->token['data'] = $this->token['data'] . (string) $this->stream->charsUntil('>');
1149
1150                     // Consume the ">" that ended the scan; its value is not needed.
1151                     $this->stream->char();
1152
1153                     // The comment is complete: emit it and resume in the data state.
1154                     $this->emitToken($this->token);
1155
1156                     $state = 'data';
1157                 break;
1158
1159                 case 'markup declaration open':
1160                     // Looks ahead to tell a comment ("--") from a DOCTYPE; anything
1161                     // else is a parse error and is collected as a bogus comment.
1162                     //
1163                     // NOTE: the specification also describes a "[CDATA[" branch for
1164                     // foreign content at this point. It is not implemented here, so
1165                     // such input is treated like any other non-matching input.
1166                     //
1167                     // NOTE(review): charsWhile() presumably returns a non-string
1168                     // (false) when nothing matches, hence the (string) cast further
1169                     // down - confirm against the input stream class.
1170                     $hyphens = $this->stream->charsWhile('-', 2);
1171
1172                     if($hyphens === '--') {
1173                         // Two hyphens open a real comment; its data starts out empty.
1174                         $this->token = array(
1175                             'data' => '',
1176                             'type' => self::COMMENT
1177                         );
1178                         $state = 'comment start';
1179                         break;
1180                     }
1181
1182                     if($hyphens === '-') {
1183                         // A single hyphen is pushed back so that it is read again
1184                         // later instead of being dropped.
1185                         $this->stream->unget();
1186                     }
1187
1188                     // Read at most seven characters out of self::ALPHA ("DOCTYPE").
1189                     $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1190
1191                     if(strtoupper($alpha) === 'DOCTYPE') {
1192                         // The keyword is matched without regard to letter case.
1193                         $state = 'DOCTYPE';
1194                         break;
1195                     }
1196
1197                     // Neither a comment nor a DOCTYPE. The characters consumed above
1198                     // are no longer in the stream, so they start the comment data.
1199                     $this->emitToken(array(
1200                         'type' => self::PARSEERROR,
1201                         'data' => 'expected-dashes-or-doctype'
1202                     ));
1203
1204                     $this->token = array(
1205                         'data' => (string) $alpha,
1206                         'type' => self::COMMENT
1207                     );
1208                     $state = 'bogus comment';
1209                 break;
1210
1211                 case 'comment start':
1212                     // First state after the opening of a comment:
1213                     //   "-"           -> 'comment start dash'
1214                     //   ">"           -> parse error, comment token emitted, 'data'
1215                     //   EOF           -> parse error, comment token emitted, EOF is
1216                     //                    reconsumed in 'data'
1217                     //   anything else -> appended to the comment data, 'comment'
1218                     $char = $this->stream->char();
1219
1220                     if($char === '>' || $char === false) {
1221                         // The comment ends before any text was read. Both cases report
1222                         // a parse error and emit the comment token as it stands.
1223                         $this->emitToken(array(
1224                             'type' => self::PARSEERROR,
1225                             'data' => ($char === false)
1226                                 ? 'eof-in-comment'
1227                                 : 'incorrect-comment'
1228                         ));
1229                         $this->emitToken($this->token);
1230
1231                         if($char === false) {
1232                             // Only the EOF is handed back for the data state.
1233                             $this->stream->unget();
1234                         }
1235
1236                         $state = 'data';
1237
1238                     } elseif($char === '-') {
1239                         // A hyphen here may already belong to the closing sequence.
1240                         $state = 'comment start dash';
1241
1242                     } else {
1243                         // Ordinary comment text.
1244                         $this->token['data'] .= $char;
1245                         $state = 'comment';
1246                     }
1247                 break;
1248
1249                 case 'comment start dash':
1250                     /* Consume the next input character: */
1251                     $char = $this->stream->char();
1252                     if ($char === '-') {
1253                         /* U+002D HYPHEN-MINUS (-)
1254                         Switch to the comment end state */
1255                         $state = 'comment end';
1256                     } elseif ($char === '>') {
1257                         /* U+003E GREATER-THAN SIGN (>)
1258                         Parse error. Emit the comment token. Switch to the
1259                         data state. */
1260                         $this->emitToken(array(
1261                             'type' => self::PARSEERROR,
1262                             'data' => 'incorrect-comment'
1263                         ));
1264                         $this->emitToken($this->token);
1265                         $state = 'data';
1266                     } elseif ($char === false) {
1267                         /* Parse error. Emit the comment token. Reconsume the
1268                         EOF character in the data state. */
1269                         $this->emitToken(array(
1270                             'type' => self::PARSEERROR,
1271                             'data' => 'eof-in-comment'
1272                         ));
1273                         $this->emitToken($this->token);
1274                         $this->stream->unget();
1275                         $state = 'data';
1276                     } else {
1277                         $this->token['data'] .= '-' . $char;
1278                         $state = 'comment';
1279                     }
1280                 break;
1281
1282                 case 'comment':
1283                     /* Consume the next input character: */
1284                     $char = $this->stream->char();
1285
1286                     if($char === '-') {
1287                         /* U+002D HYPHEN-MINUS (-)
1288                         Switch to the comment end dash state */
1289                         $state = 'comment end dash';
1290
1291                     } elseif($char === false) {
1292                         /* EOF
1293                         Parse error. Emit the comment token. Reconsume the EOF character
1294                         in the data state. */
1295                         $this->emitToken(array(
1296                             'type' => self::PARSEERROR,
1297                             'data' => 'eof-in-comment'
1298                         ));
1299                         $this->emitToken($this->token);
1300                         $this->stream->unget();
1301                         $state = 'data';
1302
1303                     } else {
1304                         /* Anything else
1305                         Append the input character to the comment token's data. Stay in
1306                         the comment state. */
1307                         $chars = $this->stream->charsUntil('-');
1308
1309                         $this->token['data'] .= $char . $chars;
1310                     }
1311                 break;
1312
1313                 case 'comment end dash':
1314                     /* Consume the next input character: */
1315                     $char = $this->stream->char();
1316
1317                     if($char === '-') {
1318                         /* U+002D HYPHEN-MINUS (-)
1319                         Switch to the comment end state  */
1320                         $state = 'comment end';
1321
1322                     } elseif($char === false) {
1323                         /* EOF
1324                         Parse error. Emit the comment token. Reconsume the EOF character
1325                         in the data state. */
1326                         $this->emitToken(array(
1327                             'type' => self::PARSEERROR,
1328                             'data' => 'eof-in-comment-end-dash'
1329                         ));
1330                         $this->emitToken($this->token);
1331                         $this->stream->unget();
1332                         $state = 'data';
1333
1334                     } else {
1335                         /* Anything else
1336                         Append a U+002D HYPHEN-MINUS (-) character and the input
1337                         character to the comment token's data. Switch to the comment state. */
1338                         $this->token['data'] .= '-'.$char;
1339                         $state = 'comment';
1340                     }
1341                 break;
1342
1343                 case 'comment end': // entered once "--" has been seen inside a comment
1344                     /* Consume the next input character: */
1345                     $char = $this->stream->char();
1346
1347                     if($char === '>') {
1348                         /* U+003E GREATER-THAN SIGN (>)
1349                         Emit the comment token. Switch to the data state. */
1350                         $this->emitToken($this->token);
1351                         $state = 'data';
1352
1353                     } elseif($char === '-') {
1354                         /* U+002D HYPHEN-MINUS (-)
1355                         Parse error. Append a U+002D HYPHEN-MINUS (-) character
1356                         to the comment token's data. Stay in the comment end
1357                         state. */
1358                         $this->emitToken(array(
1359                             'type' => self::PARSEERROR,
1360                             'data' => 'unexpected-dash-after-double-dash-in-comment'
1361                         ));
1362                         $this->token['data'] .= '-';
1363
1364                     } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { // TAB, LF, FF or SPACE
1365                         $this->emitToken(array(
1366                             'type' => self::PARSEERROR,
1367                             'data' => 'unexpected-space-after-double-dash-in-comment'
1368                         ));
1369                         $this->token['data'] .= '--' . $char; // the two dashes were comment text after all
1370                         $state = 'comment end space';
1371
1372                     } elseif($char === '!') { // "--!" seen: parse error, decide in the comment end bang state
1373                         $this->emitToken(array(
1374                             'type' => self::PARSEERROR,
1375                             'data' => 'unexpected-bang-after-double-dash-in-comment'
1376                         ));
1377                         $state = 'comment end bang';
1378
1379                     } elseif($char === false) {
1380                         /* EOF
1381                         Parse error. Emit the comment token. Reconsume the
1382                         EOF character in the data state. */
1383                         $this->emitToken(array(
1384                             'type' => self::PARSEERROR,
1385                             'data' => 'eof-in-comment-double-dash'
1386                         ));
1387                         $this->emitToken($this->token);
1388                         $this->stream->unget();
1389                         $state = 'data';
1390
1391                     } else {
1392                         /* Anything else
1393                         Parse error. Append two U+002D HYPHEN-MINUS (-)
1394                         characters and the input character to the comment token's
1395                         data. Switch to the comment state. */
1396                         $this->emitToken(array(
1397                             'type' => self::PARSEERROR,
1398                             'data' => 'unexpected-char-in-comment'
1399                         ));
1400                         $this->token['data'] .= '--'.$char;
1401                         $state = 'comment';
1402                     }
1403                 break;
1404
1405                 case 'comment end bang':
1406                     $char = $this->stream->char();
1407                     if ($char === '>') {
1408                         $this->emitToken($this->token);
1409                         $state = 'data';
1410                     } elseif ($char === "-") {
1411                         $this->token['data'] .= '--!';
1412                         $state = 'comment end dash';
1413                     } elseif ($char === false) {
1414                         $this->emitToken(array(
1415                             'type' => self::PARSEERROR,
1416                             'data' => 'eof-in-comment-end-bang'
1417                         ));
1418                         $this->emitToken($this->token);
1419                         $this->stream->unget();
1420                         $state = 'data';
1421                     } else {
1422                         $this->token['data'] .= '--!' . $char;
1423                         $state = 'comment';
1424                     }
1425                 break;
1426
1427                 case 'comment end space':
1428                     $char = $this->stream->char();
1429                     if ($char === '>') {
1430                         $this->emitToken($this->token);
1431                         $state = 'data';
1432                     } elseif ($char === '-') {
1433                         $state = 'comment end dash';
1434                     } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1435                         $this->token['data'] .= $char;
1436                     } elseif ($char === false) {
1437                         $this->emitToken(array(
1438                             'type' => self::PARSEERROR,
1439                             'data' => 'unexpected-eof-in-comment-end-space',
1440                         ));
1441                         $this->emitToken($this->token);
1442                         $this->stream->unget();
1443                         $state = 'data';
1444                     } else {
1445                         $this->token['data'] .= $char;
1446                         $state = 'comment';
1447                     }
1448                 break;
1449
1450                 case 'DOCTYPE':
1451                     /* Consume the next input character: */
1452                     $char = $this->stream->char();
1453
1454                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1455                         /* U+0009 CHARACTER TABULATION
1456                            U+000A LINE FEED (LF)
1457                            U+000C FORM FEED (FF)
1458                            U+0020 SPACE
1459                         Switch to the before DOCTYPE name state. */
1460                         $state = 'before DOCTYPE name';
1461
1462                     } elseif($char === false) {
1463                         /* EOF
1464                         Parse error. Create a new DOCTYPE token. Set its
1465                         force-quirks flag to on. Emit the token. Reconsume the
1466                         EOF character in the data state. */
1467                         $this->emitToken(array(
1468                             'type' => self::PARSEERROR,
1469                             'data' => 'need-space-after-doctype-but-got-eof'
1470                         ));
1471                         $this->emitToken(array(
1472                             'name' => '',
1473                             'type' => self::DOCTYPE,
1474                             'force-quirks' => true,
1475                             'error' => true
1476                         ));
1477                         $this->stream->unget();
1478                         $state = 'data';
1479
1480                     } else {
1481                         /* Anything else
1482                         Parse error. Reconsume the current character in the
1483                         before DOCTYPE name state. */
1484                         $this->emitToken(array(
1485                             'type' => self::PARSEERROR,
1486                             'data' => 'need-space-after-doctype'
1487                         ));
1488                         $this->stream->unget();
1489                         $state = 'before DOCTYPE name';
1490                     }
1491                 break;
1492
1493                 case 'before DOCTYPE name':
1494                     /* Consume the next input character: */
1495                     $char = $this->stream->char();
1496
1497                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1498                         /* U+0009 CHARACTER TABULATION
1499                            U+000A LINE FEED (LF)
1500                            U+000C FORM FEED (FF)
1501                            U+0020 SPACE
1502                         Stay in the before DOCTYPE name state. */
1503
1504                     } elseif($char === '>') {
1505                         /* U+003E GREATER-THAN SIGN (>)
1506                         Parse error. Create a new DOCTYPE token. Set its
1507                         force-quirks flag to on. Emit the token. Switch to the
1508                         data state. */
1509                         $this->emitToken(array(
1510                             'type' => self::PARSEERROR,
1511                             'data' => 'expected-doctype-name-but-got-right-bracket'
1512                         ));
1513                         $this->emitToken(array(
1514                             'name' => '',
1515                             'type' => self::DOCTYPE,
1516                             'force-quirks' => true,
1517                             'error' => true
1518                         ));
1519
1520                         $state = 'data';
1521
1522                     } elseif('A' <= $char && $char <= 'Z') {
1523                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1524                         Create a new DOCTYPE token. Set the token's name to the
1525                         lowercase version of the input character (add 0x0020 to
1526                         the character's code point). Switch to the DOCTYPE name
1527                         state. */
1528                         $this->token = array(
1529                             'name' => strtolower($char),
1530                             'type' => self::DOCTYPE,
1531                             'error' => true
1532                         );
1533
1534                         $state = 'DOCTYPE name';
1535
1536                     } elseif($char === false) {
1537                         /* EOF
1538                         Parse error. Create a new DOCTYPE token. Set its
1539                         force-quirks flag to on. Emit the token. Reconsume the
1540                         EOF character in the data state. */
1541                         $this->emitToken(array(
1542                             'type' => self::PARSEERROR,
1543                             'data' => 'expected-doctype-name-but-got-eof'
1544                         ));
1545                         $this->emitToken(array(
1546                             'name' => '',
1547                             'type' => self::DOCTYPE,
1548                             'force-quirks' => true,
1549                             'error' => true
1550                         ));
1551
1552                         $this->stream->unget();
1553                         $state = 'data';
1554
1555                     } else {
1556                         /* Anything else
1557                         Create a new DOCTYPE token. Set the token's name to the
1558                         current input character. Switch to the DOCTYPE name state. */
1559                         $this->token = array(
1560                             'name' => $char,
1561                             'type' => self::DOCTYPE,
1562                             'error' => true
1563                         );
1564
1565                         $state = 'DOCTYPE name';
1566                     }
1567                 break;
1568
1569                 case 'DOCTYPE name':
1570                     /* Consume the next input character: */
1571                     $char = $this->stream->char();
1572
1573                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1574                         /* U+0009 CHARACTER TABULATION
1575                            U+000A LINE FEED (LF)
1576                            U+000C FORM FEED (FF)
1577                            U+0020 SPACE
1578                         Switch to the after DOCTYPE name state. */
1579                         $state = 'after DOCTYPE name';
1580
1581                     } elseif($char === '>') {
1582                         /* U+003E GREATER-THAN SIGN (>)
1583                         Emit the current DOCTYPE token. Switch to the data state. */
1584                         $this->emitToken($this->token);
1585                         $state = 'data';
1586
1587                     } elseif('A' <= $char && $char <= 'Z') {
1588                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1589                         Append the lowercase version of the input character
1590                         (add 0x0020 to the character's code point) to the current
1591                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1592                         $this->token['name'] .= strtolower($char);
1593
1594                     } elseif($char === false) {
1595                         /* EOF
1596                         Parse error. Set the DOCTYPE token's force-quirks flag
1597                         to on. Emit that DOCTYPE token. Reconsume the EOF
1598                         character in the data state. */
1599                         $this->emitToken(array(
1600                             'type' => self::PARSEERROR,
1601                             'data' => 'eof-in-doctype-name'
1602                         ));
1603                         $this->token['force-quirks'] = true;
1604                         $this->emitToken($this->token);
1605                         $this->stream->unget();
1606                         $state = 'data';
1607
1608                     } else {
1609                         /* Anything else
1610                         Append the current input character to the current
1611                         DOCTYPE token's name. Stay in the DOCTYPE name state. */
1612                         $this->token['name'] .= $char;
1613                     }
1614
1615                     // XXX probably a quirks-mode designation; check the tree builder. 'error'
1616                     // is not in the spec and likely needs removing. NOTE(review): names are
1617                     // lowercased above, so this never equals 'HTML' and 'error' stays true.
1618                     $this->token['error'] = ($this->token['name'] === 'HTML')
1619                         ? false
1620                         : true;
1621                 break;
1622
1623                 case 'after DOCTYPE name':
1624                     /* Consume the next input character: */
1625                     $char = $this->stream->char();
1626
1627                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1628                         /* U+0009 CHARACTER TABULATION
1629                            U+000A LINE FEED (LF)
1630                            U+000C FORM FEED (FF)
1631                            U+0020 SPACE
1632                         Stay in the after DOCTYPE name state. */
1633
1634                     } elseif($char === '>') {
1635                         /* U+003E GREATER-THAN SIGN (>)
1636                         Emit the current DOCTYPE token. Switch to the data state. */
1637                         $this->emitToken($this->token);
1638                         $state = 'data';
1639
1640                     } elseif($char === false) {
1641                         /* EOF
1642                         Parse error. Set the DOCTYPE token's force-quirks flag
1643                         to on. Emit that DOCTYPE token. Reconsume the EOF
1644                         character in the data state. */
1645                         $this->emitToken(array(
1646                             'type' => self::PARSEERROR,
1647                             'data' => 'eof-in-doctype'
1648                         ));
1649                         $this->token['force-quirks'] = true;
1650                         $this->emitToken($this->token);
1651                         $this->stream->unget();
1652                         $state = 'data';
1653
1654                     } else {
1655                         /* Anything else */
1656
1657                         $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1658                         if ($nextSix === 'PUBLIC') {
1659                             /* If the next six characters are an ASCII
1660                             case-insensitive match for the word "PUBLIC", then
1661                             consume those characters and switch to the before
1662                             DOCTYPE public identifier state. */
1663                             $state = 'before DOCTYPE public identifier';
1664
1665                         } elseif ($nextSix === 'SYSTEM') {
1666                             /* Otherwise, if the next six characters are an ASCII
1667                             case-insensitive match for the word "SYSTEM", then
1668                             consume those characters and switch to the before
1669                             DOCTYPE system identifier state. */
1670                             $state = 'before DOCTYPE system identifier';
1671
1672                         } else {
1673                             /* Otherwise, this is the parse error. Set the DOCTYPE
1674                             token's force-quirks flag to on. Switch to the bogus
1675                             DOCTYPE state. */
1676                             $this->emitToken(array(
1677                                 'type' => self::PARSEERROR,
1678                                 'data' => 'expected-space-or-right-bracket-in-doctype'
1679                             ));
1680                             $this->token['force-quirks'] = true;
1681                             $this->token['error'] = true;
1682                             $state = 'bogus DOCTYPE';
1683                         }
1684                     }
1685                 break;
1686
1687                 case 'before DOCTYPE public identifier':
1688                     /* Consume the next input character: */
1689                     $char = $this->stream->char();
1690
1691                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1692                         /* U+0009 CHARACTER TABULATION
1693                            U+000A LINE FEED (LF)
1694                            U+000C FORM FEED (FF)
1695                            U+0020 SPACE
1696                         Stay in the before DOCTYPE public identifier state. */
1697                     } elseif ($char === '"') {
1698                         /* U+0022 QUOTATION MARK (")
1699                         Set the DOCTYPE token's public identifier to the empty
1700                         string (not missing), then switch to the DOCTYPE public
1701                         identifier (double-quoted) state. */
1702                         $this->token['public'] = '';
1703                         $state = 'DOCTYPE public identifier (double-quoted)';
1704                     } elseif ($char === "'") {
1705                         /* U+0027 APOSTROPHE (')
1706                         Set the DOCTYPE token's public identifier to the empty
1707                         string (not missing), then switch to the DOCTYPE public
1708                         identifier (single-quoted) state. */
1709                         $this->token['public'] = '';
1710                         $state = 'DOCTYPE public identifier (single-quoted)';
1711                     } elseif ($char === '>') {
1712                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1713                         to on. Emit that DOCTYPE token. Switch to the data state. */
1714                         $this->emitToken(array(
1715                             'type' => self::PARSEERROR,
1716                             'data' => 'unexpected-end-of-doctype'
1717                         ));
1718                         $this->token['force-quirks'] = true;
1719                         $this->emitToken($this->token);
1720                         $state = 'data';
1721                     } elseif ($char === false) {
1722                         /* Parse error. Set the DOCTYPE token's force-quirks
1723                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1724                         character in the data state. */
1725                         $this->emitToken(array(
1726                             'type' => self::PARSEERROR,
1727                             'data' => 'eof-in-doctype'
1728                         ));
1729                         $this->token['force-quirks'] = true;
1730                         $this->emitToken($this->token);
1731                         $this->stream->unget();
1732                         $state = 'data';
1733                     } else {
1734                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1735                         to on. Switch to the bogus DOCTYPE state. */
1736                         $this->emitToken(array(
1737                             'type' => self::PARSEERROR,
1738                             'data' => 'unexpected-char-in-doctype'
1739                         ));
1740                         $this->token['force-quirks'] = true;
1741                         $state = 'bogus DOCTYPE';
1742                     }
1743                 break;
1744
1745                 case 'DOCTYPE public identifier (double-quoted)':
1746                     /* Consume the next input character: */
1747                     $char = $this->stream->char();
1748
1749                     if ($char === '"') {
1750                         /* U+0022 QUOTATION MARK (")
1751                         Switch to the after DOCTYPE public identifier state. */
1752                         $state = 'after DOCTYPE public identifier';
1753                     } elseif ($char === '>') {
1754                         /* U+003E GREATER-THAN SIGN (>)
1755                         Parse error. Set the DOCTYPE token's force-quirks flag
1756                         to on. Emit that DOCTYPE token. Switch to the data state. */
1757                         $this->emitToken(array(
1758                             'type' => self::PARSEERROR,
1759                             'data' => 'unexpected-end-of-doctype'
1760                         ));
1761                         $this->token['force-quirks'] = true;
1762                         $this->emitToken($this->token);
1763                         $state = 'data';
1764                     } elseif ($char === false) {
1765                         /* EOF
1766                         Parse error. Set the DOCTYPE token's force-quirks flag
1767                         to on. Emit that DOCTYPE token. Reconsume the EOF
1768                         character in the data state. */
1769                         $this->emitToken(array(
1770                             'type' => self::PARSEERROR,
1771                             'data' => 'eof-in-doctype'
1772                         ));
1773                         $this->token['force-quirks'] = true;
1774                         $this->emitToken($this->token);
1775                         $this->stream->unget();
1776                         $state = 'data';
1777                     } else {
1778                         /* Anything else
1779                         Append the current input character to the current
1780                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1781                         public identifier (double-quoted) state. */
1782                         $this->token['public'] .= $char;
1783                     }
1784                 break;
1785
1786                 case 'DOCTYPE public identifier (single-quoted)':
1787                     /* Consume the next input character: */
1788                     $char = $this->stream->char();
1789
1790                     if ($char === "'") {
1791                         /* U+0027 APOSTROPHE (')
1792                         Switch to the after DOCTYPE public identifier state. */
1793                         $state = 'after DOCTYPE public identifier';
1794                     } elseif ($char === '>') {
1795                         /* U+003E GREATER-THAN SIGN (>)
1796                         Parse error. Set the DOCTYPE token's force-quirks flag
1797                         to on. Emit that DOCTYPE token. Switch to the data state. */
1798                         $this->emitToken(array(
1799                             'type' => self::PARSEERROR,
1800                             'data' => 'unexpected-end-of-doctype'
1801                         ));
1802                         $this->token['force-quirks'] = true;
1803                         $this->emitToken($this->token);
1804                         $state = 'data';
1805                     } elseif ($char === false) {
1806                         /* EOF
1807                         Parse error. Set the DOCTYPE token's force-quirks flag
1808                         to on. Emit that DOCTYPE token. Reconsume the EOF
1809                         character in the data state. */
1810                         $this->emitToken(array(
1811                             'type' => self::PARSEERROR,
1812                             'data' => 'eof-in-doctype'
1813                         ));
1814                         $this->token['force-quirks'] = true;
1815                         $this->emitToken($this->token);
1816                         $this->stream->unget();
1817                         $state = 'data';
1818                     } else {
1819                         /* Anything else
1820                         Append the current input character to the current
1821                         DOCTYPE token's public identifier. Stay in the DOCTYPE
1822                         public identifier (single-quoted) state. */
1823                         $this->token['public'] .= $char;
1824                     }
1825                 break;
1826
1827                 case 'after DOCTYPE public identifier':
1828                     /* Consume the next input character: */
1829                     $char = $this->stream->char();
1830
1831                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1832                         /* U+0009 CHARACTER TABULATION
1833                            U+000A LINE FEED (LF)
1834                            U+000C FORM FEED (FF)
1835                            U+0020 SPACE
1836                         Stay in the after DOCTYPE public identifier state. */
1837                     } elseif ($char === '"') {
1838                         /* U+0022 QUOTATION MARK (")
1839                         Set the DOCTYPE token's system identifier to the
1840                         empty string (not missing), then switch to the DOCTYPE
1841                         system identifier (double-quoted) state. */
1842                         $this->token['system'] = '';
1843                         $state = 'DOCTYPE system identifier (double-quoted)';
1844                     } elseif ($char === "'") {
1845                         /* U+0027 APOSTROPHE (')
1846                         Set the DOCTYPE token's system identifier to the
1847                         empty string (not missing), then switch to the DOCTYPE
1848                         system identifier (single-quoted) state. */
1849                         $this->token['system'] = '';
1850                         $state = 'DOCTYPE system identifier (single-quoted)';
1851                     } elseif ($char === '>') {
1852                         /* U+003E GREATER-THAN SIGN (>)
1853                         Emit the current DOCTYPE token. Switch to the data state. */
1854                         $this->emitToken($this->token);
1855                         $state = 'data';
1856                     } elseif ($char === false) {
1857                         /* Parse error. Set the DOCTYPE token's force-quirks
1858                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1859                         character in the data state. */
1860                         $this->emitToken(array(
1861                             'type' => self::PARSEERROR,
1862                             'data' => 'eof-in-doctype'
1863                         ));
1864                         $this->token['force-quirks'] = true;
1865                         $this->emitToken($this->token);
1866                         $this->stream->unget();
1867                         $state = 'data';
1868                     } else {
1869                         /* Anything else
1870                         Parse error. Set the DOCTYPE token's force-quirks flag
1871                         to on. Switch to the bogus DOCTYPE state. */
1872                         $this->emitToken(array(
1873                             'type' => self::PARSEERROR,
1874                             'data' => 'unexpected-char-in-doctype'
1875                         ));
1876                         $this->token['force-quirks'] = true;
1877                         $state = 'bogus DOCTYPE';
1878                     }
1879                 break;
1880
1881                 case 'before DOCTYPE system identifier':
1882                     /* Consume the next input character: */
1883                     $char = $this->stream->char();
1884
1885                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1886                         /* U+0009 CHARACTER TABULATION
1887                            U+000A LINE FEED (LF)
1888                            U+000C FORM FEED (FF)
1889                            U+0020 SPACE
1890                         Stay in the before DOCTYPE system identifier state. */
1891                     } elseif ($char === '"') {
1892                         /* U+0022 QUOTATION MARK (")
1893                         Set the DOCTYPE token's system identifier to the empty
1894                         string (not missing), then switch to the DOCTYPE system
1895                         identifier (double-quoted) state. */
1896                         $this->token['system'] = '';
1897                         $state = 'DOCTYPE system identifier (double-quoted)';
1898                     } elseif ($char === "'") {
1899                         /* U+0027 APOSTROPHE (')
1900                         Set the DOCTYPE token's system identifier to the empty
1901                         string (not missing), then switch to the DOCTYPE system
1902                         identifier (single-quoted) state. */
1903                         $this->token['system'] = '';
1904                         $state = 'DOCTYPE system identifier (single-quoted)';
1905                     } elseif ($char === '>') {
1906                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1907                         to on. Emit that DOCTYPE token. Switch to the data state. */
1908                         $this->emitToken(array(
1909                             'type' => self::PARSEERROR,
1910                             'data' => 'unexpected-char-in-doctype'
1911                         ));
1912                         $this->token['force-quirks'] = true;
1913                         $this->emitToken($this->token);
1914                         $state = 'data';
1915                     } elseif ($char === false) {
1916                         /* Parse error. Set the DOCTYPE token's force-quirks
1917                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1918                         character in the data state. */
1919                         $this->emitToken(array(
1920                             'type' => self::PARSEERROR,
1921                             'data' => 'eof-in-doctype'
1922                         ));
1923                         $this->token['force-quirks'] = true;
1924                         $this->emitToken($this->token);
1925                         $this->stream->unget();
1926                         $state = 'data';
1927                     } else {
1928                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1929                         to on. Switch to the bogus DOCTYPE state. */
1930                         $this->emitToken(array(
1931                             'type' => self::PARSEERROR,
1932                             'data' => 'unexpected-char-in-doctype'
1933                         ));
1934                         $this->token['force-quirks'] = true;
1935                         $state = 'bogus DOCTYPE';
1936                     }
1937                 break;
1938
1939                 case 'DOCTYPE system identifier (double-quoted)':
1940                     /* Consume the next input character: */
1941                     $char = $this->stream->char();
1942
1943                     if ($char === '"') {
1944                         /* U+0022 QUOTATION MARK (")
1945                         Switch to the after DOCTYPE system identifier state. */
1946                         $state = 'after DOCTYPE system identifier';
1947                     } elseif ($char === '>') {
1948                         /* U+003E GREATER-THAN SIGN (>)
1949                         Parse error. Set the DOCTYPE token's force-quirks flag
1950                         to on. Emit that DOCTYPE token. Switch to the data state. */
1951                         $this->emitToken(array(
1952                             'type' => self::PARSEERROR,
1953                             'data' => 'unexpected-end-of-doctype'
1954                         ));
1955                         $this->token['force-quirks'] = true;
1956                         $this->emitToken($this->token);
1957                         $state = 'data';
1958                     } elseif ($char === false) {
1959                         /* EOF
1960                         Parse error. Set the DOCTYPE token's force-quirks flag
1961                         to on. Emit that DOCTYPE token. Reconsume the EOF
1962                         character in the data state. */
1963                         $this->emitToken(array(
1964                             'type' => self::PARSEERROR,
1965                             'data' => 'eof-in-doctype'
1966                         ));
1967                         $this->token['force-quirks'] = true;
1968                         $this->emitToken($this->token);
1969                         $this->stream->unget();
1970                         $state = 'data';
1971                     } else {
1972                         /* Anything else
1973                         Append the current input character to the current
1974                         DOCTYPE token's system identifier. Stay in the DOCTYPE
1975                         system identifier (double-quoted) state. */
1976                         $this->token['system'] .= $char;
1977                     }
1978                 break;
1979
1980                 case 'DOCTYPE system identifier (single-quoted)':
1981                     /* Consume the next input character: */
1982                     $char = $this->stream->char();
1983
1984                     if ($char === "'") {
1985                         /* U+0027 APOSTROPHE (')
1986                         Switch to the after DOCTYPE system identifier state. */
1987                         $state = 'after DOCTYPE system identifier';
1988                     } elseif ($char === '>') {
1989                         /* U+003E GREATER-THAN SIGN (>)
1990                         Parse error. Set the DOCTYPE token's force-quirks flag
1991                         to on. Emit that DOCTYPE token. Switch to the data state. */
1992                         $this->emitToken(array(
1993                             'type' => self::PARSEERROR,
1994                             'data' => 'unexpected-end-of-doctype'
1995                         ));
1996                         $this->token['force-quirks'] = true;
1997                         $this->emitToken($this->token);
1998                         $state = 'data';
1999                     } elseif ($char === false) {
2000                         /* EOF
2001                         Parse error. Set the DOCTYPE token's force-quirks flag
2002                         to on. Emit that DOCTYPE token. Reconsume the EOF
2003                         character in the data state. */
2004                         $this->emitToken(array(
2005                             'type' => self::PARSEERROR,
2006                             'data' => 'eof-in-doctype'
2007                         ));
2008                         $this->token['force-quirks'] = true;
2009                         $this->emitToken($this->token);
2010                         $this->stream->unget();
2011                         $state = 'data';
2012                     } else {
2013                         /* Anything else
2014                         Append the current input character to the current
2015                         DOCTYPE token's system identifier. Stay in the DOCTYPE
2016                         system identifier (double-quoted) state. */
2017                         $this->token['system'] .= $char;
2018                     }
2019                 break;
2020
2021                 case 'after DOCTYPE system identifier':
2022                     /* Consume the next input character: */
2023                     $char = $this->stream->char();
2024
2025                     if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2026                         /* U+0009 CHARACTER TABULATION
2027                            U+000A LINE FEED (LF)
2028                            U+000C FORM FEED (FF)
2029                            U+0020 SPACE
2030                         Stay in the after DOCTYPE system identifier state. */
2031                     } elseif ($char === '>') {
2032                         /* U+003E GREATER-THAN SIGN (>)
2033                         Emit the current DOCTYPE token. Switch to the data state. */
2034                         $this->emitToken($this->token);
2035                         $state = 'data';
2036                     } elseif ($char === false) {
2037                         /* Parse error. Set the DOCTYPE token's force-quirks
2038                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
2039                         character in the data state. */
2040                         $this->emitToken(array(
2041                             'type' => self::PARSEERROR,
2042                             'data' => 'eof-in-doctype'
2043                         ));
2044                         $this->token['force-quirks'] = true;
2045                         $this->emitToken($this->token);
2046                         $this->stream->unget();
2047                         $state = 'data';
2048                     } else {
2049                         /* Anything else
2050                         Parse error. Switch to the bogus DOCTYPE state.
2051                         (This does not set the DOCTYPE token's force-quirks
2052                         flag to on.) */
2053                         $this->emitToken(array(
2054                             'type' => self::PARSEERROR,
2055                             'data' => 'unexpected-char-in-doctype'
2056                         ));
2057                         $state = 'bogus DOCTYPE';
2058                     }
2059                 break;
2060
2061                 case 'bogus DOCTYPE':
2062                     /* Consume the next input character: */
2063                     $char = $this->stream->char();
2064
2065                     if ($char === '>') {
2066                         /* U+003E GREATER-THAN SIGN (>)
2067                         Emit the DOCTYPE token. Switch to the data state. */
2068                         $this->emitToken($this->token);
2069                         $state = 'data';
2070
2071                     } elseif($char === false) {
2072                         /* EOF
2073                         Emit the DOCTYPE token. Reconsume the EOF character in
2074                         the data state. */
2075                         $this->emitToken($this->token);
2076                         $this->stream->unget();
2077                         $state = 'data';
2078
2079                     } else {
2080                         /* Anything else
2081                         Stay in the bogus DOCTYPE state. */
2082                     }
2083                 break;
2084
2085                 // case 'cdataSection':
2086
2087             }
2088         }
2089     }
2090
2091     /**
2092      * Returns a serialized representation of the tree.
2093      */
2094     public function save() {
2095         return $this->tree->save();
2096     }
2097
2098     /**
2099      * Returns the input stream.
2100      */
2101     public function stream() {
2102         return $this->stream;
2103     }
2104
2105     private function consumeCharacterReference($allowed = false, $inattr = false) {
2106         // This goes quite far against spec, and is far closer to the Python
2107         // impl., mainly because we don't do the large unconsuming the spec
2108         // requires.
2109
2110         // All consumed characters.
2111         $chars = $this->stream->char();
2112
2113         /* This section defines how to consume a character
2114         reference. This definition is used when parsing character
2115         references in text and in attributes.
2116
2117         The behavior depends on the identity of the next character
2118         (the one immediately after the U+0026 AMPERSAND character): */
2119
2120         if (
2121             $chars[0] === "\x09" ||
2122             $chars[0] === "\x0A" ||
2123             $chars[0] === "\x0C" ||
2124             $chars[0] === "\x20" ||
2125             $chars[0] === '<' ||
2126             $chars[0] === '&' ||
2127             $chars === false ||
2128             $chars[0] === $allowed
2129         ) {
2130             /* U+0009 CHARACTER TABULATION
2131                U+000A LINE FEED (LF)
2132                U+000C FORM FEED (FF)
2133                U+0020 SPACE
2134                U+003C LESS-THAN SIGN
2135                U+0026 AMPERSAND
2136                EOF
2137                The additional allowed character, if there is one
2138             Not a character reference. No characters are consumed,
2139             and nothing is returned. (This is not an error, either.) */
2140             // We already consumed, so unconsume.
2141             $this->stream->unget();
2142             return '&';
2143         } elseif ($chars[0] === '#') {
2144             /* Consume the U+0023 NUMBER SIGN. */
2145             // Um, yeah, we already did that.
2146             /* The behavior further depends on the character after
2147             the U+0023 NUMBER SIGN: */
2148             $chars .= $this->stream->char();
2149             if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2150                 /* U+0078 LATIN SMALL LETTER X
2151                    U+0058 LATIN CAPITAL LETTER X */
2152                 /* Consume the X. */
2153                 // Um, yeah, we already did that.
2154                 /* Follow the steps below, but using the range of
2155                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2156                 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2157                 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2158                 A, through to U+0046 LATIN CAPITAL LETTER F (in other
2159                 words, 0123456789, ABCDEF, abcdef). */
2160                 $char_class = self::HEX;
2161                 /* When it comes to interpreting the
2162                 number, interpret it as a hexadecimal number. */
2163                 $hex = true;
2164             } else {
2165                 /* Anything else */
2166                 // Unconsume because we shouldn't have consumed this.
2167                 $chars = $chars[0];
2168                 $this->stream->unget();
2169                 /* Follow the steps below, but using the range of
2170                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2171                 NINE (i.e. just 0123456789). */
2172                 $char_class = self::DIGIT;
2173                 /* When it comes to interpreting the number,
2174                 interpret it as a decimal number. */
2175                 $hex = false;
2176             }
2177
2178             /* Consume as many characters as match the range of characters given above. */
2179             $consumed = $this->stream->charsWhile($char_class);
2180             if ($consumed === '' || $consumed === false) {
2181                 /* If no characters match the range, then don't consume
2182                 any characters (and unconsume the U+0023 NUMBER SIGN
2183                 character and, if appropriate, the X character). This
2184                 is a parse error; nothing is returned. */
2185                 $this->emitToken(array(
2186                     'type' => self::PARSEERROR,
2187                     'data' => 'expected-numeric-entity'
2188                 ));
2189                 return '&' . $chars;
2190             } else {
2191                 /* Otherwise, if the next character is a U+003B SEMICOLON,
2192                 consume that too. If it isn't, there is a parse error. */
2193                 if ($this->stream->char() !== ';') {
2194                     $this->stream->unget();
2195                     $this->emitToken(array(
2196                         'type' => self::PARSEERROR,
2197                         'data' => 'numeric-entity-without-semicolon'
2198                     ));
2199                 }
2200
2201                 /* If one or more characters match the range, then take
2202                 them all and interpret the string of characters as a number
2203                 (either hexadecimal or decimal as appropriate). */
2204                 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2205
2206                 /* If that number is one of the numbers in the first column
2207                 of the following table, then this is a parse error. Find the
2208                 row with that number in the first column, and return a
2209                 character token for the Unicode character given in the
2210                 second column of that row. */
2211                 $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2212                 if ($new_codepoint) {
2213                     $this->emitToken(array(
2214                         'type' => self::PARSEERROR,
2215                         'data' => 'illegal-windows-1252-entity'
2216                     ));
2217                     return HTML5_Data::utf8chr($new_codepoint);
2218                 } else {
2219                     /* Otherwise, if the number is greater than 0x10FFFF, then
2220                      * this is a parse error. Return a U+FFFD REPLACEMENT
2221                      * CHARACTER. */
2222                     if ($codepoint > 0x10FFFF) {
2223                         $this->emitToken(array(
2224                             'type' => self::PARSEERROR,
2225                             'data' => 'overlong-character-entity' // XXX probably not correct
2226                         ));
2227                         return "\xEF\xBF\xBD";
2228                     }
2229                     /* Otherwise, return a character token for the Unicode
2230                      * character whose code point is that number.  If the
2231                      * number is in the range 0x0001 to 0x0008,    0x000E to
2232                      * 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2233                      * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2234                      * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2235                      * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2236                      * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2237                      * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2238                      * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2239                      * or 0x10FFFF, then this is a parse error. */
2240                     // && has higher precedence than ||
2241                     if (
2242                         $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2243                         $codepoint === 0x000B ||
2244                         $codepoint >= 0x000E && $codepoint <= 0x001F ||
2245                         $codepoint >= 0x007F && $codepoint <= 0x009F ||
2246                         $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2247                         $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2248                         ($codepoint & 0xFFFE) === 0xFFFE ||
2249                         $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2250                     ) {
2251                         $this->emitToken(array(
2252                             'type' => self::PARSEERROR,
2253                             'data' => 'illegal-codepoint-for-numeric-entity'
2254                         ));
2255                     }
2256                     return HTML5_Data::utf8chr($codepoint);
2257                 }
2258             }
2259
2260         } else {
2261             /* Anything else */
2262
2263             /* Consume the maximum number of characters possible,
2264             with the consumed characters matching one of the
2265             identifiers in the first column of the named character
2266             references table (in a case-sensitive manner). */
2267             // What we actually do here is consume as much as we can while it
2268             // matches the start of one of the identifiers in the first column.
2269
2270             $refs = HTML5_Data::getNamedCharacterReferences();
2271
2272             // Get the longest string which is the start of an identifier
2273             // ($chars) as well as the longest identifier which matches ($id)
2274             // and its codepoint ($codepoint).
2275             $codepoint = false;
2276             $char = $chars;
2277             while ($char !== false && isset($refs[$char])) {
2278                 $refs = $refs[$char];
2279                 if (isset($refs['codepoint'])) {
2280                     $id = $chars;
2281                     $codepoint = $refs['codepoint'];
2282                 }
2283                 $chars .= $char = $this->stream->char();
2284             }
2285
2286             // Unconsume the one character we just took which caused the while
2287             // statement to fail. This could be anything and could cause state
2288             // changes (as if it matches the while loop it must be
2289             // alphanumeric so we can just concat it to whatever we get later).
2290             $this->stream->unget();
2291             if ($char !== false) {
2292                 $chars = substr($chars, 0, -1);
2293             }
2294
2295             /* If no match can be made, then this is a parse error.
2296             No characters are consumed, and nothing is returned. */
2297             if (!$codepoint) {
2298                 $this->emitToken(array(
2299                     'type' => self::PARSEERROR,
2300                     'data' => 'expected-named-entity'
2301                 ));
2302                 return '&' . $chars;
2303             }
2304
2305             /* If the last character matched is not a U+003B SEMICOLON
2306             (;), there is a parse error. */
2307             $semicolon = true;
2308             if (substr($id, -1) !== ';') {
2309                 $this->emitToken(array(
2310                     'type' => self::PARSEERROR,
2311                     'data' => 'named-entity-without-semicolon'
2312                 ));
2313                 $semicolon = false;
2314             }
2315
2316             /* If the character reference is being consumed as part of
2317             an attribute, and the last character matched is not a
2318             U+003B SEMICOLON (;), and the next character is in the
2319             range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2320             LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2321             or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2322             then, for historical reasons, all the characters that were
2323             matched after the U+0026 AMPERSAND (&) must be unconsumed,
2324             and nothing is returned. */
2325             if ($inattr && !$semicolon) {
2326                 // The next character is either the next character in $chars or in the stream.
2327                 if (strlen($chars) > strlen($id)) {
2328                     $next = substr($chars, strlen($id), 1);
2329                 } else {
2330                     $next = $this->stream->char();
2331                     $this->stream->unget();
2332                 }
2333                 if (
2334                     '0' <= $next && $next <= '9' ||
2335                     'A' <= $next && $next <= 'Z' ||
2336                     'a' <= $next && $next <= 'z'
2337                 ) {
2338                     return '&' . $chars;
2339                 }
2340             }
2341
2342             /* Otherwise, return a character token for the character
2343             corresponding to the character reference name (as given
2344             by the second column of the named character references table). */
2345             return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
2346         }
2347     }
2348
2349     private function characterReferenceInAttributeValue($allowed = false) {
2350         /* Attempt to consume a character reference. */
2351         $entity = $this->consumeCharacterReference($allowed, true);
2352
2353         /* If nothing is returned, append a U+0026 AMPERSAND
2354         character to the current attribute's value.
2355
2356         Otherwise, append the returned character token to the
2357         current attribute's value. */
2358         $char = (!$entity)
2359             ? '&'
2360             : $entity;
2361
2362         $last = count($this->token['attr']) - 1;
2363         $this->token['attr'][$last]['value'] .= $char;
2364
2365         /* Finally, switch back to the attribute value state that you
2366         were in when were switched into this state. */
2367     }
2368
2369     /**
2370      * Emits a token, passing it on to the tree builder.
2371      */
2372     protected function emitToken($token, $checkStream = true, $dry = false) {
2373         if ($checkStream) {
2374             // Emit errors from input stream.
2375             while ($this->stream->errors) {
2376                 $this->emitToken(array_shift($this->stream->errors), false);
2377             }
2378         }
2379         if($token['type'] === self::ENDTAG && !empty($token['attr'])) {
2380             for ($i = 0; $i < count($token['attr']); $i++) {
2381                 $this->emitToken(array(
2382                     'type' => self::PARSEERROR,
2383                     'data' => 'attributes-in-end-tag'
2384                 ));
2385             }
2386         }
2387         if($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
2388             $this->emitToken(array(
2389                 'type' => self::PARSEERROR,
2390                 'data' => 'self-closing-flag-on-end-tag',
2391             ));
2392         }
2393         if($token['type'] === self::STARTTAG) {
2394             // This could be changed to actually pass the tree-builder a hash
2395             $hash = array();
2396             foreach ($token['attr'] as $keypair) {
2397                 if (isset($hash[$keypair['name']])) {
2398                     $this->emitToken(array(
2399                         'type' => self::PARSEERROR,
2400                         'data' => 'duplicate-attribute',
2401                     ));
2402                 } else {
2403                     $hash[$keypair['name']] = $keypair['value'];
2404                 }
2405             }
2406         }
2407
2408         if(!$dry) {
2409             // the current structure of attributes is not a terribly good one
2410             $this->tree->emitToken($token);
2411         }
2412
2413         if(!$dry && is_int($this->tree->content_model)) {
2414             $this->content_model = $this->tree->content_model;
2415             $this->tree->content_model = null;
2416
2417         } elseif($token['type'] === self::ENDTAG) {
2418             $this->content_model = self::PCDATA;
2419         }
2420     }
2421 }
2422