]>
Commit | Line | Data |
---|---|---|
d4949327 NL |
1 | <?php\r |
2 | \r | |
3 | /**\r | |
4 | * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.\r | |
5 | * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.\r | |
6 | *\r | |
7 | * @note\r | |
8 | * Recent changes to PHP's DOM extension have resulted in some fatal\r | |
9 | * error conditions with the original version of PH5P. Pending changes,\r | |
10 | * this lexer will punt to DirectLex if DOM throws an exception.\r | |
11 | */\r | |
12 | \r | |
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Tokenizes HTML by parsing it with the PH5P HTML5 parser and walking the
     * resulting DOM. If the parser raises a DOMException, the exception is
     * registered in the context under 'PH5PError' and the original input is
     * tokenized with DirectLex instead.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // Normalize first, then wrap the normalized markup for parsing.
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $doc = $html5->save();
        } catch (DOMException $e) {
            // Parsing failed: keep the error around so callers can detect
            // it, and fall back to DirectLex on the original HTML.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $e);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Descend <html> -> <body> -> <div>, taking the first match each time.
        $html_node = $doc->getElementsByTagName('html')->item(0);
        $body_node = $html_node->getElementsByTagName('body')->item(0);
        $wrapper = $body_node->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($wrapper, $tokens);
        return $tokens;
    }
}
45 | \r | |
46 | /*\r | |
47 | \r | |
48 | Copyright 2007 Jeroen van der Meer <http://jero.net/>\r | |
49 | \r | |
50 | Permission is hereby granted, free of charge, to any person obtaining a\r | |
51 | copy of this software and associated documentation files (the\r | |
52 | "Software"), to deal in the Software without restriction, including\r | |
53 | without limitation the rights to use, copy, modify, merge, publish,\r | |
54 | distribute, sublicense, and/or sell copies of the Software, and to\r | |
55 | permit persons to whom the Software is furnished to do so, subject to\r | |
56 | the following conditions:\r | |
57 | \r | |
58 | The above copyright notice and this permission notice shall be included\r | |
59 | in all copies or substantial portions of the Software.\r | |
60 | \r | |
61 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS\r | |
62 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r | |
63 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\r | |
64 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\r | |
65 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\r | |
66 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\r | |
67 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\r | |
68 | \r | |
69 | */\r | |
70 | \r | |
71 | class HTML5\r | |
72 | {\r | |
73 | private $data;\r | |
74 | private $char;\r | |
75 | private $EOF;\r | |
76 | private $state;\r | |
77 | private $tree;\r | |
78 | private $token;\r | |
79 | private $content_model;\r | |
80 | private $escape = false;\r | |
81 | private $entities = array(\r | |
82 | 'AElig;',\r | |
83 | 'AElig',\r | |
84 | 'AMP;',\r | |
85 | 'AMP',\r | |
86 | 'Aacute;',\r | |
87 | 'Aacute',\r | |
88 | 'Acirc;',\r | |
89 | 'Acirc',\r | |
90 | 'Agrave;',\r | |
91 | 'Agrave',\r | |
92 | 'Alpha;',\r | |
93 | 'Aring;',\r | |
94 | 'Aring',\r | |
95 | 'Atilde;',\r | |
96 | 'Atilde',\r | |
97 | 'Auml;',\r | |
98 | 'Auml',\r | |
99 | 'Beta;',\r | |
100 | 'COPY;',\r | |
101 | 'COPY',\r | |
102 | 'Ccedil;',\r | |
103 | 'Ccedil',\r | |
104 | 'Chi;',\r | |
105 | 'Dagger;',\r | |
106 | 'Delta;',\r | |
107 | 'ETH;',\r | |
108 | 'ETH',\r | |
109 | 'Eacute;',\r | |
110 | 'Eacute',\r | |
111 | 'Ecirc;',\r | |
112 | 'Ecirc',\r | |
113 | 'Egrave;',\r | |
114 | 'Egrave',\r | |
115 | 'Epsilon;',\r | |
116 | 'Eta;',\r | |
117 | 'Euml;',\r | |
118 | 'Euml',\r | |
119 | 'GT;',\r | |
120 | 'GT',\r | |
121 | 'Gamma;',\r | |
122 | 'Iacute;',\r | |
123 | 'Iacute',\r | |
124 | 'Icirc;',\r | |
125 | 'Icirc',\r | |
126 | 'Igrave;',\r | |
127 | 'Igrave',\r | |
128 | 'Iota;',\r | |
129 | 'Iuml;',\r | |
130 | 'Iuml',\r | |
131 | 'Kappa;',\r | |
132 | 'LT;',\r | |
133 | 'LT',\r | |
134 | 'Lambda;',\r | |
135 | 'Mu;',\r | |
136 | 'Ntilde;',\r | |
137 | 'Ntilde',\r | |
138 | 'Nu;',\r | |
139 | 'OElig;',\r | |
140 | 'Oacute;',\r | |
141 | 'Oacute',\r | |
142 | 'Ocirc;',\r | |
143 | 'Ocirc',\r | |
144 | 'Ograve;',\r | |
145 | 'Ograve',\r | |
146 | 'Omega;',\r | |
147 | 'Omicron;',\r | |
148 | 'Oslash;',\r | |
149 | 'Oslash',\r | |
150 | 'Otilde;',\r | |
151 | 'Otilde',\r | |
152 | 'Ouml;',\r | |
153 | 'Ouml',\r | |
154 | 'Phi;',\r | |
155 | 'Pi;',\r | |
156 | 'Prime;',\r | |
157 | 'Psi;',\r | |
158 | 'QUOT;',\r | |
159 | 'QUOT',\r | |
160 | 'REG;',\r | |
161 | 'REG',\r | |
162 | 'Rho;',\r | |
163 | 'Scaron;',\r | |
164 | 'Sigma;',\r | |
165 | 'THORN;',\r | |
166 | 'THORN',\r | |
167 | 'TRADE;',\r | |
168 | 'Tau;',\r | |
169 | 'Theta;',\r | |
170 | 'Uacute;',\r | |
171 | 'Uacute',\r | |
172 | 'Ucirc;',\r | |
173 | 'Ucirc',\r | |
174 | 'Ugrave;',\r | |
175 | 'Ugrave',\r | |
176 | 'Upsilon;',\r | |
177 | 'Uuml;',\r | |
178 | 'Uuml',\r | |
179 | 'Xi;',\r | |
180 | 'Yacute;',\r | |
181 | 'Yacute',\r | |
182 | 'Yuml;',\r | |
183 | 'Zeta;',\r | |
184 | 'aacute;',\r | |
185 | 'aacute',\r | |
186 | 'acirc;',\r | |
187 | 'acirc',\r | |
188 | 'acute;',\r | |
189 | 'acute',\r | |
190 | 'aelig;',\r | |
191 | 'aelig',\r | |
192 | 'agrave;',\r | |
193 | 'agrave',\r | |
194 | 'alefsym;',\r | |
195 | 'alpha;',\r | |
196 | 'amp;',\r | |
197 | 'amp',\r | |
198 | 'and;',\r | |
199 | 'ang;',\r | |
200 | 'apos;',\r | |
201 | 'aring;',\r | |
202 | 'aring',\r | |
203 | 'asymp;',\r | |
204 | 'atilde;',\r | |
205 | 'atilde',\r | |
206 | 'auml;',\r | |
207 | 'auml',\r | |
208 | 'bdquo;',\r | |
209 | 'beta;',\r | |
210 | 'brvbar;',\r | |
211 | 'brvbar',\r | |
212 | 'bull;',\r | |
213 | 'cap;',\r | |
214 | 'ccedil;',\r | |
215 | 'ccedil',\r | |
216 | 'cedil;',\r | |
217 | 'cedil',\r | |
218 | 'cent;',\r | |
219 | 'cent',\r | |
220 | 'chi;',\r | |
221 | 'circ;',\r | |
222 | 'clubs;',\r | |
223 | 'cong;',\r | |
224 | 'copy;',\r | |
225 | 'copy',\r | |
226 | 'crarr;',\r | |
227 | 'cup;',\r | |
228 | 'curren;',\r | |
229 | 'curren',\r | |
230 | 'dArr;',\r | |
231 | 'dagger;',\r | |
232 | 'darr;',\r | |
233 | 'deg;',\r | |
234 | 'deg',\r | |
235 | 'delta;',\r | |
236 | 'diams;',\r | |
237 | 'divide;',\r | |
238 | 'divide',\r | |
239 | 'eacute;',\r | |
240 | 'eacute',\r | |
241 | 'ecirc;',\r | |
242 | 'ecirc',\r | |
243 | 'egrave;',\r | |
244 | 'egrave',\r | |
245 | 'empty;',\r | |
246 | 'emsp;',\r | |
247 | 'ensp;',\r | |
248 | 'epsilon;',\r | |
249 | 'equiv;',\r | |
250 | 'eta;',\r | |
251 | 'eth;',\r | |
252 | 'eth',\r | |
253 | 'euml;',\r | |
254 | 'euml',\r | |
255 | 'euro;',\r | |
256 | 'exist;',\r | |
257 | 'fnof;',\r | |
258 | 'forall;',\r | |
259 | 'frac12;',\r | |
260 | 'frac12',\r | |
261 | 'frac14;',\r | |
262 | 'frac14',\r | |
263 | 'frac34;',\r | |
264 | 'frac34',\r | |
265 | 'frasl;',\r | |
266 | 'gamma;',\r | |
267 | 'ge;',\r | |
268 | 'gt;',\r | |
269 | 'gt',\r | |
270 | 'hArr;',\r | |
271 | 'harr;',\r | |
272 | 'hearts;',\r | |
273 | 'hellip;',\r | |
274 | 'iacute;',\r | |
275 | 'iacute',\r | |
276 | 'icirc;',\r | |
277 | 'icirc',\r | |
278 | 'iexcl;',\r | |
279 | 'iexcl',\r | |
280 | 'igrave;',\r | |
281 | 'igrave',\r | |
282 | 'image;',\r | |
283 | 'infin;',\r | |
284 | 'int;',\r | |
285 | 'iota;',\r | |
286 | 'iquest;',\r | |
287 | 'iquest',\r | |
288 | 'isin;',\r | |
289 | 'iuml;',\r | |
290 | 'iuml',\r | |
291 | 'kappa;',\r | |
292 | 'lArr;',\r | |
293 | 'lambda;',\r | |
294 | 'lang;',\r | |
295 | 'laquo;',\r | |
296 | 'laquo',\r | |
297 | 'larr;',\r | |
298 | 'lceil;',\r | |
299 | 'ldquo;',\r | |
300 | 'le;',\r | |
301 | 'lfloor;',\r | |
302 | 'lowast;',\r | |
303 | 'loz;',\r | |
304 | 'lrm;',\r | |
305 | 'lsaquo;',\r | |
306 | 'lsquo;',\r | |
307 | 'lt;',\r | |
308 | 'lt',\r | |
309 | 'macr;',\r | |
310 | 'macr',\r | |
311 | 'mdash;',\r | |
312 | 'micro;',\r | |
313 | 'micro',\r | |
314 | 'middot;',\r | |
315 | 'middot',\r | |
316 | 'minus;',\r | |
317 | 'mu;',\r | |
318 | 'nabla;',\r | |
319 | 'nbsp;',\r | |
320 | 'nbsp',\r | |
321 | 'ndash;',\r | |
322 | 'ne;',\r | |
323 | 'ni;',\r | |
324 | 'not;',\r | |
325 | 'not',\r | |
326 | 'notin;',\r | |
327 | 'nsub;',\r | |
328 | 'ntilde;',\r | |
329 | 'ntilde',\r | |
330 | 'nu;',\r | |
331 | 'oacute;',\r | |
332 | 'oacute',\r | |
333 | 'ocirc;',\r | |
334 | 'ocirc',\r | |
335 | 'oelig;',\r | |
336 | 'ograve;',\r | |
337 | 'ograve',\r | |
338 | 'oline;',\r | |
339 | 'omega;',\r | |
340 | 'omicron;',\r | |
341 | 'oplus;',\r | |
342 | 'or;',\r | |
343 | 'ordf;',\r | |
344 | 'ordf',\r | |
345 | 'ordm;',\r | |
346 | 'ordm',\r | |
347 | 'oslash;',\r | |
348 | 'oslash',\r | |
349 | 'otilde;',\r | |
350 | 'otilde',\r | |
351 | 'otimes;',\r | |
352 | 'ouml;',\r | |
353 | 'ouml',\r | |
354 | 'para;',\r | |
355 | 'para',\r | |
356 | 'part;',\r | |
357 | 'permil;',\r | |
358 | 'perp;',\r | |
359 | 'phi;',\r | |
360 | 'pi;',\r | |
361 | 'piv;',\r | |
362 | 'plusmn;',\r | |
363 | 'plusmn',\r | |
364 | 'pound;',\r | |
365 | 'pound',\r | |
366 | 'prime;',\r | |
367 | 'prod;',\r | |
368 | 'prop;',\r | |
369 | 'psi;',\r | |
370 | 'quot;',\r | |
371 | 'quot',\r | |
372 | 'rArr;',\r | |
373 | 'radic;',\r | |
374 | 'rang;',\r | |
375 | 'raquo;',\r | |
376 | 'raquo',\r | |
377 | 'rarr;',\r | |
378 | 'rceil;',\r | |
379 | 'rdquo;',\r | |
380 | 'real;',\r | |
381 | 'reg;',\r | |
382 | 'reg',\r | |
383 | 'rfloor;',\r | |
384 | 'rho;',\r | |
385 | 'rlm;',\r | |
386 | 'rsaquo;',\r | |
387 | 'rsquo;',\r | |
388 | 'sbquo;',\r | |
389 | 'scaron;',\r | |
390 | 'sdot;',\r | |
391 | 'sect;',\r | |
392 | 'sect',\r | |
393 | 'shy;',\r | |
394 | 'shy',\r | |
395 | 'sigma;',\r | |
396 | 'sigmaf;',\r | |
397 | 'sim;',\r | |
398 | 'spades;',\r | |
399 | 'sub;',\r | |
400 | 'sube;',\r | |
401 | 'sum;',\r | |
402 | 'sup1;',\r | |
403 | 'sup1',\r | |
404 | 'sup2;',\r | |
405 | 'sup2',\r | |
406 | 'sup3;',\r | |
407 | 'sup3',\r | |
408 | 'sup;',\r | |
409 | 'supe;',\r | |
410 | 'szlig;',\r | |
411 | 'szlig',\r | |
412 | 'tau;',\r | |
413 | 'there4;',\r | |
414 | 'theta;',\r | |
415 | 'thetasym;',\r | |
416 | 'thinsp;',\r | |
417 | 'thorn;',\r | |
418 | 'thorn',\r | |
419 | 'tilde;',\r | |
420 | 'times;',\r | |
421 | 'times',\r | |
422 | 'trade;',\r | |
423 | 'uArr;',\r | |
424 | 'uacute;',\r | |
425 | 'uacute',\r | |
426 | 'uarr;',\r | |
427 | 'ucirc;',\r | |
428 | 'ucirc',\r | |
429 | 'ugrave;',\r | |
430 | 'ugrave',\r | |
431 | 'uml;',\r | |
432 | 'uml',\r | |
433 | 'upsih;',\r | |
434 | 'upsilon;',\r | |
435 | 'uuml;',\r | |
436 | 'uuml',\r | |
437 | 'weierp;',\r | |
438 | 'xi;',\r | |
439 | 'yacute;',\r | |
440 | 'yacute',\r | |
441 | 'yen;',\r | |
442 | 'yen',\r | |
443 | 'yuml;',\r | |
444 | 'yuml',\r | |
445 | 'zeta;',\r | |
446 | 'zwj;',\r | |
447 | 'zwnj;'\r | |
448 | );\r | |
449 | \r | |
450 | const PCDATA = 0;\r | |
451 | const RCDATA = 1;\r | |
452 | const CDATA = 2;\r | |
453 | const PLAINTEXT = 3;\r | |
454 | \r | |
455 | const DOCTYPE = 0;\r | |
456 | const STARTTAG = 1;\r | |
457 | const ENDTAG = 2;\r | |
458 | const COMMENT = 3;\r | |
459 | const CHARACTR = 4;\r | |
460 | const EOF = 5;\r | |
461 | \r | |
public function __construct($data)
{
    // Input buffer, its length, and the cursor. The cursor starts at -1;
    // the data state increments it before reading a character.
    $this->data = $data;
    $this->EOF = strlen($data);
    $this->char = -1;

    $this->content_model = self::PCDATA;
    $this->tree = new HTML5TreeConstructer;
    $this->state = 'data';

    // Drive the state machine: the handler for state "foo" is the method
    // fooState(). The loop ends once the state has been set to null.
    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
476 | \r | |
public function save()
{
    // Delegate to the tree constructor and hand back whatever it returns.
    $tree = $this->tree;
    return $tree->save();
}
481 | \r | |
private function char()
{
    // Returns the character under the cursor, or false once the cursor has
    // reached the end of the input.
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
488 | \r | |
/**
 * Reads from the input buffer without moving the cursor.
 *
 * @param int $s Zero-based start offset into the input.
 * @param int $l Number of characters to read; 0 means "the single
 *               character at $s".
 * @return string|null The requested character or substring, or null when
 *                     the request does not lie entirely inside the input.
 */
private function character($s, $l = 0)
{
    // A negative offset would silently wrap around to the end of the
    // input; treat it as out of range instead.
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // The substring occupies offsets [$s, $s + $l), so it fits as long as
    // $s + $l does not exceed the input length. (A strict "<" here would
    // wrongly reject a substring that ends exactly at the end of input.)
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
499 | \r | |
/**
 * Returns the longest run of characters belonging to a character class,
 * starting exactly at the given offset.
 *
 * @param string $char_class Body of a PCRE character class, e.g. 'A-Za-z'.
 * @param int $start Zero-based offset into the input.
 * @return string The matched run, or '' when the character at $start is not
 *                in the class or $start is outside the input.
 */
private function characters($char_class, $start)
{
    if ($start < 0 || $start >= strlen($this->data)) {
        return '';
    }

    // \G anchors the match at $start. Using preg_match (rather than a
    // preg_replace that strips the tail) guarantees that "no run here"
    // yields '' instead of the entire remaining input.
    $matched = preg_match('#\G[' . $char_class . ']+#', $this->data, $run, 0, $start);

    return ($matched === 1) ? $run[0] : '';
}
504 | \r | |
/**
 * Data state of the tokenizer: consumes one character (or, for ordinary
 * text, a whole run of characters) and either emits a character token or
 * switches to the entity data / tag open state.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $is_pcdata = ($this->content_model === self::PCDATA);
    $is_rcdata = ($this->content_model === self::RCDATA);
    $is_cdata = ($this->content_model === self::CDATA);
    $escapable = ($is_rcdata || $is_cdata);

    if ($char === '&' && ($is_pcdata || $is_rcdata)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
        and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
        // The window must END at the current character, i.e. start three
        // characters back.
        if ($escapable && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    /* U+003C LESS-THAN SIGN (<) */
    } elseif ($char === '<' && ($is_pcdata || ($escapable && $this->escape === false))) {
        /* When the content model flag is set to the PCDATA state: switch
        to the tag open state.

        When the content model flag is set to either the RCDATA state or
        the CDATA state and the escape flag is false: switch to the tag
        open state.

        Otherwise: treat it as per the "anything else" entry below. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif ($char === '>') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are U+002D
        HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
        set the escape flag to false. */
        // Look BACK two characters; reading forward from '>' can never
        // yield "-->" and would leave the escape flag set forever.
        if ($escapable && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
        otherwise would also be treated as a character token and emit it
        as a single character token. Stay in the data state. */
        // The current character has already been classified as text, so it
        // is always consumed; the run then extends up to the next '<' or
        // '&'. Scanning from the current character itself would produce a
        // zero-length run when that character is '<' or '&' (e.g. '&' in
        // CDATA, or '<' while the escape flag is set), leaving the cursor
        // in place and the state machine looping forever.
        // NOTE(review): a run may swallow '-' and '>' characters, so the
        // escape-flag checks above only see those characters when they
        // start a run.
        $len = 1 + strcspn($this->data, '<&', $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
616 | \r | |
/**
 * Entity data state: tries to consume an entity after '&' and emits either
 * the result or a literal ampersand, then returns to the data state.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    // The string '0' is falsy in PHP but is a legitimate result (e.g. a
    // numeric reference to the digit zero), so it must not be mistaken for
    // "nothing". NOTE(review): assumes entity() returns the decoded text
    // or a falsy value on failure -- confirm against entity().
    $nothing = (!$entity && $entity !== '0');

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $nothing ? '&' : $entity
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
635 | \r | |
/**
 * Tag open state: entered after '<'. Decides, based on the content model
 * and the following character, whether a tag, a markup declaration, a bogus
 * comment or plain text follows.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    // Loose comparisons below deliberately mirror switch-statement matching.
    if ($model == self::RCDATA || $model == self::CDATA) {
        /* RCDATA / CDATA: only "</" is significant. If the next input
        character is a U+002F SOLIDUS (/), consume it and switch to the
        close tag open state; otherwise emit a U+003C LESS-THAN SIGN
        character token and switch to the data state to process the next
        input character. */
        $closing = ($this->character($this->char + 1) === '/');

        if ($closing) {
            $this->char++;
            $this->state = 'closeTagOpen';
        } else {
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                )
            );

            $this->state = 'data';
        }
        return;
    }

    if ($model != self::PCDATA) {
        // Any other content model: nothing to do here.
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    if ($char === '!') {
        /* U+0021 EXCLAMATION MARK (!)
        Switch to the markup declaration open state. */
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($char === '/') {
        /* U+002F SOLIDUS (/)
        Switch to the close tag open state. */
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $char)) {
        /* ASCII letter: create a new start tag token whose tag name is the
        lowercase version of the input character, then switch to the tag
        name state. The token is not emitted yet; further details are
        filled in before it is emitted. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );

        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Emit a U+003C LESS-THAN SIGN character token and a
        U+003E GREATER-THAN SIGN character token. Switch to the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );

        $this->state = 'data';
        return;
    }

    if ($char === '?') {
        /* U+003F QUESTION MARK (?)
        Parse error. Switch to the bogus comment state. */
        $this->state = 'bogusComment';
        return;
    }

    /* Anything else
    Parse error. Emit a U+003C LESS-THAN SIGN character token and
    reconsume the current input character in the data state. */
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );

    $this->char--;
    $this->state = 'data';
}
727 | \r | |
/**
 * Close tag open state: entered after "</". In RCDATA/CDATA content only an
 * end tag matching the current open element is honoured; anything else is
 * emitted as the literal text "</".
 */
private function closeTagOpenState()
{
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

    $raw_text = ($this->content_model === self::RCDATA || $this->content_model === self::CDATA);

    // A matching name only counts when it is properly terminated: followed
    // by whitespace, '>', '/', or the end of input. EOF is checked
    // explicitly so that it is accepted (as the spec text below requires)
    // and so that no null value is handed to preg_match().
    $terminated = false;
    if ($the_same) {
        $after = $this->char + 1 + strlen($next_node);
        $terminated = ($after >= $this->EOF)
            || preg_match('/[\t\n\x0b\x0c >\/]/', $this->data[$after]) === 1;
    }

    if ($raw_text && !$terminated) {
        /* If the content model flag is set to the RCDATA or CDATA states then
        examine the next few characters. If they do not match the tag name of
        the last start tag token emitted (case insensitively), or if they do but
        they are not immediately followed by one of the following characters:
            * U+0009 CHARACTER TABULATION
            * U+000A LINE FEED (LF)
            * U+000B LINE TABULATION
            * U+000C FORM FEED (FF)
            * U+0020 SPACE
            * U+003E GREATER-THAN SIGN (>)
            * U+002F SOLIDUS (/)
            * EOF
        ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
        token, a U+002F SOLIDUS character token, and switch to the data state
        to process the next input character. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );

        $this->state = 'data';

    } else {
        /* Otherwise, if the content model flag is set to the PCDATA state,
        or if the next few characters do match that tag name, consume the
        next input character: */
        $this->char++;
        $char = $this->char();

        if (preg_match('/^[A-Za-z]$/', $char)) {
            /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
            Create a new end tag token, set its tag name to the lowercase version
            of the input character (add 0x0020 to the character's code point), then
            switch to the tag name state. (Don't emit the token yet; further details
            will be filled in before it is emitted.) */
            $this->token = array(
                'name' => strtolower($char),
                'type' => self::ENDTAG
            );

            $this->state = 'tagName';

        } elseif ($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Parse error. Switch to the data state. */
            $this->state = 'data';

        } elseif ($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
            SOLIDUS character token. Reconsume the EOF character in the data state. */
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '</'
                )
            );

            $this->char--;
            $this->state = 'data';

        } else {
            /* Parse error. Switch to the bogus comment state. */
            $this->state = 'bogusComment';
        }
    }
}
808 | \r | |
/**
 * Tag name state: accumulates the name of the current start/end tag token
 * one character at a time until whitespace, '/', '>' or EOF ends it.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // EOF is tested first: character() yields null there, and null must
    // not be passed to preg_match() (deprecated since PHP 8.1).
    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Append the current input character to the current tag token's tag name.
        Stay in the tag name state. */
        $this->token['name'] .= strtolower($char);
        $this->state = 'tagName';
    }
}
853 | \r | |
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute name starts, the tag closes, or the input ends.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // EOF is tested first: character() yields null there, and null must
    // not be passed to preg_match() (deprecated since PHP 8.1).
    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Stay in the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the (lowercased) current input character and its value to
        null (the spec text says the empty string). Switch to the attribute
        name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
903 | \r | |
/**
 * Attribute name state: accumulates the name of the attribute most recently
 * added to the current tag token until a delimiter or EOF ends it.
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // EOF is tested first: character() yields null there, and null must
    // not be passed to preg_match() (deprecated since PHP 8.1).
    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        // This applies to every slash, including one directly before '>':
        // otherwise "<input disabled/>" would fall through to the branch
        // below and produce an attribute named "disabled/".
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
955 | \r | |
/**
 * After attribute name state: entered when whitespace follows an attribute
 * name; decides whether a value, another attribute, or the end of the tag
 * comes next.
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // EOF is tested first: character() yields null there, and null must
    // not be passed to preg_match() (deprecated since PHP 8.1).
    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        // This applies to every slash, including one directly before '>':
        // otherwise "<input disabled />" would fall through to the branch
        // below and start a bogus attribute named "/".
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the (lowercased) current input character and its value to
        null (the spec text says the empty string). Switch to the attribute
        name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
1010 | \r | |
1011 | private function beforeAttributeValueState()\r | |
1012 | {\r | |
1013 | // Consume the next input character:\r | |
1014 | $this->char++;\r | |
1015 | $char = $this->character($this->char);\r | |
1016 | \r | |
1017 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1018 | /* U+0009 CHARACTER TABULATION\r | |
1019 | U+000A LINE FEED (LF)\r | |
1020 | U+000B LINE TABULATION\r | |
1021 | U+000C FORM FEED (FF)\r | |
1022 | U+0020 SPACE\r | |
1023 | Stay in the before attribute value state. */\r | |
1024 | $this->state = 'beforeAttributeValue';\r | |
1025 | \r | |
1026 | } elseif ($char === '"') {\r | |
1027 | /* U+0022 QUOTATION MARK (")\r | |
1028 | Switch to the attribute value (double-quoted) state. */\r | |
1029 | $this->state = 'attributeValueDoubleQuoted';\r | |
1030 | \r | |
1031 | } elseif ($char === '&') {\r | |
1032 | /* U+0026 AMPERSAND (&)\r | |
1033 | Switch to the attribute value (unquoted) state and reconsume\r | |
1034 | this input character. */\r | |
1035 | $this->char--;\r | |
1036 | $this->state = 'attributeValueUnquoted';\r | |
1037 | \r | |
1038 | } elseif ($char === '\'') {\r | |
1039 | /* U+0027 APOSTROPHE (')\r | |
1040 | Switch to the attribute value (single-quoted) state. */\r | |
1041 | $this->state = 'attributeValueSingleQuoted';\r | |
1042 | \r | |
1043 | } elseif ($char === '>') {\r | |
1044 | /* U+003E GREATER-THAN SIGN (>)\r | |
1045 | Emit the current tag token. Switch to the data state. */\r | |
1046 | $this->emitToken($this->token);\r | |
1047 | $this->state = 'data';\r | |
1048 | \r | |
1049 | } else {\r | |
1050 | /* Anything else\r | |
1051 | Append the current input character to the current attribute's value.\r | |
1052 | Switch to the attribute value (unquoted) state. */\r | |
1053 | $last = count($this->token['attr']) - 1;\r | |
1054 | $this->token['attr'][$last]['value'] .= $char;\r | |
1055 | \r | |
1056 | $this->state = 'attributeValueUnquoted';\r | |
1057 | }\r | |
1058 | }\r | |
1059 | \r | |
1060 | private function attributeValueDoubleQuotedState()\r | |
1061 | {\r | |
1062 | // Consume the next input character:\r | |
1063 | $this->char++;\r | |
1064 | $char = $this->character($this->char);\r | |
1065 | \r | |
1066 | if ($char === '"') {\r | |
1067 | /* U+0022 QUOTATION MARK (")\r | |
1068 | Switch to the before attribute name state. */\r | |
1069 | $this->state = 'beforeAttributeName';\r | |
1070 | \r | |
1071 | } elseif ($char === '&') {\r | |
1072 | /* U+0026 AMPERSAND (&)\r | |
1073 | Switch to the entity in attribute value state. */\r | |
1074 | $this->entityInAttributeValueState('double');\r | |
1075 | \r | |
1076 | } elseif ($this->char === $this->EOF) {\r | |
1077 | /* EOF\r | |
1078 | Parse error. Emit the current tag token. Reconsume the character\r | |
1079 | in the data state. */\r | |
1080 | $this->emitToken($this->token);\r | |
1081 | \r | |
1082 | $this->char--;\r | |
1083 | $this->state = 'data';\r | |
1084 | \r | |
1085 | } else {\r | |
1086 | /* Anything else\r | |
1087 | Append the current input character to the current attribute's value.\r | |
1088 | Stay in the attribute value (double-quoted) state. */\r | |
1089 | $last = count($this->token['attr']) - 1;\r | |
1090 | $this->token['attr'][$last]['value'] .= $char;\r | |
1091 | \r | |
1092 | $this->state = 'attributeValueDoubleQuoted';\r | |
1093 | }\r | |
1094 | }\r | |
1095 | \r | |
1096 | private function attributeValueSingleQuotedState()\r | |
1097 | {\r | |
1098 | // Consume the next input character:\r | |
1099 | $this->char++;\r | |
1100 | $char = $this->character($this->char);\r | |
1101 | \r | |
1102 | if ($char === '\'') {\r | |
1103 | /* U+0022 QUOTATION MARK (')\r | |
1104 | Switch to the before attribute name state. */\r | |
1105 | $this->state = 'beforeAttributeName';\r | |
1106 | \r | |
1107 | } elseif ($char === '&') {\r | |
1108 | /* U+0026 AMPERSAND (&)\r | |
1109 | Switch to the entity in attribute value state. */\r | |
1110 | $this->entityInAttributeValueState('single');\r | |
1111 | \r | |
1112 | } elseif ($this->char === $this->EOF) {\r | |
1113 | /* EOF\r | |
1114 | Parse error. Emit the current tag token. Reconsume the character\r | |
1115 | in the data state. */\r | |
1116 | $this->emitToken($this->token);\r | |
1117 | \r | |
1118 | $this->char--;\r | |
1119 | $this->state = 'data';\r | |
1120 | \r | |
1121 | } else {\r | |
1122 | /* Anything else\r | |
1123 | Append the current input character to the current attribute's value.\r | |
1124 | Stay in the attribute value (single-quoted) state. */\r | |
1125 | $last = count($this->token['attr']) - 1;\r | |
1126 | $this->token['attr'][$last]['value'] .= $char;\r | |
1127 | \r | |
1128 | $this->state = 'attributeValueSingleQuoted';\r | |
1129 | }\r | |
1130 | }\r | |
1131 | \r | |
1132 | private function attributeValueUnquotedState()\r | |
1133 | {\r | |
1134 | // Consume the next input character:\r | |
1135 | $this->char++;\r | |
1136 | $char = $this->character($this->char);\r | |
1137 | \r | |
1138 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1139 | /* U+0009 CHARACTER TABULATION\r | |
1140 | U+000A LINE FEED (LF)\r | |
1141 | U+000B LINE TABULATION\r | |
1142 | U+000C FORM FEED (FF)\r | |
1143 | U+0020 SPACE\r | |
1144 | Switch to the before attribute name state. */\r | |
1145 | $this->state = 'beforeAttributeName';\r | |
1146 | \r | |
1147 | } elseif ($char === '&') {\r | |
1148 | /* U+0026 AMPERSAND (&)\r | |
1149 | Switch to the entity in attribute value state. */\r | |
1150 | $this->entityInAttributeValueState();\r | |
1151 | \r | |
1152 | } elseif ($char === '>') {\r | |
1153 | /* U+003E GREATER-THAN SIGN (>)\r | |
1154 | Emit the current tag token. Switch to the data state. */\r | |
1155 | $this->emitToken($this->token);\r | |
1156 | $this->state = 'data';\r | |
1157 | \r | |
1158 | } else {\r | |
1159 | /* Anything else\r | |
1160 | Append the current input character to the current attribute's value.\r | |
1161 | Stay in the attribute value (unquoted) state. */\r | |
1162 | $last = count($this->token['attr']) - 1;\r | |
1163 | $this->token['attr'][$last]['value'] .= $char;\r | |
1164 | \r | |
1165 | $this->state = 'attributeValueUnquoted';\r | |
1166 | }\r | |
1167 | }\r | |
1168 | \r | |
1169 | private function entityInAttributeValueState()\r | |
1170 | {\r | |
1171 | // Attempt to consume an entity.\r | |
1172 | $entity = $this->entity();\r | |
1173 | \r | |
1174 | // If nothing is returned, append a U+0026 AMPERSAND character to the\r | |
1175 | // current attribute's value. Otherwise, emit the character token that\r | |
1176 | // was returned.\r | |
1177 | $char = (!$entity)\r | |
1178 | ? '&'\r | |
1179 | : $entity;\r | |
1180 | \r | |
1181 | $last = count($this->token['attr']) - 1;\r | |
1182 | $this->token['attr'][$last]['value'] .= $char;\r | |
1183 | }\r | |
1184 | \r | |
1185 | private function bogusCommentState()\r | |
1186 | {\r | |
1187 | /* Consume every character up to the first U+003E GREATER-THAN SIGN\r | |
1188 | character (>) or the end of the file (EOF), whichever comes first. Emit\r | |
1189 | a comment token whose data is the concatenation of all the characters\r | |
1190 | starting from and including the character that caused the state machine\r | |
1191 | to switch into the bogus comment state, up to and including the last\r | |
1192 | consumed character before the U+003E character, if any, or up to the\r | |
1193 | end of the file otherwise. (If the comment was started by the end of\r | |
1194 | the file (EOF), the token is empty.) */\r | |
1195 | $data = $this->characters('^>', $this->char);\r | |
1196 | $this->emitToken(\r | |
1197 | array(\r | |
1198 | 'data' => $data,\r | |
1199 | 'type' => self::COMMENT\r | |
1200 | )\r | |
1201 | );\r | |
1202 | \r | |
1203 | $this->char += strlen($data);\r | |
1204 | \r | |
1205 | /* Switch to the data state. */\r | |
1206 | $this->state = 'data';\r | |
1207 | \r | |
1208 | /* If the end of the file was reached, reconsume the EOF character. */\r | |
1209 | if ($this->char === $this->EOF) {\r | |
1210 | $this->char = $this->EOF - 1;\r | |
1211 | }\r | |
1212 | }\r | |
1213 | \r | |
1214 | private function markupDeclarationOpenState()\r | |
1215 | {\r | |
1216 | /* If the next two characters are both U+002D HYPHEN-MINUS (-)\r | |
1217 | characters, consume those two characters, create a comment token whose\r | |
1218 | data is the empty string, and switch to the comment state. */\r | |
1219 | if ($this->character($this->char + 1, 2) === '--') {\r | |
1220 | $this->char += 2;\r | |
1221 | $this->state = 'comment';\r | |
1222 | $this->token = array(\r | |
1223 | 'data' => null,\r | |
1224 | 'type' => self::COMMENT\r | |
1225 | );\r | |
1226 | \r | |
1227 | /* Otherwise if the next seven chacacters are a case-insensitive match\r | |
1228 | for the word "DOCTYPE", then consume those characters and switch to the\r | |
1229 | DOCTYPE state. */\r | |
1230 | } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {\r | |
1231 | $this->char += 7;\r | |
1232 | $this->state = 'doctype';\r | |
1233 | \r | |
1234 | /* Otherwise, is is a parse error. Switch to the bogus comment state.\r | |
1235 | The next character that is consumed, if any, is the first character\r | |
1236 | that will be in the comment. */\r | |
1237 | } else {\r | |
1238 | $this->char++;\r | |
1239 | $this->state = 'bogusComment';\r | |
1240 | }\r | |
1241 | }\r | |
1242 | \r | |
1243 | private function commentState()\r | |
1244 | {\r | |
1245 | /* Consume the next input character: */\r | |
1246 | $this->char++;\r | |
1247 | $char = $this->char();\r | |
1248 | \r | |
1249 | /* U+002D HYPHEN-MINUS (-) */\r | |
1250 | if ($char === '-') {\r | |
1251 | /* Switch to the comment dash state */\r | |
1252 | $this->state = 'commentDash';\r | |
1253 | \r | |
1254 | /* EOF */\r | |
1255 | } elseif ($this->char === $this->EOF) {\r | |
1256 | /* Parse error. Emit the comment token. Reconsume the EOF character\r | |
1257 | in the data state. */\r | |
1258 | $this->emitToken($this->token);\r | |
1259 | $this->char--;\r | |
1260 | $this->state = 'data';\r | |
1261 | \r | |
1262 | /* Anything else */\r | |
1263 | } else {\r | |
1264 | /* Append the input character to the comment token's data. Stay in\r | |
1265 | the comment state. */\r | |
1266 | $this->token['data'] .= $char;\r | |
1267 | }\r | |
1268 | }\r | |
1269 | \r | |
1270 | private function commentDashState()\r | |
1271 | {\r | |
1272 | /* Consume the next input character: */\r | |
1273 | $this->char++;\r | |
1274 | $char = $this->char();\r | |
1275 | \r | |
1276 | /* U+002D HYPHEN-MINUS (-) */\r | |
1277 | if ($char === '-') {\r | |
1278 | /* Switch to the comment end state */\r | |
1279 | $this->state = 'commentEnd';\r | |
1280 | \r | |
1281 | /* EOF */\r | |
1282 | } elseif ($this->char === $this->EOF) {\r | |
1283 | /* Parse error. Emit the comment token. Reconsume the EOF character\r | |
1284 | in the data state. */\r | |
1285 | $this->emitToken($this->token);\r | |
1286 | $this->char--;\r | |
1287 | $this->state = 'data';\r | |
1288 | \r | |
1289 | /* Anything else */\r | |
1290 | } else {\r | |
1291 | /* Append a U+002D HYPHEN-MINUS (-) character and the input\r | |
1292 | character to the comment token's data. Switch to the comment state. */\r | |
1293 | $this->token['data'] .= '-' . $char;\r | |
1294 | $this->state = 'comment';\r | |
1295 | }\r | |
1296 | }\r | |
1297 | \r | |
1298 | private function commentEndState()\r | |
1299 | {\r | |
1300 | /* Consume the next input character: */\r | |
1301 | $this->char++;\r | |
1302 | $char = $this->char();\r | |
1303 | \r | |
1304 | if ($char === '>') {\r | |
1305 | $this->emitToken($this->token);\r | |
1306 | $this->state = 'data';\r | |
1307 | \r | |
1308 | } elseif ($char === '-') {\r | |
1309 | $this->token['data'] .= '-';\r | |
1310 | \r | |
1311 | } elseif ($this->char === $this->EOF) {\r | |
1312 | $this->emitToken($this->token);\r | |
1313 | $this->char--;\r | |
1314 | $this->state = 'data';\r | |
1315 | \r | |
1316 | } else {\r | |
1317 | $this->token['data'] .= '--' . $char;\r | |
1318 | $this->state = 'comment';\r | |
1319 | }\r | |
1320 | }\r | |
1321 | \r | |
1322 | private function doctypeState()\r | |
1323 | {\r | |
1324 | /* Consume the next input character: */\r | |
1325 | $this->char++;\r | |
1326 | $char = $this->char();\r | |
1327 | \r | |
1328 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1329 | $this->state = 'beforeDoctypeName';\r | |
1330 | \r | |
1331 | } else {\r | |
1332 | $this->char--;\r | |
1333 | $this->state = 'beforeDoctypeName';\r | |
1334 | }\r | |
1335 | }\r | |
1336 | \r | |
1337 | private function beforeDoctypeNameState()\r | |
1338 | {\r | |
1339 | /* Consume the next input character: */\r | |
1340 | $this->char++;\r | |
1341 | $char = $this->char();\r | |
1342 | \r | |
1343 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1344 | // Stay in the before DOCTYPE name state.\r | |
1345 | \r | |
1346 | } elseif (preg_match('/^[a-z]$/', $char)) {\r | |
1347 | $this->token = array(\r | |
1348 | 'name' => strtoupper($char),\r | |
1349 | 'type' => self::DOCTYPE,\r | |
1350 | 'error' => true\r | |
1351 | );\r | |
1352 | \r | |
1353 | $this->state = 'doctypeName';\r | |
1354 | \r | |
1355 | } elseif ($char === '>') {\r | |
1356 | $this->emitToken(\r | |
1357 | array(\r | |
1358 | 'name' => null,\r | |
1359 | 'type' => self::DOCTYPE,\r | |
1360 | 'error' => true\r | |
1361 | )\r | |
1362 | );\r | |
1363 | \r | |
1364 | $this->state = 'data';\r | |
1365 | \r | |
1366 | } elseif ($this->char === $this->EOF) {\r | |
1367 | $this->emitToken(\r | |
1368 | array(\r | |
1369 | 'name' => null,\r | |
1370 | 'type' => self::DOCTYPE,\r | |
1371 | 'error' => true\r | |
1372 | )\r | |
1373 | );\r | |
1374 | \r | |
1375 | $this->char--;\r | |
1376 | $this->state = 'data';\r | |
1377 | \r | |
1378 | } else {\r | |
1379 | $this->token = array(\r | |
1380 | 'name' => $char,\r | |
1381 | 'type' => self::DOCTYPE,\r | |
1382 | 'error' => true\r | |
1383 | );\r | |
1384 | \r | |
1385 | $this->state = 'doctypeName';\r | |
1386 | }\r | |
1387 | }\r | |
1388 | \r | |
1389 | private function doctypeNameState()\r | |
1390 | {\r | |
1391 | /* Consume the next input character: */\r | |
1392 | $this->char++;\r | |
1393 | $char = $this->char();\r | |
1394 | \r | |
1395 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1396 | $this->state = 'AfterDoctypeName';\r | |
1397 | \r | |
1398 | } elseif ($char === '>') {\r | |
1399 | $this->emitToken($this->token);\r | |
1400 | $this->state = 'data';\r | |
1401 | \r | |
1402 | } elseif (preg_match('/^[a-z]$/', $char)) {\r | |
1403 | $this->token['name'] .= strtoupper($char);\r | |
1404 | \r | |
1405 | } elseif ($this->char === $this->EOF) {\r | |
1406 | $this->emitToken($this->token);\r | |
1407 | $this->char--;\r | |
1408 | $this->state = 'data';\r | |
1409 | \r | |
1410 | } else {\r | |
1411 | $this->token['name'] .= $char;\r | |
1412 | }\r | |
1413 | \r | |
1414 | $this->token['error'] = ($this->token['name'] === 'HTML')\r | |
1415 | ? false\r | |
1416 | : true;\r | |
1417 | }\r | |
1418 | \r | |
1419 | private function afterDoctypeNameState()\r | |
1420 | {\r | |
1421 | /* Consume the next input character: */\r | |
1422 | $this->char++;\r | |
1423 | $char = $this->char();\r | |
1424 | \r | |
1425 | if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {\r | |
1426 | // Stay in the DOCTYPE name state.\r | |
1427 | \r | |
1428 | } elseif ($char === '>') {\r | |
1429 | $this->emitToken($this->token);\r | |
1430 | $this->state = 'data';\r | |
1431 | \r | |
1432 | } elseif ($this->char === $this->EOF) {\r | |
1433 | $this->emitToken($this->token);\r | |
1434 | $this->char--;\r | |
1435 | $this->state = 'data';\r | |
1436 | \r | |
1437 | } else {\r | |
1438 | $this->token['error'] = true;\r | |
1439 | $this->state = 'bogusDoctype';\r | |
1440 | }\r | |
1441 | }\r | |
1442 | \r | |
1443 | private function bogusDoctypeState()\r | |
1444 | {\r | |
1445 | /* Consume the next input character: */\r | |
1446 | $this->char++;\r | |
1447 | $char = $this->char();\r | |
1448 | \r | |
1449 | if ($char === '>') {\r | |
1450 | $this->emitToken($this->token);\r | |
1451 | $this->state = 'data';\r | |
1452 | \r | |
1453 | } elseif ($this->char === $this->EOF) {\r | |
1454 | $this->emitToken($this->token);\r | |
1455 | $this->char--;\r | |
1456 | $this->state = 'data';\r | |
1457 | \r | |
1458 | } else {\r | |
1459 | // Stay in the bogus DOCTYPE state.\r | |
1460 | }\r | |
1461 | }\r | |
1462 | \r | |
1463 | private function entity()\r | |
1464 | {\r | |
1465 | $start = $this->char;\r | |
1466 | \r | |
1467 | // This section defines how to consume an entity. This definition is\r | |
1468 | // used when parsing entities in text and in attributes.\r | |
1469 | \r | |
1470 | // The behaviour depends on the identity of the next character (the\r | |
1471 | // one immediately after the U+0026 AMPERSAND character):\r | |
1472 | \r | |
1473 | switch ($this->character($this->char + 1)) {\r | |
1474 | // U+0023 NUMBER SIGN (#)\r | |
1475 | case '#':\r | |
1476 | \r | |
1477 | // The behaviour further depends on the character after the\r | |
1478 | // U+0023 NUMBER SIGN:\r | |
1479 | switch ($this->character($this->char + 1)) {\r | |
1480 | // U+0078 LATIN SMALL LETTER X\r | |
1481 | // U+0058 LATIN CAPITAL LETTER X\r | |
1482 | case 'x':\r | |
1483 | case 'X':\r | |
1484 | // Follow the steps below, but using the range of\r | |
1485 | // characters U+0030 DIGIT ZERO through to U+0039 DIGIT\r | |
1486 | // NINE, U+0061 LATIN SMALL LETTER A through to U+0066\r | |
1487 | // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER\r | |
1488 | // A, through to U+0046 LATIN CAPITAL LETTER F (in other\r | |
1489 | // words, 0-9, A-F, a-f).\r | |
1490 | $char = 1;\r | |
1491 | $char_class = '0-9A-Fa-f';\r | |
1492 | break;\r | |
1493 | \r | |
1494 | // Anything else\r | |
1495 | default:\r | |
1496 | // Follow the steps below, but using the range of\r | |
1497 | // characters U+0030 DIGIT ZERO through to U+0039 DIGIT\r | |
1498 | // NINE (i.e. just 0-9).\r | |
1499 | $char = 0;\r | |
1500 | $char_class = '0-9';\r | |
1501 | break;\r | |
1502 | }\r | |
1503 | \r | |
1504 | // Consume as many characters as match the range of characters\r | |
1505 | // given above.\r | |
1506 | $this->char++;\r | |
1507 | $e_name = $this->characters($char_class, $this->char + $char + 1);\r | |
1508 | $entity = $this->character($start, $this->char);\r | |
1509 | $cond = strlen($e_name) > 0;\r | |
1510 | \r | |
1511 | // The rest of the parsing happens bellow.\r | |
1512 | break;\r | |
1513 | \r | |
1514 | // Anything else\r | |
1515 | default:\r | |
1516 | // Consume the maximum number of characters possible, with the\r | |
1517 | // consumed characters case-sensitively matching one of the\r | |
1518 | // identifiers in the first column of the entities table.\r | |
1519 | $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);\r | |
1520 | $len = strlen($e_name);\r | |
1521 | \r | |
1522 | for ($c = 1; $c <= $len; $c++) {\r | |
1523 | $id = substr($e_name, 0, $c);\r | |
1524 | $this->char++;\r | |
1525 | \r | |
1526 | if (in_array($id, $this->entities)) {\r | |
1527 | if ($e_name[$c - 1] !== ';') {\r | |
1528 | if ($c < $len && $e_name[$c] == ';') {\r | |
1529 | $this->char++; // consume extra semicolon\r | |
1530 | }\r | |
1531 | }\r | |
1532 | $entity = $id;\r | |
1533 | break;\r | |
1534 | }\r | |
1535 | }\r | |
1536 | \r | |
1537 | $cond = isset($entity);\r | |
1538 | // The rest of the parsing happens bellow.\r | |
1539 | break;\r | |
1540 | }\r | |
1541 | \r | |
1542 | if (!$cond) {\r | |
1543 | // If no match can be made, then this is a parse error. No\r | |
1544 | // characters are consumed, and nothing is returned.\r | |
1545 | $this->char = $start;\r | |
1546 | return false;\r | |
1547 | }\r | |
1548 | \r | |
1549 | // Return a character token for the character corresponding to the\r | |
1550 | // entity name (as given by the second column of the entities table).\r | |
1551 | return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');\r | |
1552 | }\r | |
1553 | \r | |
1554 | private function emitToken($token)\r | |
1555 | {\r | |
1556 | $emit = $this->tree->emitToken($token);\r | |
1557 | \r | |
1558 | if (is_int($emit)) {\r | |
1559 | $this->content_model = $emit;\r | |
1560 | \r | |
1561 | } elseif ($token['type'] === self::ENDTAG) {\r | |
1562 | $this->content_model = self::PCDATA;\r | |
1563 | }\r | |
1564 | }\r | |
1565 | \r | |
1566 | private function EOF()\r | |
1567 | {\r | |
1568 | $this->state = null;\r | |
1569 | $this->tree->emitToken(\r | |
1570 | array(\r | |
1571 | 'type' => self::EOF\r | |
1572 | )\r | |
1573 | );\r | |
1574 | }\r | |
1575 | }\r | |
1576 | \r | |
1577 | class HTML5TreeConstructer\r | |
1578 | {\r | |
1579 | public $stack = array();\r | |
1580 | \r | |
1581 | private $phase;\r | |
1582 | private $mode;\r | |
1583 | private $dom;\r | |
1584 | private $foster_parent = null;\r | |
1585 | private $a_formatting = array();\r | |
1586 | \r | |
1587 | private $head_pointer = null;\r | |
1588 | private $form_pointer = null;\r | |
1589 | \r | |
1590 | private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');\r | |
1591 | private $formatting = array(\r | |
1592 | 'a',\r | |
1593 | 'b',\r | |
1594 | 'big',\r | |
1595 | 'em',\r | |
1596 | 'font',\r | |
1597 | 'i',\r | |
1598 | 'nobr',\r | |
1599 | 's',\r | |
1600 | 'small',\r | |
1601 | 'strike',\r | |
1602 | 'strong',\r | |
1603 | 'tt',\r | |
1604 | 'u'\r | |
1605 | );\r | |
1606 | private $special = array(\r | |
1607 | 'address',\r | |
1608 | 'area',\r | |
1609 | 'base',\r | |
1610 | 'basefont',\r | |
1611 | 'bgsound',\r | |
1612 | 'blockquote',\r | |
1613 | 'body',\r | |
1614 | 'br',\r | |
1615 | 'center',\r | |
1616 | 'col',\r | |
1617 | 'colgroup',\r | |
1618 | 'dd',\r | |
1619 | 'dir',\r | |
1620 | 'div',\r | |
1621 | 'dl',\r | |
1622 | 'dt',\r | |
1623 | 'embed',\r | |
1624 | 'fieldset',\r | |
1625 | 'form',\r | |
1626 | 'frame',\r | |
1627 | 'frameset',\r | |
1628 | 'h1',\r | |
1629 | 'h2',\r | |
1630 | 'h3',\r | |
1631 | 'h4',\r | |
1632 | 'h5',\r | |
1633 | 'h6',\r | |
1634 | 'head',\r | |
1635 | 'hr',\r | |
1636 | 'iframe',\r | |
1637 | 'image',\r | |
1638 | 'img',\r | |
1639 | 'input',\r | |
1640 | 'isindex',\r | |
1641 | 'li',\r | |
1642 | 'link',\r | |
1643 | 'listing',\r | |
1644 | 'menu',\r | |
1645 | 'meta',\r | |
1646 | 'noembed',\r | |
1647 | 'noframes',\r | |
1648 | 'noscript',\r | |
1649 | 'ol',\r | |
1650 | 'optgroup',\r | |
1651 | 'option',\r | |
1652 | 'p',\r | |
1653 | 'param',\r | |
1654 | 'plaintext',\r | |
1655 | 'pre',\r | |
1656 | 'script',\r | |
1657 | 'select',\r | |
1658 | 'spacer',\r | |
1659 | 'style',\r | |
1660 | 'tbody',\r | |
1661 | 'textarea',\r | |
1662 | 'tfoot',\r | |
1663 | 'thead',\r | |
1664 | 'title',\r | |
1665 | 'tr',\r | |
1666 | 'ul',\r | |
1667 | 'wbr'\r | |
1668 | );\r | |
1669 | \r | |
1670 | // The different phases.\r | |
1671 | const INIT_PHASE = 0;\r | |
1672 | const ROOT_PHASE = 1;\r | |
1673 | const MAIN_PHASE = 2;\r | |
1674 | const END_PHASE = 3;\r | |
1675 | \r | |
1676 | // The different insertion modes for the main phase.\r | |
1677 | const BEFOR_HEAD = 0;\r | |
1678 | const IN_HEAD = 1;\r | |
1679 | const AFTER_HEAD = 2;\r | |
1680 | const IN_BODY = 3;\r | |
1681 | const IN_TABLE = 4;\r | |
1682 | const IN_CAPTION = 5;\r | |
1683 | const IN_CGROUP = 6;\r | |
1684 | const IN_TBODY = 7;\r | |
1685 | const IN_ROW = 8;\r | |
1686 | const IN_CELL = 9;\r | |
1687 | const IN_SELECT = 10;\r | |
1688 | const AFTER_BODY = 11;\r | |
1689 | const IN_FRAME = 12;\r | |
1690 | const AFTR_FRAME = 13;\r | |
1691 | \r | |
1692 | // The different types of elements.\r | |
1693 | const SPECIAL = 0;\r | |
1694 | const SCOPING = 1;\r | |
1695 | const FORMATTING = 2;\r | |
1696 | const PHRASING = 3;\r | |
1697 | \r | |
1698 | const MARKER = 0;\r | |
1699 | \r | |
1700 | public function __construct()\r | |
1701 | {\r | |
1702 | $this->phase = self::INIT_PHASE;\r | |
1703 | $this->mode = self::BEFOR_HEAD;\r | |
1704 | $this->dom = new DOMDocument;\r | |
1705 | \r | |
1706 | $this->dom->encoding = 'UTF-8';\r | |
1707 | $this->dom->preserveWhiteSpace = true;\r | |
1708 | $this->dom->substituteEntities = true;\r | |
1709 | $this->dom->strictErrorChecking = false;\r | |
1710 | }\r | |
1711 | \r | |
1712 | // Process tag tokens\r | |
1713 | public function emitToken($token)\r | |
1714 | {\r | |
1715 | switch ($this->phase) {\r | |
1716 | case self::INIT_PHASE:\r | |
1717 | return $this->initPhase($token);\r | |
1718 | break;\r | |
1719 | case self::ROOT_PHASE:\r | |
1720 | return $this->rootElementPhase($token);\r | |
1721 | break;\r | |
1722 | case self::MAIN_PHASE:\r | |
1723 | return $this->mainPhase($token);\r | |
1724 | break;\r | |
1725 | case self::END_PHASE :\r | |
1726 | return $this->trailingEndPhase($token);\r | |
1727 | break;\r | |
1728 | }\r | |
1729 | }\r | |
1730 | \r | |
1731 | private function initPhase($token)\r | |
1732 | {\r | |
1733 | /* Initially, the tree construction stage must handle each token\r | |
1734 | emitted from the tokenisation stage as follows: */\r | |
1735 | \r | |
1736 | /* A DOCTYPE token that is marked as being in error\r | |
1737 | A comment token\r | |
1738 | A start tag token\r | |
1739 | An end tag token\r | |
1740 | A character token that is not one of one of U+0009 CHARACTER TABULATION,\r | |
1741 | U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),\r | |
1742 | or U+0020 SPACE\r | |
1743 | An end-of-file token */\r | |
1744 | if ((isset($token['error']) && $token['error']) ||\r | |
1745 | $token['type'] === HTML5::COMMENT ||\r | |
1746 | $token['type'] === HTML5::STARTTAG ||\r | |
1747 | $token['type'] === HTML5::ENDTAG ||\r | |
1748 | $token['type'] === HTML5::EOF ||\r | |
1749 | ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&\r | |
1750 | !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))\r | |
1751 | ) {\r | |
1752 | /* This specification does not define how to handle this case. In\r | |
1753 | particular, user agents may ignore the entirety of this specification\r | |
1754 | altogether for such documents, and instead invoke special parse modes\r | |
1755 | with a greater emphasis on backwards compatibility. */\r | |
1756 | \r | |
1757 | $this->phase = self::ROOT_PHASE;\r | |
1758 | return $this->rootElementPhase($token);\r | |
1759 | \r | |
1760 | /* A DOCTYPE token marked as being correct */\r | |
1761 | } elseif (isset($token['error']) && !$token['error']) {\r | |
1762 | /* Append a DocumentType node to the Document node, with the name\r | |
1763 | attribute set to the name given in the DOCTYPE token (which will be\r | |
1764 | "HTML"), and the other attributes specific to DocumentType objects\r | |
1765 | set to null, empty lists, or the empty string as appropriate. */\r | |
1766 | $doctype = new DOMDocumentType(null, null, 'HTML');\r | |
1767 | \r | |
1768 | /* Then, switch to the root element phase of the tree construction\r | |
1769 | stage. */\r | |
1770 | $this->phase = self::ROOT_PHASE;\r | |
1771 | \r | |
1772 | /* A character token that is one of one of U+0009 CHARACTER TABULATION,\r | |
1773 | U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),\r | |
1774 | or U+0020 SPACE */\r | |
1775 | } elseif (isset($token['data']) && preg_match(\r | |
1776 | '/^[\t\n\x0b\x0c ]+$/',\r | |
1777 | $token['data']\r | |
1778 | )\r | |
1779 | ) {\r | |
1780 | /* Append that character to the Document node. */\r | |
1781 | $text = $this->dom->createTextNode($token['data']);\r | |
1782 | $this->dom->appendChild($text);\r | |
1783 | }\r | |
1784 | }\r | |
1785 | \r | |
1786 | private function rootElementPhase($token)\r | |
1787 | {\r | |
1788 | /* After the initial phase, as each token is emitted from the tokenisation\r | |
1789 | stage, it must be processed as described in this section. */\r | |
1790 | \r | |
1791 | /* A DOCTYPE token */\r | |
1792 | if ($token['type'] === HTML5::DOCTYPE) {\r | |
1793 | // Parse error. Ignore the token.\r | |
1794 | \r | |
1795 | /* A comment token */\r | |
1796 | } elseif ($token['type'] === HTML5::COMMENT) {\r | |
1797 | /* Append a Comment node to the Document object with the data\r | |
1798 | attribute set to the data given in the comment token. */\r | |
1799 | $comment = $this->dom->createComment($token['data']);\r | |
1800 | $this->dom->appendChild($comment);\r | |
1801 | \r | |
1802 | /* A character token that is one of one of U+0009 CHARACTER TABULATION,\r | |
1803 | U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),\r | |
1804 | or U+0020 SPACE */\r | |
1805 | } elseif ($token['type'] === HTML5::CHARACTR &&\r | |
1806 | preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])\r | |
1807 | ) {\r | |
1808 | /* Append that character to the Document node. */\r | |
1809 | $text = $this->dom->createTextNode($token['data']);\r | |
1810 | $this->dom->appendChild($text);\r | |
1811 | \r | |
1812 | /* A character token that is not one of U+0009 CHARACTER TABULATION,\r | |
1813 | U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED\r | |
1814 | (FF), or U+0020 SPACE\r | |
1815 | A start tag token\r | |
1816 | An end tag token\r | |
1817 | An end-of-file token */\r | |
1818 | } elseif (($token['type'] === HTML5::CHARACTR &&\r | |
1819 | !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||\r | |
1820 | $token['type'] === HTML5::STARTTAG ||\r | |
1821 | $token['type'] === HTML5::ENDTAG ||\r | |
1822 | $token['type'] === HTML5::EOF\r | |
1823 | ) {\r | |
1824 | /* Create an HTMLElement node with the tag name html, in the HTML\r | |
1825 | namespace. Append it to the Document object. Switch to the main\r | |
1826 | phase and reprocess the current token. */\r | |
1827 | $html = $this->dom->createElement('html');\r | |
1828 | $this->dom->appendChild($html);\r | |
1829 | $this->stack[] = $html;\r | |
1830 | \r | |
1831 | $this->phase = self::MAIN_PHASE;\r | |
1832 | return $this->mainPhase($token);\r | |
1833 | }\r | |
1834 | }\r | |
1835 | \r | |
/**
 * Main-phase entry point: consumes the tokens that are treated the same way
 * in every insertion mode and forwards all other tokens to the handler of
 * the current insertion mode.
 *
 * @param array $token Token array. 'type' is always read; 'name' and 'attr'
 *                     are read for start tags.
 * @return mixed The value returned by the insertion-mode handler, or null
 *               when the token is consumed here or no case matches the mode.
 */
private function mainPhase($token)
{
    $type = $token['type'];

    /* DOCTYPE token: parse error, the token is dropped. */
    if ($type === HTML5::DOCTYPE) {
        return null;
    }

    /* Start tag named "html": copy over every attribute that the first
    element on the stack of open elements does not carry yet. (If this is
    not the first start tag token it is a parse error, which is not
    reported.) */
    if ($type === HTML5::STARTTAG && $token['name'] === 'html') {
        foreach ($token['attr'] as $attribute) {
            $root = $this->stack[0];

            if ($root->hasAttribute($attribute['name'])) {
                continue;
            }

            $root->setAttribute($attribute['name'], $attribute['value']);
        }

        return null;
    }

    /* End-of-file token: generate implied end tags. */
    if ($type === HTML5::EOF) {
        $this->generateImpliedEndTags();

        return null;
    }

    /* Anything else is routed by insertion mode.
    NOTE(review): the cases are deliberately kept in their original order;
    should two of these constants share a value, the first listed case
    wins - confirm against the constant definitions before reordering or
    converting this into a lookup table. */
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);

        case self::IN_HEAD:
            return $this->inHead($token);

        case self::AFTER_HEAD:
            return $this->afterHead($token);

        case self::IN_BODY:
            return $this->inBody($token);

        case self::IN_TABLE:
            return $this->inTable($token);

        case self::IN_CAPTION:
            return $this->inCaption($token);

        case self::IN_CGROUP:
            return $this->inColumnGroup($token);

        case self::IN_TBODY:
            return $this->inTableBody($token);

        case self::IN_ROW:
            return $this->inRow($token);

        case self::IN_CELL:
            return $this->inCell($token);

        case self::IN_SELECT:
            return $this->inSelect($token);

        case self::AFTER_BODY:
            return $this->afterBody($token);

        case self::IN_FRAME:
            return $this->inFrameset($token);

        case self::AFTR_FRAME:
            return $this->afterFrameset($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }

    /* No handler for this mode: nothing to do. */
    return null;
}
1916 | \r | |
/**
 * Handles a token while the insertion mode is "before head".
 *
 * @param array $token Token array. 'type' is always read; 'data' is read for
 *                     character and comment tokens, 'name' for tags.
 * @return mixed The value returned by inHead() when the token had to be
 *               reprocessed there, otherwise null.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    /* A character token made up solely of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF)
    or U+0020 SPACE: append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);

        return null;
    }

    /* A comment token: append a Comment node holding the token's data to
    the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

        return null;
    }

    /* A start tag token named "head": insert the element, remember it as
    the head element pointer and move on to the "in head" insertion mode. */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;

        return null;
    }

    /* Any other start tag, an end tag named "html", or a character token
    that is not a single whitespace character: a head element is implied. */
    $impliesHead = $type === HTML5::STARTTAG
        || ($type === HTML5::ENDTAG && $token['name'] === 'html')
        || ($type === HTML5::CHARACTR && !preg_match(
            '/^[\t\n\x0b\x0c ]$/',
            $token['data']
        ));

    if ($impliesHead) {
        /* Act as if a start tag named "head" without attributes had been
        seen, then reprocess the current token in the "in head" mode. */
        $impliedHead = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($impliedHead);

        return $this->inHead($token);
    }

    /* Any other end tag is a parse error and is ignored, as is every token
    type not covered above. */
    return null;
}
1977 | \r | |
/**
 * Handles a token while the insertion mode is "in head". Also called from
 * other insertion modes to process head-level elements.
 *
 * Compared with the previous revision this version
 *  - no longer dereferences a null head element pointer for "script" start
 *    tags, "head" end tags and the "anything else" case (innerHTML case);
 *  - pops "base", "link" and "meta" elements off the stack of open elements
 *    in both branches, since insertElement() pushes the element either way.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read for
 *                     character and comment tokens, 'name' for tags.
 * @return mixed One of HTML5::PCDATA, HTML5::RCDATA or HTML5::CDATA when the
 *               tokeniser's content model flag has to change, the value
 *               returned by afterHead() when the token is reprocessed there,
 *               otherwise null.
 */
private function inHead($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if (($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
            $token['type'] === HTML5::CHARACTR && in_array(
                end($this->stack)->nodeName,
                array('title', 'style', 'script'),
                true
            ))
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* An end tag with the tag name "title", "style" or "script" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)
    ) {
        /* The element is finished: pop it and hand the tokeniser back to
        the PCDATA content model. */
        array_pop($this->stack);
        return HTML5::PCDATA;

        /* A start tag with the tag name "title" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

        /* A start tag with the tag name "style" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

        /* A start tag with the tag name "script" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token. As for "title" and "style",
        fall back to the current node when there is no head element
        pointer instead of calling a method on null. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

        /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
            $token['name'],
            array('base', 'link', 'meta'),
            true
        )
    ) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* These elements have no content, so the element must not stay on
        the stack of open elements whichever branch inserted it. */
        array_pop($this->stack);

        /* An end tag with the tag name "head" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error, which
        is not reported. Without a head element pointer the current node
        cannot be the head element. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

        /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        // Parse error. Ignore the token.

        /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            $this->inHead(
                array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                )
            );

            /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
2112 | \r | |
/**
 * Handles a token while the insertion mode is "after head".
 *
 * @param array $token Token array. 'type' is always read; 'data' is read for
 *                     character and comment tokens, 'name' for start tags.
 * @return mixed The value returned by inHead() or inBody() when the token is
 *               reprocessed there, otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];

    /* A character token made up solely of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF)
    or U+0020 SPACE: append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);

        return null;
    }

    /* A comment token: append a Comment node holding the token's data to
    the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);

        return null;
    }

    if ($type === HTML5::STARTTAG) {
        $tagName = $token['name'];

        /* <body>: insert the element and switch to "in body". */
        if ($tagName === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;

            return null;
        }

        /* <frameset>: insert the element and switch to "in frameset". */
        if ($tagName === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;

            return null;
        }

        /* "base", "link", "meta", "script", "style" or "title": parse
        error. Go back to the "in head" insertion mode and reprocess the
        token there. */
        if (in_array($tagName, array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            $this->mode = self::IN_HEAD;

            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag named "body" without attributes
    had been seen, then reprocess the current token in the "in body" mode. */
    $impliedBody = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($impliedBody);

    return $this->inBody($token);
}
2175 | \r | |
2176 | private function inBody($token)\r | |
2177 | {\r | |
2178 | /* Handle the token as follows: */\r | |
2179 | \r | |
2180 | switch ($token['type']) {\r | |
2181 | /* A character token */\r | |
2182 | case HTML5::CHARACTR:\r | |
2183 | /* Reconstruct the active formatting elements, if any. */\r | |
2184 | $this->reconstructActiveFormattingElements();\r | |
2185 | \r | |
2186 | /* Append the token's character to the current node. */\r | |
2187 | $this->insertText($token['data']);\r | |
2188 | break;\r | |
2189 | \r | |
2190 | /* A comment token */\r | |
2191 | case HTML5::COMMENT:\r | |
2192 | /* Append a Comment node to the current node with the data\r | |
2193 | attribute set to the data given in the comment token. */\r | |
2194 | $this->insertComment($token['data']);\r | |
2195 | break;\r | |
2196 | \r | |
2197 | case HTML5::STARTTAG:\r | |
2198 | switch ($token['name']) {\r | |
2199 | /* A start tag token whose tag name is one of: "script",\r | |
2200 | "style" */\r | |
2201 | case 'script':\r | |
2202 | case 'style':\r | |
2203 | /* Process the token as if the insertion mode had been "in\r | |
2204 | head". */\r | |
2205 | return $this->inHead($token);\r | |
2206 | break;\r | |
2207 | \r | |
2208 | /* A start tag token whose tag name is one of: "base", "link",\r | |
2209 | "meta", "title" */\r | |
2210 | case 'base':\r | |
2211 | case 'link':\r | |
2212 | case 'meta':\r | |
2213 | case 'title':\r | |
2214 | /* Parse error. Process the token as if the insertion mode\r | |
2215 | had been "in head". */\r | |
2216 | return $this->inHead($token);\r | |
2217 | break;\r | |
2218 | \r | |
2219 | /* A start tag token with the tag name "body" */\r | |
2220 | case 'body':\r | |
2221 | /* Parse error. If the second element on the stack of open\r | |
2222 | elements is not a body element, or, if the stack of open\r | |
2223 | elements has only one node on it, then ignore the token.\r | |
2224 | (innerHTML case) */\r | |
2225 | if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {\r | |
2226 | // Ignore\r | |
2227 | \r | |
2228 | /* Otherwise, for each attribute on the token, check to see\r | |
2229 | if the attribute is already present on the body element (the\r | |
2230 | second element) on the stack of open elements. If it is not,\r | |
2231 | add the attribute and its corresponding value to that\r | |
2232 | element. */\r | |
2233 | } else {\r | |
2234 | foreach ($token['attr'] as $attr) {\r | |
2235 | if (!$this->stack[1]->hasAttribute($attr['name'])) {\r | |
2236 | $this->stack[1]->setAttribute($attr['name'], $attr['value']);\r | |
2237 | }\r | |
2238 | }\r | |
2239 | }\r | |
2240 | break;\r | |
2241 | \r | |
2242 | /* A start tag whose tag name is one of: "address",\r | |
2243 | "blockquote", "center", "dir", "div", "dl", "fieldset",\r | |
2244 | "listing", "menu", "ol", "p", "ul" */\r | |
2245 | case 'address':\r | |
2246 | case 'blockquote':\r | |
2247 | case 'center':\r | |
2248 | case 'dir':\r | |
2249 | case 'div':\r | |
2250 | case 'dl':\r | |
2251 | case 'fieldset':\r | |
2252 | case 'listing':\r | |
2253 | case 'menu':\r | |
2254 | case 'ol':\r | |
2255 | case 'p':\r | |
2256 | case 'ul':\r | |
2257 | /* If the stack of open elements has a p element in scope,\r | |
2258 | then act as if an end tag with the tag name p had been\r | |
2259 | seen. */\r | |
2260 | if ($this->elementInScope('p')) {\r | |
2261 | $this->emitToken(\r | |
2262 | array(\r | |
2263 | 'name' => 'p',\r | |
2264 | 'type' => HTML5::ENDTAG\r | |
2265 | )\r | |
2266 | );\r | |
2267 | }\r | |
2268 | \r | |
2269 | /* Insert an HTML element for the token. */\r | |
2270 | $this->insertElement($token);\r | |
2271 | break;\r | |
2272 | \r | |
2273 | /* A start tag whose tag name is "form" */\r | |
2274 | case 'form':\r | |
2275 | /* If the form element pointer is not null, ignore the\r | |
2276 | token with a parse error. */\r | |
2277 | if ($this->form_pointer !== null) {\r | |
2278 | // Ignore.\r | |
2279 | \r | |
2280 | /* Otherwise: */\r | |
2281 | } else {\r | |
2282 | /* If the stack of open elements has a p element in\r | |
2283 | scope, then act as if an end tag with the tag name p\r | |
2284 | had been seen. */\r | |
2285 | if ($this->elementInScope('p')) {\r | |
2286 | $this->emitToken(\r | |
2287 | array(\r | |
2288 | 'name' => 'p',\r | |
2289 | 'type' => HTML5::ENDTAG\r | |
2290 | )\r | |
2291 | );\r | |
2292 | }\r | |
2293 | \r | |
2294 | /* Insert an HTML element for the token, and set the\r | |
2295 | form element pointer to point to the element created. */\r | |
2296 | $element = $this->insertElement($token);\r | |
2297 | $this->form_pointer = $element;\r | |
2298 | }\r | |
2299 | break;\r | |
2300 | \r | |
2301 | /* A start tag whose tag name is "li", "dd" or "dt" */\r | |
2302 | case 'li':\r | |
2303 | case 'dd':\r | |
2304 | case 'dt':\r | |
2305 | /* If the stack of open elements has a p element in scope,\r | |
2306 | then act as if an end tag with the tag name p had been\r | |
2307 | seen. */\r | |
2308 | if ($this->elementInScope('p')) {\r | |
2309 | $this->emitToken(\r | |
2310 | array(\r | |
2311 | 'name' => 'p',\r | |
2312 | 'type' => HTML5::ENDTAG\r | |
2313 | )\r | |
2314 | );\r | |
2315 | }\r | |
2316 | \r | |
2317 | $stack_length = count($this->stack) - 1;\r | |
2318 | \r | |
2319 | for ($n = $stack_length; 0 <= $n; $n--) {\r | |
2320 | /* 1. Initialise node to be the current node (the\r | |
2321 | bottommost node of the stack). */\r | |
2322 | $stop = false;\r | |
2323 | $node = $this->stack[$n];\r | |
2324 | $cat = $this->getElementCategory($node->tagName);\r | |
2325 | \r | |
2326 | /* 2. If node is an li, dd or dt element, then pop all\r | |
2327 | the nodes from the current node up to node, including\r | |
2328 | node, then stop this algorithm. */\r | |
2329 | if ($token['name'] === $node->tagName || ($token['name'] !== 'li'\r | |
2330 | && ($node->tagName === 'dd' || $node->tagName === 'dt'))\r | |
2331 | ) {\r | |
2332 | for ($x = $stack_length; $x >= $n; $x--) {\r | |
2333 | array_pop($this->stack);\r | |
2334 | }\r | |
2335 | \r | |
2336 | break;\r | |
2337 | }\r | |
2338 | \r | |
2339 | /* 3. If node is not in the formatting category, and is\r | |
2340 | not in the phrasing category, and is not an address or\r | |
2341 | div element, then stop this algorithm. */\r | |
2342 | if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&\r | |
2343 | $node->tagName !== 'address' && $node->tagName !== 'div'\r | |
2344 | ) {\r | |
2345 | break;\r | |
2346 | }\r | |
2347 | }\r | |
2348 | \r | |
2349 | /* Finally, insert an HTML element with the same tag\r | |
2350 | name as the token's. */\r | |
2351 | $this->insertElement($token);\r | |
2352 | break;\r | |
2353 | \r | |
2354 | /* A start tag token whose tag name is "plaintext" */\r | |
2355 | case 'plaintext':\r | |
2356 | /* If the stack of open elements has a p element in scope,\r | |
2357 | then act as if an end tag with the tag name p had been\r | |
2358 | seen. */\r | |
2359 | if ($this->elementInScope('p')) {\r | |
2360 | $this->emitToken(\r | |
2361 | array(\r | |
2362 | 'name' => 'p',\r | |
2363 | 'type' => HTML5::ENDTAG\r | |
2364 | )\r | |
2365 | );\r | |
2366 | }\r | |
2367 | \r | |
2368 | /* Insert an HTML element for the token. */\r | |
2369 | $this->insertElement($token);\r | |
2370 | \r | |
2371 | return HTML5::PLAINTEXT;\r | |
2372 | break;\r | |
2373 | \r | |
2374 | /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",\r | |
2375 | "h5", "h6" */\r | |
2376 | case 'h1':\r | |
2377 | case 'h2':\r | |
2378 | case 'h3':\r | |
2379 | case 'h4':\r | |
2380 | case 'h5':\r | |
2381 | case 'h6':\r | |
2382 | /* If the stack of open elements has a p element in scope,\r | |
2383 | then act as if an end tag with the tag name p had been seen. */\r | |
2384 | if ($this->elementInScope('p')) {\r | |
2385 | $this->emitToken(\r | |
2386 | array(\r | |
2387 | 'name' => 'p',\r | |
2388 | 'type' => HTML5::ENDTAG\r | |
2389 | )\r | |
2390 | );\r | |
2391 | }\r | |
2392 | \r | |
2393 | /* If the stack of open elements has in scope an element whose\r | |
2394 | tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then\r | |
2395 | this is a parse error; pop elements from the stack until an\r | |
2396 | element with one of those tag names has been popped from the\r | |
2397 | stack. */\r | |
2398 | while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {\r | |
2399 | array_pop($this->stack);\r | |
2400 | }\r | |
2401 | \r | |
2402 | /* Insert an HTML element for the token. */\r | |
2403 | $this->insertElement($token);\r | |
2404 | break;\r | |
2405 | \r | |
2406 | /* A start tag whose tag name is "a" */\r | |
2407 | case 'a':\r | |
2408 | /* If the list of active formatting elements contains\r | |
2409 | an element whose tag name is "a" between the end of the\r | |
2410 | list and the last marker on the list (or the start of\r | |
2411 | the list if there is no marker on the list), then this\r | |
2412 | is a parse error; act as if an end tag with the tag name\r | |
2413 | "a" had been seen, then remove that element from the list\r | |
2414 | of active formatting elements and the stack of open\r | |
2415 | elements if the end tag didn't already remove it (it\r | |
2416 | might not have if the element is not in table scope). */\r | |
2417 | $leng = count($this->a_formatting);\r | |
2418 | \r | |
2419 | for ($n = $leng - 1; $n >= 0; $n--) {\r | |
2420 | if ($this->a_formatting[$n] === self::MARKER) {\r | |
2421 | break;\r | |
2422 | \r | |
2423 | } elseif ($this->a_formatting[$n]->nodeName === 'a') {\r | |
2424 | $this->emitToken(\r | |
2425 | array(\r | |
2426 | 'name' => 'a',\r | |
2427 | 'type' => HTML5::ENDTAG\r | |
2428 | )\r | |
2429 | );\r | |
2430 | break;\r | |
2431 | }\r | |
2432 | }\r | |
2433 | \r | |
2434 | /* Reconstruct the active formatting elements, if any. */\r | |
2435 | $this->reconstructActiveFormattingElements();\r | |
2436 | \r | |
2437 | /* Insert an HTML element for the token. */\r | |
2438 | $el = $this->insertElement($token);\r | |
2439 | \r | |
2440 | /* Add that element to the list of active formatting\r | |
2441 | elements. */\r | |
2442 | $this->a_formatting[] = $el;\r | |
2443 | break;\r | |
2444 | \r | |
2445 | /* A start tag whose tag name is one of: "b", "big", "em", "font",\r | |
2446 | "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */\r | |
2447 | case 'b':\r | |
2448 | case 'big':\r | |
2449 | case 'em':\r | |
2450 | case 'font':\r | |
2451 | case 'i':\r | |
2452 | case 'nobr':\r | |
2453 | case 's':\r | |
2454 | case 'small':\r | |
2455 | case 'strike':\r | |
2456 | case 'strong':\r | |
2457 | case 'tt':\r | |
2458 | case 'u':\r | |
2459 | /* Reconstruct the active formatting elements, if any. */\r | |
2460 | $this->reconstructActiveFormattingElements();\r | |
2461 | \r | |
2462 | /* Insert an HTML element for the token. */\r | |
2463 | $el = $this->insertElement($token);\r | |
2464 | \r | |
2465 | /* Add that element to the list of active formatting\r | |
2466 | elements. */\r | |
2467 | $this->a_formatting[] = $el;\r | |
2468 | break;\r | |
2469 | \r | |
2470 | /* A start tag token whose tag name is "button" */\r | |
2471 | case 'button':\r | |
2472 | /* If the stack of open elements has a button element in scope,\r | |
2473 | then this is a parse error; act as if an end tag with the tag\r | |
2474 | name "button" had been seen, then reprocess the token. (We don't\r | |
2475 | do that. Unnecessary.) */\r | |
2476 | if ($this->elementInScope('button')) {\r | |
2477 | $this->inBody(\r | |
2478 | array(\r | |
2479 | 'name' => 'button',\r | |
2480 | 'type' => HTML5::ENDTAG\r | |
2481 | )\r | |
2482 | );\r | |
2483 | }\r | |
2484 | \r | |
2485 | /* Reconstruct the active formatting elements, if any. */\r | |
2486 | $this->reconstructActiveFormattingElements();\r | |
2487 | \r | |
2488 | /* Insert an HTML element for the token. */\r | |
2489 | $this->insertElement($token);\r | |
2490 | \r | |
2491 | /* Insert a marker at the end of the list of active\r | |
2492 | formatting elements. */\r | |
2493 | $this->a_formatting[] = self::MARKER;\r | |
2494 | break;\r | |
2495 | \r | |
2496 | /* A start tag token whose tag name is one of: "marquee", "object" */\r | |
2497 | case 'marquee':\r | |
2498 | case 'object':\r | |
2499 | /* Reconstruct the active formatting elements, if any. */\r | |
2500 | $this->reconstructActiveFormattingElements();\r | |
2501 | \r | |
2502 | /* Insert an HTML element for the token. */\r | |
2503 | $this->insertElement($token);\r | |
2504 | \r | |
2505 | /* Insert a marker at the end of the list of active\r | |
2506 | formatting elements. */\r | |
2507 | $this->a_formatting[] = self::MARKER;\r | |
2508 | break;\r | |
2509 | \r | |
2510 | /* A start tag token whose tag name is "xmp" */\r | |
2511 | case 'xmp':\r | |
2512 | /* Reconstruct the active formatting elements, if any. */\r | |
2513 | $this->reconstructActiveFormattingElements();\r | |
2514 | \r | |
2515 | /* Insert an HTML element for the token. */\r | |
2516 | $this->insertElement($token);\r | |
2517 | \r | |
2518 | /* Switch the content model flag to the CDATA state. */\r | |
2519 | return HTML5::CDATA;\r | |
2520 | break;\r | |
2521 | \r | |
2522 | /* A start tag whose tag name is "table" */\r | |
2523 | case 'table':\r | |
2524 | /* If the stack of open elements has a p element in scope,\r | |
2525 | then act as if an end tag with the tag name p had been seen. */\r | |
2526 | if ($this->elementInScope('p')) {\r | |
2527 | $this->emitToken(\r | |
2528 | array(\r | |
2529 | 'name' => 'p',\r | |
2530 | 'type' => HTML5::ENDTAG\r | |
2531 | )\r | |
2532 | );\r | |
2533 | }\r | |
2534 | \r | |
2535 | /* Insert an HTML element for the token. */\r | |
2536 | $this->insertElement($token);\r | |
2537 | \r | |
2538 | /* Change the insertion mode to "in table". */\r | |
2539 | $this->mode = self::IN_TABLE;\r | |
2540 | break;\r | |
2541 | \r | |
2542 | /* A start tag whose tag name is one of: "area", "basefont",\r | |
2543 | "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */\r | |
2544 | case 'area':\r | |
2545 | case 'basefont':\r | |
2546 | case 'bgsound':\r | |
2547 | case 'br':\r | |
2548 | case 'embed':\r | |
2549 | case 'img':\r | |
2550 | case 'param':\r | |
2551 | case 'spacer':\r | |
2552 | case 'wbr':\r | |
2553 | /* Reconstruct the active formatting elements, if any. */\r | |
2554 | $this->reconstructActiveFormattingElements();\r | |
2555 | \r | |
2556 | /* Insert an HTML element for the token. */\r | |
2557 | $this->insertElement($token);\r | |
2558 | \r | |
2559 | /* Immediately pop the current node off the stack of open elements. */\r | |
2560 | array_pop($this->stack);\r | |
2561 | break;\r | |
2562 | \r | |
2563 | /* A start tag whose tag name is "hr" */\r | |
2564 | case 'hr':\r | |
2565 | /* If the stack of open elements has a p element in scope,\r | |
2566 | then act as if an end tag with the tag name p had been seen. */\r | |
2567 | if ($this->elementInScope('p')) {\r | |
2568 | $this->emitToken(\r | |
2569 | array(\r | |
2570 | 'name' => 'p',\r | |
2571 | 'type' => HTML5::ENDTAG\r | |
2572 | )\r | |
2573 | );\r | |
2574 | }\r | |
2575 | \r | |
2576 | /* Insert an HTML element for the token. */\r | |
2577 | $this->insertElement($token);\r | |
2578 | \r | |
2579 | /* Immediately pop the current node off the stack of open elements. */\r | |
2580 | array_pop($this->stack);\r | |
2581 | break;\r | |
2582 | \r | |
2583 | /* A start tag whose tag name is "image" */\r | |
2584 | case 'image':\r | |
2585 | /* Parse error. Change the token's tag name to "img" and\r | |
2586 | reprocess it. (Don't ask.) */\r | |
2587 | $token['name'] = 'img';\r | |
2588 | return $this->inBody($token);\r | |
2589 | break;\r | |
2590 | \r | |
2591 | /* A start tag whose tag name is "input" */\r | |
2592 | case 'input':\r | |
2593 | /* Reconstruct the active formatting elements, if any. */\r | |
2594 | $this->reconstructActiveFormattingElements();\r | |
2595 | \r | |
2596 | /* Insert an input element for the token. */\r | |
2597 | $element = $this->insertElement($token, false);\r | |
2598 | \r | |
2599 | /* If the form element pointer is not null, then associate the\r | |
2600 | input element with the form element pointed to by the form\r | |
2601 | element pointer. */\r | |
2602 | $this->form_pointer !== null\r | |
2603 | ? $this->form_pointer->appendChild($element)\r | |
2604 | : end($this->stack)->appendChild($element);\r | |
2605 | \r | |
2606 | /* Pop that input element off the stack of open elements. */\r | |
2607 | array_pop($this->stack);\r | |
2608 | break;\r | |
2609 | \r | |
2610 | /* A start tag whose tag name is "isindex" */\r | |
2611 | case 'isindex':\r | |
2612 | /* Parse error. */\r | |
2613 | // w/e\r | |
2614 | \r | |
2615 | /* If the form element pointer is not null,\r | |
2616 | then ignore the token. */\r | |
2617 | if ($this->form_pointer === null) {\r | |
2618 | /* Act as if a start tag token with the tag name "form" had\r | |
2619 | been seen. */\r | |
2620 | $this->inBody(\r | |
2621 | array(\r | |
2622 | 'name' => 'body',\r | |
2623 | 'type' => HTML5::STARTTAG,\r | |
2624 | 'attr' => array()\r | |
2625 | )\r | |
2626 | );\r | |
2627 | \r | |
2628 | /* Act as if a start tag token with the tag name "hr" had\r | |
2629 | been seen. */\r | |
2630 | $this->inBody(\r | |
2631 | array(\r | |
2632 | 'name' => 'hr',\r | |
2633 | 'type' => HTML5::STARTTAG,\r | |
2634 | 'attr' => array()\r | |
2635 | )\r | |
2636 | );\r | |
2637 | \r | |
2638 | /* Act as if a start tag token with the tag name "p" had\r | |
2639 | been seen. */\r | |
2640 | $this->inBody(\r | |
2641 | array(\r | |
2642 | 'name' => 'p',\r | |
2643 | 'type' => HTML5::STARTTAG,\r | |
2644 | 'attr' => array()\r | |
2645 | )\r | |
2646 | );\r | |
2647 | \r | |
2648 | /* Act as if a start tag token with the tag name "label"\r | |
2649 | had been seen. */\r | |
2650 | $this->inBody(\r | |
2651 | array(\r | |
2652 | 'name' => 'label',\r | |
2653 | 'type' => HTML5::STARTTAG,\r | |
2654 | 'attr' => array()\r | |
2655 | )\r | |
2656 | );\r | |
2657 | \r | |
2658 | /* Act as if a stream of character tokens had been seen. */\r | |
2659 | $this->insertText(\r | |
2660 | 'This is a searchable index. ' .\r | |
2661 | 'Insert your search keywords here: '\r | |
2662 | );\r | |
2663 | \r | |
2664 | /* Act as if a start tag token with the tag name "input"\r | |
2665 | had been seen, with all the attributes from the "isindex"\r | |
2666 | token, except with the "name" attribute set to the value\r | |
2667 | "isindex" (ignoring any explicit "name" attribute). */\r | |
2668 | $attr = $token['attr'];\r | |
2669 | $attr[] = array('name' => 'name', 'value' => 'isindex');\r | |
2670 | \r | |
2671 | $this->inBody(\r | |
2672 | array(\r | |
2673 | 'name' => 'input',\r | |
2674 | 'type' => HTML5::STARTTAG,\r | |
2675 | 'attr' => $attr\r | |
2676 | )\r | |
2677 | );\r | |
2678 | \r | |
2679 | /* Act as if a stream of character tokens had been seen\r | |
2680 | (see below for what they should say). */\r | |
2681 | $this->insertText(\r | |
2682 | 'This is a searchable index. ' .\r | |
2683 | 'Insert your search keywords here: '\r | |
2684 | );\r | |
2685 | \r | |
2686 | /* Act as if an end tag token with the tag name "label"\r | |
2687 | had been seen. */\r | |
2688 | $this->inBody(\r | |
2689 | array(\r | |
2690 | 'name' => 'label',\r | |
2691 | 'type' => HTML5::ENDTAG\r | |
2692 | )\r | |
2693 | );\r | |
2694 | \r | |
2695 | /* Act as if an end tag token with the tag name "p" had\r | |
2696 | been seen. */\r | |
2697 | $this->inBody(\r | |
2698 | array(\r | |
2699 | 'name' => 'p',\r | |
2700 | 'type' => HTML5::ENDTAG\r | |
2701 | )\r | |
2702 | );\r | |
2703 | \r | |
2704 | /* Act as if a start tag token with the tag name "hr" had\r | |
2705 | been seen. */\r | |
2706 | $this->inBody(\r | |
2707 | array(\r | |
2708 | 'name' => 'hr',\r | |
2709 | 'type' => HTML5::ENDTAG\r | |
2710 | )\r | |
2711 | );\r | |
2712 | \r | |
2713 | /* Act as if an end tag token with the tag name "form" had\r | |
2714 | been seen. */\r | |
2715 | $this->inBody(\r | |
2716 | array(\r | |
2717 | 'name' => 'form',\r | |
2718 | 'type' => HTML5::ENDTAG\r | |
2719 | )\r | |
2720 | );\r | |
2721 | }\r | |
2722 | break;\r | |
2723 | \r | |
2724 | /* A start tag whose tag name is "textarea" */\r | |
2725 | case 'textarea':\r | |
2726 | $this->insertElement($token);\r | |
2727 | \r | |
2728 | /* Switch the tokeniser's content model flag to the\r | |
2729 | RCDATA state. */\r | |
2730 | return HTML5::RCDATA;\r | |
2731 | break;\r | |
2732 | \r | |
2733 | /* A start tag whose tag name is one of: "iframe", "noembed",\r | |
2734 | "noframes" */\r | |
2735 | case 'iframe':\r | |
2736 | case 'noembed':\r | |
2737 | case 'noframes':\r | |
2738 | $this->insertElement($token);\r | |
2739 | \r | |
2740 | /* Switch the tokeniser's content model flag to the CDATA state. */\r | |
2741 | return HTML5::CDATA;\r | |
2742 | break;\r | |
2743 | \r | |
2744 | /* A start tag whose tag name is "select" */\r | |
2745 | case 'select':\r | |
2746 | /* Reconstruct the active formatting elements, if any. */\r | |
2747 | $this->reconstructActiveFormattingElements();\r | |
2748 | \r | |
2749 | /* Insert an HTML element for the token. */\r | |
2750 | $this->insertElement($token);\r | |
2751 | \r | |
2752 | /* Change the insertion mode to "in select". */\r | |
2753 | $this->mode = self::IN_SELECT;\r | |
2754 | break;\r | |
2755 | \r | |
2756 | /* A start or end tag whose tag name is one of: "caption", "col",\r | |
2757 | "colgroup", "frame", "frameset", "head", "option", "optgroup",\r | |
2758 | "tbody", "td", "tfoot", "th", "thead", "tr". */\r | |
2759 | case 'caption':\r | |
2760 | case 'col':\r | |
2761 | case 'colgroup':\r | |
2762 | case 'frame':\r | |
2763 | case 'frameset':\r | |
2764 | case 'head':\r | |
2765 | case 'option':\r | |
2766 | case 'optgroup':\r | |
2767 | case 'tbody':\r | |
2768 | case 'td':\r | |
2769 | case 'tfoot':\r | |
2770 | case 'th':\r | |
2771 | case 'thead':\r | |
2772 | case 'tr':\r | |
2773 | // Parse error. Ignore the token.\r | |
2774 | break;\r | |
2775 | \r | |
2776 | /* A start or end tag whose tag name is one of: "event-source",\r | |
2777 | "section", "nav", "article", "aside", "header", "footer",\r | |
2778 | "datagrid", "command" */\r | |
2779 | case 'event-source':\r | |
2780 | case 'section':\r | |
2781 | case 'nav':\r | |
2782 | case 'article':\r | |
2783 | case 'aside':\r | |
2784 | case 'header':\r | |
2785 | case 'footer':\r | |
2786 | case 'datagrid':\r | |
2787 | case 'command':\r | |
2788 | // Work in progress!\r | |
2789 | break;\r | |
2790 | \r | |
2791 | /* A start tag token not covered by the previous entries */\r | |
2792 | default:\r | |
2793 | /* Reconstruct the active formatting elements, if any. */\r | |
2794 | $this->reconstructActiveFormattingElements();\r | |
2795 | \r | |
2796 | $this->insertElement($token, true, true);\r | |
2797 | break;\r | |
2798 | }\r | |
2799 | break;\r | |
2800 | \r | |
2801 | case HTML5::ENDTAG:\r | |
2802 | switch ($token['name']) {\r | |
2803 | /* An end tag with the tag name "body" */\r | |
2804 | case 'body':\r | |
2805 | /* If the second element in the stack of open elements is\r | |
2806 | not a body element, this is a parse error. Ignore the token.\r | |
2807 | (innerHTML case) */\r | |
2808 | if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {\r | |
2809 | // Ignore.\r | |
2810 | \r | |
2811 | /* If the current node is not the body element, then this\r | |
2812 | is a parse error. */\r | |
2813 | } elseif (end($this->stack)->nodeName !== 'body') {\r | |
2814 | // Parse error.\r | |
2815 | }\r | |
2816 | \r | |
2817 | /* Change the insertion mode to "after body". */\r | |
2818 | $this->mode = self::AFTER_BODY;\r | |
2819 | break;\r | |
2820 | \r | |
2821 | /* An end tag with the tag name "html" */\r | |
2822 | case 'html':\r | |
2823 | /* Act as if an end tag with tag name "body" had been seen,\r | |
2824 | then, if that token wasn't ignored, reprocess the current\r | |
2825 | token. */\r | |
2826 | $this->inBody(\r | |
2827 | array(\r | |
2828 | 'name' => 'body',\r | |
2829 | 'type' => HTML5::ENDTAG\r | |
2830 | )\r | |
2831 | );\r | |
2832 | \r | |
2833 | return $this->afterBody($token);\r | |
2834 | break;\r | |
2835 | \r | |
2836 | /* An end tag whose tag name is one of: "address", "blockquote",\r | |
2837 | "center", "dir", "div", "dl", "fieldset", "listing", "menu",\r | |
2838 | "ol", "pre", "ul" */\r | |
2839 | case 'address':\r | |
2840 | case 'blockquote':\r | |
2841 | case 'center':\r | |
2842 | case 'dir':\r | |
2843 | case 'div':\r | |
2844 | case 'dl':\r | |
2845 | case 'fieldset':\r | |
2846 | case 'listing':\r | |
2847 | case 'menu':\r | |
2848 | case 'ol':\r | |
2849 | case 'pre':\r | |
2850 | case 'ul':\r | |
2851 | /* If the stack of open elements has an element in scope\r | |
2852 | with the same tag name as that of the token, then generate\r | |
2853 | implied end tags. */\r | |
2854 | if ($this->elementInScope($token['name'])) {\r | |
2855 | $this->generateImpliedEndTags();\r | |
2856 | \r | |
2857 | /* Now, if the current node is not an element with\r | |
2858 | the same tag name as that of the token, then this\r | |
2859 | is a parse error. */\r | |
2860 | // w/e\r | |
2861 | \r | |
2862 | /* If the stack of open elements has an element in\r | |
2863 | scope with the same tag name as that of the token,\r | |
2864 | then pop elements from this stack until an element\r | |
2865 | with that tag name has been popped from the stack. */\r | |
2866 | for ($n = count($this->stack) - 1; $n >= 0; $n--) {\r | |
2867 | if ($this->stack[$n]->nodeName === $token['name']) {\r | |
2868 | $n = -1;\r | |
2869 | }\r | |
2870 | \r | |
2871 | array_pop($this->stack);\r | |
2872 | }\r | |
2873 | }\r | |
2874 | break;\r | |
2875 | \r | |
2876 | /* An end tag whose tag name is "form" */\r | |
2877 | case 'form':\r | |
2878 | /* If the stack of open elements has an element in scope\r | |
2879 | with the same tag name as that of the token, then generate\r | |
2880 | implied end tags. */\r | |
2881 | if ($this->elementInScope($token['name'])) {\r | |
2882 | $this->generateImpliedEndTags();\r | |
2883 | \r | |
2884 | }\r | |
2885 | \r | |
2886 | if (end($this->stack)->nodeName !== $token['name']) {\r | |
2887 | /* Now, if the current node is not an element with the\r | |
2888 | same tag name as that of the token, then this is a parse\r | |
2889 | error. */\r | |
2890 | // w/e\r | |
2891 | \r | |
2892 | } else {\r | |
2893 | /* Otherwise, if the current node is an element with\r | |
2894 | the same tag name as that of the token pop that element\r | |
2895 | from the stack. */\r | |
2896 | array_pop($this->stack);\r | |
2897 | }\r | |
2898 | \r | |
2899 | /* In any case, set the form element pointer to null. */\r | |
2900 | $this->form_pointer = null;\r | |
2901 | break;\r | |
2902 | \r | |
2903 | /* An end tag whose tag name is "p" */\r | |
2904 | case 'p':\r | |
2905 | /* If the stack of open elements has a p element in scope,\r | |
2906 | then generate implied end tags, except for p elements. */\r | |
2907 | if ($this->elementInScope('p')) {\r | |
2908 | $this->generateImpliedEndTags(array('p'));\r | |
2909 | \r | |
2910 | /* If the current node is not a p element, then this is\r | |
2911 | a parse error. */\r | |
2912 | // k\r | |
2913 | \r | |
2914 | /* If the stack of open elements has a p element in\r | |
2915 | scope, then pop elements from this stack until the stack\r | |
2916 | no longer has a p element in scope. */\r | |
2917 | for ($n = count($this->stack) - 1; $n >= 0; $n--) {\r | |
2918 | if ($this->elementInScope('p')) {\r | |
2919 | array_pop($this->stack);\r | |
2920 | \r | |
2921 | } else {\r | |
2922 | break;\r | |
2923 | }\r | |
2924 | }\r | |
2925 | }\r | |
2926 | break;\r | |
2927 | \r | |
2928 | /* An end tag whose tag name is "dd", "dt", or "li" */\r | |
2929 | case 'dd':\r | |
2930 | case 'dt':\r | |
2931 | case 'li':\r | |
2932 | /* If the stack of open elements has an element in scope\r | |
2933 | whose tag name matches the tag name of the token, then\r | |
2934 | generate implied end tags, except for elements with the\r | |
2935 | same tag name as the token. */\r | |
2936 | if ($this->elementInScope($token['name'])) {\r | |
2937 | $this->generateImpliedEndTags(array($token['name']));\r | |
2938 | \r | |
2939 | /* If the current node is not an element with the same\r | |
2940 | tag name as the token, then this is a parse error. */\r | |
2941 | // w/e\r | |
2942 | \r | |
2943 | /* If the stack of open elements has an element in scope\r | |
2944 | whose tag name matches the tag name of the token, then\r | |
2945 | pop elements from this stack until an element with that\r | |
2946 | tag name has been popped from the stack. */\r | |
2947 | for ($n = count($this->stack) - 1; $n >= 0; $n--) {\r | |
2948 | if ($this->stack[$n]->nodeName === $token['name']) {\r | |
2949 | $n = -1;\r | |
2950 | }\r | |
2951 | \r | |
2952 | array_pop($this->stack);\r | |
2953 | }\r | |
2954 | }\r | |
2955 | break;\r | |
2956 | \r | |
2957 | /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",\r | |
2958 | "h5", "h6" */\r | |
2959 | case 'h1':\r | |
2960 | case 'h2':\r | |
2961 | case 'h3':\r | |
2962 | case 'h4':\r | |
2963 | case 'h5':\r | |
2964 | case 'h6':\r | |
2965 | $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');\r | |
2966 | \r | |
2967 | /* If the stack of open elements has in scope an element whose\r | |
2968 | tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then\r | |
2969 | generate implied end tags. */\r | |
2970 | if ($this->elementInScope($elements)) {\r | |
2971 | $this->generateImpliedEndTags();\r | |
2972 | \r | |
2973 | /* Now, if the current node is not an element with the same\r | |
2974 | tag name as that of the token, then this is a parse error. */\r | |
2975 | // w/e\r | |
2976 | \r | |
2977 | /* If the stack of open elements has in scope an element\r | |
2978 | whose tag name is one of "h1", "h2", "h3", "h4", "h5", or\r | |
2979 | "h6", then pop elements from the stack until an element\r | |
2980 | with one of those tag names has been popped from the stack. */\r | |
2981 | while ($this->elementInScope($elements)) {\r | |
2982 | array_pop($this->stack);\r | |
2983 | }\r | |
2984 | }\r | |
2985 | break;\r | |
2986 | \r | |
2987 | /* An end tag whose tag name is one of: "a", "b", "big", "em",\r | |
2988 | "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */\r | |
2989 | case 'a':\r | |
2990 | case 'b':\r | |
2991 | case 'big':\r | |
2992 | case 'em':\r | |
2993 | case 'font':\r | |
2994 | case 'i':\r | |
2995 | case 'nobr':\r | |
2996 | case 's':\r | |
2997 | case 'small':\r | |
2998 | case 'strike':\r | |
2999 | case 'strong':\r | |
3000 | case 'tt':\r | |
3001 | case 'u':\r | |
3002 | /* 1. Let the formatting element be the last element in\r | |
3003 | the list of active formatting elements that:\r | |
3004 | * is between the end of the list and the last scope\r | |
3005 | marker in the list, if any, or the start of the list\r | |
3006 | otherwise, and\r | |
3007 | * has the same tag name as the token.\r | |
3008 | */\r | |
3009 | while (true) {\r | |
3010 | for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {\r | |
3011 | if ($this->a_formatting[$a] === self::MARKER) {\r | |
3012 | break;\r | |
3013 | \r | |
3014 | } elseif ($this->a_formatting[$a]->tagName === $token['name']) {\r | |
3015 | $formatting_element = $this->a_formatting[$a];\r | |
3016 | $in_stack = in_array($formatting_element, $this->stack, true);\r | |
3017 | $fe_af_pos = $a;\r | |
3018 | break;\r | |
3019 | }\r | |
3020 | }\r | |
3021 | \r | |
3022 | /* If there is no such node, or, if that node is\r | |
3023 | also in the stack of open elements but the element\r | |
3024 | is not in scope, then this is a parse error. Abort\r | |
3025 | these steps. The token is ignored. */\r | |
3026 | if (!isset($formatting_element) || ($in_stack &&\r | |
3027 | !$this->elementInScope($token['name']))\r | |
3028 | ) {\r | |
3029 | break;\r | |
3030 | \r | |
3031 | /* Otherwise, if there is such a node, but that node\r | |
3032 | is not in the stack of open elements, then this is a\r | |
3033 | parse error; remove the element from the list, and\r | |
3034 | abort these steps. */\r | |
3035 | } elseif (isset($formatting_element) && !$in_stack) {\r | |
3036 | unset($this->a_formatting[$fe_af_pos]);\r | |
3037 | $this->a_formatting = array_merge($this->a_formatting);\r | |
3038 | break;\r | |
3039 | }\r | |
3040 | \r | |
3041 | /* 2. Let the furthest block be the topmost node in the\r | |
3042 | stack of open elements that is lower in the stack\r | |
3043 | than the formatting element, and is not an element in\r | |
3044 | the phrasing or formatting categories. There might\r | |
3045 | not be one. */\r | |
3046 | $fe_s_pos = array_search($formatting_element, $this->stack, true);\r | |
3047 | $length = count($this->stack);\r | |
3048 | \r | |
3049 | for ($s = $fe_s_pos + 1; $s < $length; $s++) {\r | |
3050 | $category = $this->getElementCategory($this->stack[$s]->nodeName);\r | |
3051 | \r | |
3052 | if ($category !== self::PHRASING && $category !== self::FORMATTING) {\r | |
3053 | $furthest_block = $this->stack[$s];\r | |
3054 | }\r | |
3055 | }\r | |
3056 | \r | |
3057 | /* 3. If there is no furthest block, then the UA must\r | |
3058 | skip the subsequent steps and instead just pop all\r | |
3059 | the nodes from the bottom of the stack of open\r | |
3060 | elements, from the current node up to the formatting\r | |
3061 | element, and remove the formatting element from the\r | |
3062 | list of active formatting elements. */\r | |
3063 | if (!isset($furthest_block)) {\r | |
3064 | for ($n = $length - 1; $n >= $fe_s_pos; $n--) {\r | |
3065 | array_pop($this->stack);\r | |
3066 | }\r | |
3067 | \r | |
3068 | unset($this->a_formatting[$fe_af_pos]);\r | |
3069 | $this->a_formatting = array_merge($this->a_formatting);\r | |
3070 | break;\r | |
3071 | }\r | |
3072 | \r | |
3073 | /* 4. Let the common ancestor be the element\r | |
3074 | immediately above the formatting element in the stack\r | |
3075 | of open elements. */\r | |
3076 | $common_ancestor = $this->stack[$fe_s_pos - 1];\r | |
3077 | \r | |
3078 | /* 5. If the furthest block has a parent node, then\r | |
3079 | remove the furthest block from its parent node. */\r | |
3080 | if ($furthest_block->parentNode !== null) {\r | |
3081 | $furthest_block->parentNode->removeChild($furthest_block);\r | |
3082 | }\r | |
3083 | \r | |
3084 | /* 6. Let a bookmark note the position of the\r | |
3085 | formatting element in the list of active formatting\r | |
3086 | elements relative to the elements on either side\r | |
3087 | of it in the list. */\r | |
3088 | $bookmark = $fe_af_pos;\r | |
3089 | \r | |
3090 | /* 7. Let node and last node be the furthest block.\r | |
3091 | Follow these steps: */\r | |
3092 | $node = $furthest_block;\r | |
3093 | $last_node = $furthest_block;\r | |
3094 | \r | |
3095 | while (true) {\r | |
3096 | for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {\r | |
3097 | /* 7.1 Let node be the element immediately\r | |
3098 | prior to node in the stack of open elements. */\r | |
3099 | $node = $this->stack[$n];\r | |
3100 | \r | |
3101 | /* 7.2 If node is not in the list of active\r | |
3102 | formatting elements, then remove node from\r | |
3103 | the stack of open elements and then go back\r | |
3104 | to step 1. */\r | |
3105 | if (!in_array($node, $this->a_formatting, true)) {\r | |
3106 | unset($this->stack[$n]);\r | |
3107 | $this->stack = array_merge($this->stack);\r | |
3108 | \r | |
3109 | } else {\r | |
3110 | break;\r | |
3111 | }\r | |
3112 | }\r | |
3113 | \r | |
3114 | /* 7.3 Otherwise, if node is the formatting\r | |
3115 | element, then go to the next step in the overall\r | |
3116 | algorithm. */\r | |
3117 | if ($node === $formatting_element) {\r | |
3118 | break;\r | |
3119 | \r | |
3120 | /* 7.4 Otherwise, if last node is the furthest\r | |
3121 | block, then move the aforementioned bookmark to\r | |
3122 | be immediately after the node in the list of\r | |
3123 | active formatting elements. */\r | |
3124 | } elseif ($last_node === $furthest_block) {\r | |
3125 | $bookmark = array_search($node, $this->a_formatting, true) + 1;\r | |
3126 | }\r | |
3127 | \r | |
3128 | /* 7.5 If node has any children, perform a\r | |
3129 | shallow clone of node, replace the entry for\r | |
3130 | node in the list of active formatting elements\r | |
3131 | with an entry for the clone, replace the entry\r | |
3132 | for node in the stack of open elements with an\r | |
3133 | entry for the clone, and let node be the clone. */\r | |
3134 | if ($node->hasChildNodes()) {\r | |
3135 | $clone = $node->cloneNode();\r | |
3136 | $s_pos = array_search($node, $this->stack, true);\r | |
3137 | $a_pos = array_search($node, $this->a_formatting, true);\r | |
3138 | \r | |
3139 | $this->stack[$s_pos] = $clone;\r | |
3140 | $this->a_formatting[$a_pos] = $clone;\r | |
3141 | $node = $clone;\r | |
3142 | }\r | |
3143 | \r | |
3144 | /* 7.6 Insert last node into node, first removing\r | |
3145 | it from its previous parent node if any. */\r | |
3146 | if ($last_node->parentNode !== null) {\r | |
3147 | $last_node->parentNode->removeChild($last_node);\r | |
3148 | }\r | |
3149 | \r | |
3150 | $node->appendChild($last_node);\r | |
3151 | \r | |
3152 | /* 7.7 Let last node be node. */\r | |
3153 | $last_node = $node;\r | |
3154 | }\r | |
3155 | \r | |
3156 | /* 8. Insert whatever last node ended up being in\r | |
3157 | the previous step into the common ancestor node,\r | |
3158 | first removing it from its previous parent node if\r | |
3159 | any. */\r | |
3160 | if ($last_node->parentNode !== null) {\r | |
3161 | $last_node->parentNode->removeChild($last_node);\r | |
3162 | }\r | |
3163 | \r | |
3164 | $common_ancestor->appendChild($last_node);\r | |
3165 | \r | |
3166 | /* 9. Perform a shallow clone of the formatting\r | |
3167 | element. */\r | |
3168 | $clone = $formatting_element->cloneNode();\r | |
3169 | \r | |
3170 | /* 10. Take all of the child nodes of the furthest\r | |
3171 | block and append them to the clone created in the\r | |
3172 | last step. */\r | |
3173 | while ($furthest_block->hasChildNodes()) {\r | |
3174 | $child = $furthest_block->firstChild;\r | |
3175 | $furthest_block->removeChild($child);\r | |
3176 | $clone->appendChild($child);\r | |
3177 | }\r | |
3178 | \r | |
3179 | /* 11. Append that clone to the furthest block. */\r | |
3180 | $furthest_block->appendChild($clone);\r | |
3181 | \r | |
3182 | /* 12. Remove the formatting element from the list\r | |
3183 | of active formatting elements, and insert the clone\r | |
3184 | into the list of active formatting elements at the\r | |
3185 | position of the aforementioned bookmark. */\r | |
3186 | $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);\r | |
3187 | unset($this->a_formatting[$fe_af_pos]);\r | |
3188 | $this->a_formatting = array_merge($this->a_formatting);\r | |
3189 | \r | |
3190 | $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);\r | |
3191 | $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));\r | |
3192 | $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);\r | |
3193 | \r | |
3194 | /* 13. Remove the formatting element from the stack\r | |
3195 | of open elements, and insert the clone into the stack\r | |
3196 | of open elements immediately after (i.e. in a more\r | |
3197 | deeply nested position than) the position of the\r | |
3198 | furthest block in that stack. */\r | |
3199 | $fe_s_pos = array_search($formatting_element, $this->stack, true);\r | |
3200 | $fb_s_pos = array_search($furthest_block, $this->stack, true);\r | |
3201 | unset($this->stack[$fe_s_pos]);\r | |
3202 | \r | |
3203 | $s_part1 = array_slice($this->stack, 0, $fb_s_pos);\r | |
3204 | $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));\r | |
3205 | $this->stack = array_merge($s_part1, array($clone), $s_part2);\r | |
3206 | \r | |
3207 | /* 14. Jump back to step 1 in this series of steps. */\r | |
3208 | unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);\r | |
3209 | }\r | |
3210 | break;\r | |
3211 | \r | |
3212 | /* An end tag token whose tag name is one of: "button",\r | |
3213 | "marquee", "object" */\r | |
3214 | case 'button':\r | |
3215 | case 'marquee':\r | |
3216 | case 'object':\r | |
3217 | /* If the stack of open elements has an element in scope whose\r | |
3218 | tag name matches the tag name of the token, then generate implied\r | |
3219 | tags. */\r | |
3220 | if ($this->elementInScope($token['name'])) {\r | |
3221 | $this->generateImpliedEndTags();\r | |
3222 | \r | |
3223 | /* Now, if the current node is not an element with the same\r | |
3224 | tag name as the token, then this is a parse error. */\r | |
3225 | // k\r | |
3226 | \r | |
3227 | /* Now, if the stack of open elements has an element in scope\r | |
3228 | whose tag name matches the tag name of the token, then pop\r | |
3229 | elements from the stack until that element has been popped from\r | |
3230 | the stack, and clear the list of active formatting elements up\r | |
3231 | to the last marker. */\r | |
3232 | for ($n = count($this->stack) - 1; $n >= 0; $n--) {\r | |
3233 | if ($this->stack[$n]->nodeName === $token['name']) {\r | |
3234 | $n = -1;\r | |
3235 | }\r | |
3236 | \r | |
3237 | array_pop($this->stack);\r | |
3238 | }\r | |
3239 | \r | |
3240 | $marker = end(array_keys($this->a_formatting, self::MARKER, true));\r | |
3241 | \r | |
3242 | for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {\r | |
3243 | array_pop($this->a_formatting);\r | |
3244 | }\r | |
3245 | }\r | |
3246 | break;\r | |
3247 | \r | |
3248 | /* Or an end tag whose tag name is one of: "area", "basefont",\r | |
3249 | "bgsound", "br", "embed", "hr", "iframe", "image", "img",\r | |
3250 | "input", "isindex", "noembed", "noframes", "param", "select",\r | |
3251 | "spacer", "table", "textarea", "wbr" */\r | |
3252 | case 'area':\r | |
3253 | case 'basefont':\r | |
3254 | case 'bgsound':\r | |
3255 | case 'br':\r | |
3256 | case 'embed':\r | |
3257 | case 'hr':\r | |
3258 | case 'iframe':\r | |
3259 | case 'image':\r | |
3260 | case 'img':\r | |
3261 | case 'input':\r | |
3262 | case 'isindex':\r | |
3263 | case 'noembed':\r | |
3264 | case 'noframes':\r | |
3265 | case 'param':\r | |
3266 | case 'select':\r | |
3267 | case 'spacer':\r | |
3268 | case 'table':\r | |
3269 | case 'textarea':\r | |
3270 | case 'wbr':\r | |
3271 | // Parse error. Ignore the token.\r | |
3272 | break;\r | |
3273 | \r | |
3274 | /* An end tag token not covered by the previous entries */\r | |
3275 | default:\r | |
3276 | for ($n = count($this->stack) - 1; $n >= 0; $n--) {\r | |
3277 | /* Initialise node to be the current node (the bottommost\r | |
3278 | node of the stack). */\r | |
3279 | $node = end($this->stack);\r | |
3280 | \r | |
3281 | /* If node has the same tag name as the end tag token,\r | |
3282 | then: */\r | |
3283 | if ($token['name'] === $node->nodeName) {\r | |
3284 | /* Generate implied end tags. */\r | |
3285 | $this->generateImpliedEndTags();\r | |
3286 | \r | |
3287 | /* If the tag name of the end tag token does not\r | |
3288 | match the tag name of the current node, this is a\r | |
3289 | parse error. */\r | |
3290 | // k\r | |
3291 | \r | |
3292 | /* Pop all the nodes from the current node up to\r | |
3293 | node, including node, then stop this algorithm. */\r | |
3294 | for ($x = count($this->stack) - $n; $x >= $n; $x--) {\r | |
3295 | array_pop($this->stack);\r | |
3296 | }\r | |
3297 | \r | |
3298 | } else {\r | |
3299 | $category = $this->getElementCategory($node);\r | |
3300 | \r | |
3301 | if ($category !== self::SPECIAL && $category !== self::SCOPING) {\r | |
3302 | /* Otherwise, if node is in neither the formatting\r | |
3303 | category nor the phrasing category, then this is a\r | |
3304 | parse error. Stop this algorithm. The end tag token\r | |
3305 | is ignored. */\r | |
3306 | return false;\r | |
3307 | }\r | |
3308 | }\r | |
3309 | }\r | |
3310 | break;\r | |
3311 | }\r | |
3312 | break;\r | |
3313 | }\r | |
3314 | }\r | |
3315 | \r | |
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Whitespace-only character tokens and comment tokens are appended to the
 * current node (the last entry of the stack of open elements). Start tags
 * for caption, colgroup, col, tbody, tfoot, thead, td, th, tr and table,
 * and the end tag for table, get dedicated handling. A fixed list of other
 * end tags is ignored. Everything else is forwarded to inBody(), after a
 * foster parent has been selected when the current node is a table,
 * tbody, tfoot, thead or tr element.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for tag tokens and 'data' for character and comment
 *                     tokens.
 * @return mixed false when a "table" end tag (real or implied) is ignored
 *               because no table element is in table scope; the return
 *               value of inTableBody() or mainPhase() when the token is
 *               re-dispatched there; null otherwise.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'caption'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'col'
    ) {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then hand the current token to the column group
        handler. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'), true)
    ) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'table'
    ) {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end tag branch below returns false only when it ignored the
        token, in which case it has neither popped anything nor reset the
        insertion mode. Reprocessing is only wanted when the implied end
        tag actually took effect, so the start tag is dropped here too. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error (not reported by this implementation). */

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        while (true) {
            $current = end($this->stack)->nodeName;
            array_pop($this->stack);

            if ($current === 'table') {
                break;
            }
        }

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'caption',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr'),
            true
        )
        ) {
            /* Locate the last table element in the stack of open
            elements; $n keeps its index for the fallback below. */
            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if (!isset($table)) {
                /* No table element in the stack (innerHTML case): the
                foster parent is the first element in the stack of open
                elements (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* The table has a parent and that parent is an element:
                it is the foster parent. The element check has to happen
                here; testing only for a non-null parent would also accept
                non-element parents and make the fallback unreachable for
                them. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent is not an
                element: the foster parent is the element before the last
                table element in the stack of open elements. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
3527 | \r | |
/**
 * Tree-construction handler for the "in caption" insertion mode.
 *
 * Closes the caption when its end tag arrives, implicitly closes it when
 * a table-structure tag shows up, drops stray table-related end tags and
 * hands every other token to inBody().
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inCaption($token)
{
    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* An end tag whose tag name is "caption" */
    if ($isEndTag && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* If the current node is not a caption element at this point
            it is a parse error, which this parser does not report. */

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            do {
                $popped = array_pop($this->stack);
            } while ($popped->nodeName !== 'caption');

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($isStartTag && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )) || ($isEndTag && $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. The implied end tag is ignored exactly when no
        caption is in table scope, so that is checked up front. */
        if ($this->elementInScope('caption', true)) {
            $this->inCaption(
                array(
                    'name' => 'caption',
                    'type' => HTML5::ENDTAG
                )
            );

            /* The implied end tag switched the mode to "in table". */
            return $this->inTable($token);
        }

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($isEndTag && in_array(
        $token['name'],
        array(
            'body',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3622 | \r | |
/**
 * Tree-construction handler for the "in column group" insertion mode.
 *
 * Whitespace and comments are appended to the current node, "col" start
 * tags become empty elements, the "colgroup" end tag returns to the
 * "in table" mode, and any other token implicitly closes the column
 * group before being reprocessed by inTable().
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there, otherwise null.
 */
private function inColumnGroup($token)
{
    /* A character token consisting only of the characters matched by the
    pattern below: tab, line feed, line tabulation, form feed or space. */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* If the current node is the root html element, then this is a
        parse error, ignore the token. (innerHTML case)
        Otherwise, pop the current node (which will be a colgroup
        element) from the stack of open elements. Switch the insertion
        mode to "in table". */
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag with the tag name "colgroup" had been seen,
        and then, if that token wasn't ignored, reprocess the current
        token. The implied end tag is ignored exactly when the current
        node is the html element, so that is recorded before the call. */
        $ignored = end($this->stack)->nodeName === 'html';

        $this->inColumnGroup(
            array(
                'name' => 'colgroup',
                'type' => HTML5::ENDTAG
            )
        );

        if (!$ignored) {
            return $this->inTable($token);
        }
    }
}
3684 | \r | |
/**
 * Tree-construction handler for the "in table body" insertion mode.
 *
 * Opens rows (implicitly for stray cells), closes the current
 * tbody/tfoot/thead section on its end tag or when another table section
 * starts or the table ends, ignores unrelated end tags and defers
 * everything else to inTable().
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed The result of inRow() or mainPhase() when the token is
 *               reprocessed there, otherwise null.
 */
private function inTableBody($token)
{
    /* Node names that end the "clear the stack back to a table body
    context" operation. */
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* A start tag whose tag name is "tr" */
    if ($isStartTag && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($isStartTag &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($isEndTag &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($isStartTag && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'),
        true
    )) ||
        ($isEndTag && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if ($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif ($isEndTag && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3783 | \r | |
/**
 * Tree-construction handler for the "in row" insertion mode.
 *
 * Opens cells, closes the row on its end tag, implicitly closes the row
 * when a tag belonging to an outer table level arrives, ignores
 * unrelated end tags and defers everything else to inTable().
 *
 * Closing the row switches the insertion mode to "in table body", so
 * tokens that must be reprocessed after an implied "tr" end tag are
 * passed to inTableBody().
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed The result of inTableBody() when the token is
 *               reprocessed there, otherwise null.
 */
private function inRow($token)
{
    /* Node names that end the "clear the stack back to a table row
    context" operation. */
    $clear = array('tr', 'html');

    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* A start tag whose tag name is one of: "th", "td" */
    if ($isStartTag &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif ($isEndTag && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($isStartTag && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )) ||
        ($isEndTag && $token['name'] === 'table')
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr is in table scope. */
        if ($this->elementInScope('tr', true)) {
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($isEndTag &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif ($isEndTag && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3881 | \r | |
/**
 * Tree-construction handler for the "in cell" insertion mode.
 *
 * Closes the cell on its own end tag, implicitly closes it (via
 * closeCell()) when a table-structure tag arrives and reprocesses that
 * tag with inRow(), ignores unrelated end tags and hands every other
 * token to inBody().
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed The result of inRow() when the token is reprocessed
 *               there, otherwise null.
 */
private function inCell($token)
{
    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* An end tag whose tag name is one of: "td", "th" */
    if ($isEndTag &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* If the current node is not an element with the same tag
            name as the token at this point it is a parse error, which
            this parser does not report. */

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            do {
                $popped = array_pop($this->stack);
            } while ($popped->nodeName !== $token['name']);

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($isStartTag && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case)
        Otherwise, close the cell and reprocess the current token. */
        if ($this->elementInScope(array('td', 'th'), true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif ($isEndTag && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif ($isEndTag && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored.
        Otherwise, close the cell and reprocess the current token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4020 | \r | |
/**
 * Tree-construction handler for the "in select" insertion mode.
 *
 * Inserts text, comments, option and optgroup elements, closes options
 * and optgroups (explicitly or implicitly), closes the select on its end
 * tag or on a nested select start tag, and lets table-related end tags
 * close the select before being reprocessed through mainPhase(). All
 * other tokens are ignored.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inSelect($token)
{
    /* Synthetic end tags used for the "act as if an end tag had been
    seen" steps below. */
    $optionEnd = array('name' => 'option', 'type' => HTML5::ENDTAG);
    $optgroupEnd = array('name' => 'optgroup', 'type' => HTML5::ENDTAG);
    $selectEnd = array('name' => 'select', 'type' => HTML5::ENDTAG);

    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($optionEnd);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($optionEnd);
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect($optgroupEnd);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $depth = count($this->stack);

        if ($depth >= 2 &&
            $this->stack[$depth - 1]->nodeName === 'option' &&
            $this->stack[$depth - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect($optionEnd);
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again here because
        the step above may have popped an option element, and its node
        name (not the node object itself) is what must be compared. */
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            do {
                $popped = array_pop($this->stack);
            } while ($popped->nodeName !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect($selectEnd);

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        ),
        true
    )
    ) {
        /* Parse error (not reported by this parser). */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect($selectEnd);

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4201 | \r | |
/**
 * Tree-construction handler for the "after body" insertion mode.
 *
 * Comments are attached to the first element on the stack of open
 * elements, the "html" end tag moves the parser to its end phase,
 * whitespace-only text is processed by inBody() without changing the
 * mode, and any other token switches back to "in body" and is
 * reprocessed there.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for end tags.
 * @return mixed The result of inBody() when the mode was switched back to
 *               "in body", otherwise null.
 */
private function afterBody($token)
{
    $type = $token['type'];

    /* A comment token: append a Comment node to the first element in the
    stack of open elements (the html element), with the data attribute set
    to the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($node);
        return;
    }

    /* An end tag with the tag name "html": switch to the trailing end
    phase. (The innerHTML case, where the token would be ignored instead,
    is not handled here.) */
    if ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* A character token consisting only of tab, line feed, line
    tabulation, form feed or space characters. */
    $whitespaceOnly = $type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    if (!$whitespaceOnly) {
        /* Anything else: parse error. Set the insertion mode to "in body"
        and reprocess the token. */
        $this->mode = self::IN_BODY;
        return $this->inBody($token);
    }

    /* Whitespace: process the token as it would be processed if the
    insertion mode was "in body"; the mode itself is left untouched. */
    $this->inBody($token);
}
4242 | \r | |
/**
 * Tree-construction handler for the "in frameset" insertion mode.
 *
 * Inserts whitespace, comments, nested frameset elements and frame
 * elements, closes framesets on their end tag, passes "noframes" start
 * tags to inBody() and ignores every other token.
 *
 * The token type is always tested before 'name' is read, because
 * non-whitespace character tokens reach the tag branches and the
 * character tokens handled in this class are only ever read through
 * their 'type' and 'data' keys.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inFrameset($token)
{
    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* Handle the token as follows: */

    /* A character token consisting only of the characters matched by the
    pattern below: tab, line feed, line tabulation, form feed or space. */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif ($isStartTag && $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif ($isEndTag && $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". While an outer
            frameset is still open the parser stays in this mode. */
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif ($isStartTag && $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif ($isStartTag && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4311 | \r | |
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node, an "html" end tag switches to the trailing end phase, a
 * "noframes" start tag is delegated to inBody(), and every other token
 * is a parse error that is ignored.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character and comment tokens; 'name' is read
 *                     only when present.
 * @return void
 */
private function afterFrameset($token)
{
    $type = $token['type'];
    // Not every token is guaranteed to carry a name, so look it up
    // defensively instead of risking an undefined-index notice.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, line feed, line
    tabulation, form feed or space characters */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif ($type === HTML5::ENDTAG && $name === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif ($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);
    }

    /* Anything else: parse error. Ignore the token. */
}
4350 | \r | |
/**
 * Processes one token emitted after the main phase has finished.
 *
 * DOCTYPE tokens are ignored, comments are appended directly to the
 * document, whitespace-only character tokens are handed to mainPhase(),
 * and any other character token or a start/end tag switches the parser
 * back to the main phase and reprocesses the token there.
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for comment and character tokens.
 * @return mixed Whatever mainPhase() returns when the token is
 *               reprocessed in the main phase; null otherwise.
 */
private function trailingEndPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token made up solely of tab, line feed, line
    tabulation, form feed or space characters */
    } elseif ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* Any other character token, a start tag token, or an end tag token.
    A character token that reaches this branch has already failed the
    whitespace test above, so no second pattern match is needed. */
    } elseif ($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif ($type === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
4393 | \r | |
/**
 * Creates a DOM element for a tag token, attaches it through
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' and 'attr' (a list of arrays
 *                      with 'name' and 'value' keys) are read.
 * @param bool  $append Accepted for signature compatibility; this method
 *                      does not read it.
 * @param bool  $check  When true, the tag name is sanitised first.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false)
{
    $tag_name = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens ...
        $tag_name = preg_replace('/[^a-z0-9-]/i', '', $tag_name);
        // ... then drop any leading hyphens and digits.
        $tag_name = ltrim($tag_name, '-0..9');
        // Should never be necessary; fall back to an arbitrary generic
        // element if nothing is left of the name.
        if ($tag_name === '') {
            $tag_name = 'span';
        }
    }

    $element = $this->dom->createElement($tag_name);

    // The first occurrence of an attribute name wins; later duplicates
    // are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4422 | \r | |
/**
 * Wraps the given string in a DOM text node and attaches it through
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4428 | \r | |
/**
 * Wraps the given string in a DOM comment node and attaches it through
 * appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 * @return void
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4434 | \r | |
/**
 * Attaches a node to the tree, honouring a pending foster parent.
 *
 * With no foster parent set, the node becomes the last child of the
 * current node (the top of the stack of open elements). Otherwise the
 * node goes into the foster parent and the foster parent is cleared.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node)
{
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Find the last table element in the stack of open elements that is
    still attached to a parent. */
    $table = null;
    $index = count($this->stack);
    while ($index-- > 0) {
        $candidate = $this->stack[$index];
        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $table = $candidate;
            break;
        }
    }

    /* If the foster parent is the parent of that table, the new node is
    inserted immediately before the table; otherwise it is appended to
    the foster parent. */
    if ($table !== null && $this->foster_parent->isSameNode($table->parentNode)) {
        $this->foster_parent->insertBefore($node, $table);
    } else {
        $this->foster_parent->appendChild($node);
    }

    // A foster parent applies to a single insertion only.
    $this->foster_parent = null;
}
4465 | \r | |
/**
 * Tells whether the stack of open elements has an element in scope.
 *
 * The stack is walked from the current node towards the root. The walk
 * succeeds on the first node whose tag name equals $el and fails on a
 * table element, on the document's root element and, for the plain
 * (non-table) variant only, on caption, td, th, button, marquee and
 * object elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            of which any one may match.
 * @param bool         $table True selects the "in table scope" variant,
 *                            which does not stop at the scoping elements
 *                            listed above.
 * @return bool True when a matching element is in scope.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    $scoping = array('caption', 'td', 'th', 'button', 'marquee', 'object');

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then work towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if ($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if ($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is a scoping element, terminate in a failure state. */
        if ($table !== true && in_array($node->tagName, $scoping, true)) {
            return false;
        }

        /* 5. Otherwise, if node is the root element, terminate in a
        failure state. */
        if ($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack. */
    }

    // The stack was exhausted without a match.
    return false;
}
4525 | \r | |
/**
 * Reopens formatting elements that are listed as active but are no
 * longer on the stack of open elements.
 *
 * Starting from the most recent entry, the list of active formatting
 * elements is rewound to the entry just after the nearest marker or
 * still-open element (or to the first entry). From there to the end of
 * the list, each entry is shallow-cloned, the clone is appended to the
 * current node and pushed onto the stack, and the list entry is replaced
 * by the clone.
 *
 * Assumes the list of active formatting elements is indexed
 * consecutively from zero.
 *
 * @return false|null False when there was nothing to reconstruct, null
 *                    after elements have been reconstructed.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If that entry is a marker, or an element that is in the stack of
    open elements, then there is nothing to reconstruct. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4-6. Rewind: step to earlier entries until a marker or an open
    element is found, or until the first entry has been reached. */
    $a = $formatting_elements - 1;

    // True when entry must first be advanced to the next list position
    // (step 7) before it is cloned. This is the case when the rewind
    // stopped on a marker or an open element, neither of which may be
    // cloned itself.
    $advance = false;

    while ($a > 0) {
        $a--;
        $entry = $this->a_formatting[$a];

        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            $advance = true;
            break;
        }
    }

    do {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($advance) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        $advance = true;
    } while (end($this->a_formatting) !== $clone);
}
4597 | \r | |
/**
 * Removes entries from the end of the list of active formatting elements
 * up to and including the most recent marker.
 *
 * If the list holds no marker at all, it is emptied and the method
 * returns instead of looping forever.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    while (count($this->a_formatting) > 0) {
        /* 1-2. Take the last (most recently added) entry off the list of
        active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4619 | \r | |
/**
 * Pops elements whose end tags may be implied off the stack of open
 * elements.
 *
 * While the current node is a dd, dt, li, p, td, th or tr element that is
 * not listed in $exclude, it is removed from the stack. An empty stack
 * ends the loop.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array())
{
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // end() yields false once the stack is empty; stop there rather than
    // reading a property from a non-object.
    $node = end($this->stack);
    while ($node !== false && in_array($node->nodeName, $elements, true)) {
        array_pop($this->stack);
        $node = end($this->stack);
    }
}
4634 | \r | |
/**
 * Classifies an element by its tag name.
 *
 * The name is looked up in the special, scoping and formatting lists, in
 * that order; a name found in none of them is classified as phrasing.
 *
 * @param DOMElement $node Element to classify.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
4648 | \r | |
/**
 * Pops elements off the stack of open elements until the current node is
 * one of the given elements.
 *
 * If the stack runs empty before such a node is found, the method
 * returns instead of looping forever.
 *
 * @param array $elements Tag names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements)
{
    /* While the current node is not one of the context elements, pop
    elements from the stack of open elements. */
    while (($current = end($this->stack)) !== false) {
        if (in_array($current->nodeName, $elements, true)) {
            break;
        }

        array_pop($this->stack);
    }
}
4666 | \r | |
/**
 * Chooses the insertion mode that fits the current stack of open
 * elements.
 *
 * The stack is walked from the current node towards the first entry. The
 * first node whose name has a mode of its own decides the mode. An html
 * node yields "before head" or "after head" depending on whether the head
 * pointer is set. If the first stack entry is reached without any other
 * rule applying, the mode becomes "in body". The mode is left untouched
 * when the stack is empty.
 *
 * @return void
 */
private function resetInsertionMode()
{
    /* Node names that map straight onto an insertion mode (steps 4-13).
    Note that a head element selects "in body", not "in head". */
    $mode_by_name = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME
    );

    /* 1. Let last be false. */
    $last = false;

    /* 2. Start at the last node in the stack of open elements. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];
        $name = $node->nodeName;

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        /* 4-13. A node with a dedicated mode settles the matter. */
        if (isset($mode_by_name[$name])) {
            $this->mode = $mode_by_name[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }
            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body". */
        if ($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4765 | \r | |
/**
 * Closes the currently open table cell, if there is one.
 *
 * If the stack of open elements has a td element in table scope, or
 * failing that a th element, inCell() is invoked with a synthetic end tag
 * token of that name. Nothing happens when neither is in table scope.
 *
 * @return void
 */
private function closeCell()
{
    if ($this->elementInScope('td', true)) {
        $cell = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $cell = 'th';
    } else {
        return;
    }

    /* Act as if an end tag token with that tag name had been seen. */
    $end_tag = array(
        'name' => $cell,
        'type' => HTML5::ENDTAG
    );
    $this->inCell($end_tag);
}
4783 | \r | |
/**
 * Hands back the document that the tree builder has been populating.
 *
 * @return mixed The value of the dom property (the object used for
 *               createElement(), createTextNode() and createComment()).
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
4788 | }\r |