diff options
author | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2014-02-21 15:43:14 +0100 |
---|---|---|
committer | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2014-02-21 15:43:14 +0100 |
commit | d4949327efa15b492cab1bef3fe074290a328a17 (patch) | |
tree | e89e0322bb1f1b06d663fd10fdded21bac867e5d /inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer | |
parent | c9bd17a1007bb78e5de0775efca01df0fb515031 (diff) | |
download | wallabag-d4949327efa15b492cab1bef3fe074290a328a17.tar.gz wallabag-d4949327efa15b492cab1bef3fe074290a328a17.tar.zst wallabag-d4949327efa15b492cab1bef3fe074290a328a17.zip |
[add] HTML Purifier added to clean code
Diffstat (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer')
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php | 280 | ||||
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php | 539 | ||||
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/PH5P.php | 4788 |
3 files changed, 5607 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 00000000..b13e6c55 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php | |||
@@ -0,0 +1,280 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens. It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 *       this is prohibited by the spec. This cannot be fixed until the spec
 *       comes into play.
 *
 * @note PHP's DOM extension does not actually parse any entities, we use
 *       our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 *          If this is a huge problem, due to the fact that HTML is hand
 *          edited and you are unable to get a parser cache that caches the
 *          output of HTML Purifier while keeping the original HTML lying
 *          around, you may want to run Tidy on the resulting output or use
 *          HTMLPurifier_Lexer_DirectLex.
 */

class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token emitted by this lexer.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    public function __construct()
    {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // protect ampersands inside comments first, so that the undo
            // pass below can tell our substitutions from pre-existing text
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            do {
                $old = $html;
                // a "<" that cannot start a tag becomes the entity "&lt;";
                // repeat until a pass changes nothing (e.g. for "<<<")
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // silence the warnings loadHTML() raises on malformed markup
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();
        // descend to the <div> that wrapHTML() put around the fragment
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                getElementsByTagName('body')->item(0)->   // <body>
                getElementsByTagName('div')->item(0),     // <div>
            $tokens
        );
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * Nothing is returned: the produced tokens are appended to $tokens,
     * which is passed by reference.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     */
    protected function tokenizeDOM($node, &$tokens)
    {
        $level = 0;
        // one queue of pending nodes per nesting level
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // per level, the elements that still need an end token
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // level 0 is the implicit wrapper <div>: emit no tokens for it
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // descend: the children become the queue of the next level
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token(s) that open $node: a text or comment token for
     * non-element nodes, an empty or start token for elements.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param bool $collect Says whether or not start and close are collected,
     *                      set to false at first recursion because it's the
     *                      implicit DIV tag you're dealing with.
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip a wrapping "<!-- ... -->" from the script/style body
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it get's dropped
                    $attr
                );
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes (name => value).
     */
    protected function transformAttrToAssoc($node_map)
    {
        // this relies on DOMNamedNodeMap being iterable with foreach and
        // exposing a ->length attribute
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments: turns "&lt;" back into "<" and "&amp;" back into "&".
     *
     * @param array $matches array(whole match, comment body, terminator)
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param array $matches array(whole match, comment body, terminator)
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML: an optional doctype,
     * a UTF-8 Content-Type meta tag and a <body><div> container.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    protected function wrapHTML($html, $config, $context)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body><div>' . $html . '</div></body></html>';
        return $ret;
    }
}
279 | |||
280 | // vim: et sw=4 sts=4 | ||
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php new file mode 100644 index 00000000..a07f4973 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DirectLex.php | |||
@@ -0,0 +1,539 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
    /**
     * @type bool
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn.
     * @type string
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge: HTML-escapes the script
     * body and leaves the surrounding tags untouched.
     *
     * @param array $matches, in form of array(opening tag, contents, closing tag)
     * @return string
     */
    protected function scriptCallback($matches)
    {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Tokenizes HTML by scanning the string for "<" and ">" and classifying
     * each segment as text, comment, end tag, start tag or empty tag.
     *
     * @param String $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback(
                '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'),
                $html
            );
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while (++$loops) {
            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {
                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int)$inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if ($synchronize_interval && // synchronization is on
                    $cursor > 0 && // cursor is further than zero
                    $loops % $synchronize_interval === 0) { // time to synchronize!
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }
            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html,
                            $cursor,
                            $position_next_lt - $cursor
                        )
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) {
                    break;
                }
                // Create Text of rest of string
                $token = new
                HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr(
                            $html,
                            $cursor
                        )
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (substr($segment, 0, 3) === '!--') {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) {
                            $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        }
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new
                    HTMLPurifier_Token_Comment(
                        substr(
                            $segment,
                            3,
                            $strlen_segment - 3
                        )
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alnum, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) {
                        $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    }
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string =
                    trim(
                        substr(
                            $segment,
                            $position_first_space
                        )
                    );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string,
                        $config,
                        $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) {
                    $e->send(E_WARNING, 'Lexer: Missing gt');
                }
                $token = new
                HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length
     * @param string $haystack
     * @param string $needle
     * @param int $offset
     * @param int $length
     * @return int
     */
    protected function substrCount($haystack, $needle, $offset, $length)
    {
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            // before PHP 5.1: cut the window out by hand, then count in it
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * @param string $string Inside of tag excluding name.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array Assoc array of attributes.
     * @throws Exception if the scanning loop fails to advance its cursor.
     */
    public function parseAttributeString($string, $config, $context)
    {
        $string = (string)$string; // quick typecast

        if ($string == '') {
            return array();
        } // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no whitespace => one attribute
        $num_equal = substr_count($string, '=');
        // test for every character in $_whitespace, not only U+0020:
        // attributes separated by a tab or newline must not take the
        // single-attribute shortcut below
        $has_space = strcspn($string, $this->_whitespace) < strlen($string);
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                return array();
            }
            // strict comparison: the value "0" is falsy in PHP but is a
            // perfectly good attribute value (e.g. border=0)
            if ($quoted_value === '') {
                return array($key => '');
            }
            $first_char = @$quoted_value[0];
            $last_char = @$quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing end quote');
                    }
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) {
                $value = '';
            }
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        $old_cursor = -1;
        while ($cursor < $size) {
            if ($old_cursor >= $cursor) {
                throw new Exception("Infinite loop detected");
            }
            $old_cursor = $cursor;

            $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
                $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor === false) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) {
                    $value = '';
                }
                $array[$key] = $this->parseData($value);
                $cursor++;
            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) {
                        $e->send(E_ERROR, 'Lexer: Missing attribute key');
                    }
                }
            }
        }
        return $array;
    }
}
538 | |||
539 | // vim: et sw=4 sts=4 | ||
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/PH5P.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/PH5P.php new file mode 100644 index 00000000..48b5f9a6 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/PH5P.php | |||
@@ -0,0 +1,4788 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library. | ||
5 | * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts. | ||
6 | * | ||
7 | * @note | ||
8 | * Recent changes to PHP's DOM extension have resulted in some fatal | ||
9 | * error conditions with the original version of PH5P. Pending changes, | ||
10 | * this lexer will punt to DirectLex if DOM throws an exception. | ||
11 | */ | ||
12 | |||
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Tokenizes HTML by running it through the HTML5 (PH5P) parser and then
     * walking the resulting DOM with tokenizeDOM().
     *
     * The input is normalized and wrapped first. If building or saving the
     * HTML5 document raises a DOMException, the exception is registered in
     * the context as 'PH5PError' and the ORIGINAL (un-normalized) HTML is
     * handed to HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $failure) {
            // Parsing blew up inside DOM; fall back to DirectLex and keep the
            // exception around so callers can detect that this happened.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $failure);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Drill down to the first <html> -> <body> -> <div> element.
        $root = $document->getElementsByTagName('html')->item(0);
        $body = $root->getElementsByTagName('body')->item(0);
        $wrapper = $body->getElementsByTagName('div')->item(0);

        $result = array();
        $this->tokenizeDOM($wrapper, $result);
        return $result;
    }
}
45 | |||
46 | /* | ||
47 | |||
48 | Copyright 2007 Jeroen van der Meer <http://jero.net/> | ||
49 | |||
50 | Permission is hereby granted, free of charge, to any person obtaining a | ||
51 | copy of this software and associated documentation files (the | ||
52 | "Software"), to deal in the Software without restriction, including | ||
53 | without limitation the rights to use, copy, modify, merge, publish, | ||
54 | distribute, sublicense, and/or sell copies of the Software, and to | ||
55 | permit persons to whom the Software is furnished to do so, subject to | ||
56 | the following conditions: | ||
57 | |||
58 | The above copyright notice and this permission notice shall be included | ||
59 | in all copies or substantial portions of the Software. | ||
60 | |||
61 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||
62 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
63 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
64 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||
65 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
66 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
67 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
68 | |||
69 | */ | ||
70 | |||
71 | class HTML5 | ||
72 | { | ||
73 | private $data; | ||
74 | private $char; | ||
75 | private $EOF; | ||
76 | private $state; | ||
77 | private $tree; | ||
78 | private $token; | ||
79 | private $content_model; | ||
80 | private $escape = false; | ||
81 | private $entities = array( | ||
82 | 'AElig;', | ||
83 | 'AElig', | ||
84 | 'AMP;', | ||
85 | 'AMP', | ||
86 | 'Aacute;', | ||
87 | 'Aacute', | ||
88 | 'Acirc;', | ||
89 | 'Acirc', | ||
90 | 'Agrave;', | ||
91 | 'Agrave', | ||
92 | 'Alpha;', | ||
93 | 'Aring;', | ||
94 | 'Aring', | ||
95 | 'Atilde;', | ||
96 | 'Atilde', | ||
97 | 'Auml;', | ||
98 | 'Auml', | ||
99 | 'Beta;', | ||
100 | 'COPY;', | ||
101 | 'COPY', | ||
102 | 'Ccedil;', | ||
103 | 'Ccedil', | ||
104 | 'Chi;', | ||
105 | 'Dagger;', | ||
106 | 'Delta;', | ||
107 | 'ETH;', | ||
108 | 'ETH', | ||
109 | 'Eacute;', | ||
110 | 'Eacute', | ||
111 | 'Ecirc;', | ||
112 | 'Ecirc', | ||
113 | 'Egrave;', | ||
114 | 'Egrave', | ||
115 | 'Epsilon;', | ||
116 | 'Eta;', | ||
117 | 'Euml;', | ||
118 | 'Euml', | ||
119 | 'GT;', | ||
120 | 'GT', | ||
121 | 'Gamma;', | ||
122 | 'Iacute;', | ||
123 | 'Iacute', | ||
124 | 'Icirc;', | ||
125 | 'Icirc', | ||
126 | 'Igrave;', | ||
127 | 'Igrave', | ||
128 | 'Iota;', | ||
129 | 'Iuml;', | ||
130 | 'Iuml', | ||
131 | 'Kappa;', | ||
132 | 'LT;', | ||
133 | 'LT', | ||
134 | 'Lambda;', | ||
135 | 'Mu;', | ||
136 | 'Ntilde;', | ||
137 | 'Ntilde', | ||
138 | 'Nu;', | ||
139 | 'OElig;', | ||
140 | 'Oacute;', | ||
141 | 'Oacute', | ||
142 | 'Ocirc;', | ||
143 | 'Ocirc', | ||
144 | 'Ograve;', | ||
145 | 'Ograve', | ||
146 | 'Omega;', | ||
147 | 'Omicron;', | ||
148 | 'Oslash;', | ||
149 | 'Oslash', | ||
150 | 'Otilde;', | ||
151 | 'Otilde', | ||
152 | 'Ouml;', | ||
153 | 'Ouml', | ||
154 | 'Phi;', | ||
155 | 'Pi;', | ||
156 | 'Prime;', | ||
157 | 'Psi;', | ||
158 | 'QUOT;', | ||
159 | 'QUOT', | ||
160 | 'REG;', | ||
161 | 'REG', | ||
162 | 'Rho;', | ||
163 | 'Scaron;', | ||
164 | 'Sigma;', | ||
165 | 'THORN;', | ||
166 | 'THORN', | ||
167 | 'TRADE;', | ||
168 | 'Tau;', | ||
169 | 'Theta;', | ||
170 | 'Uacute;', | ||
171 | 'Uacute', | ||
172 | 'Ucirc;', | ||
173 | 'Ucirc', | ||
174 | 'Ugrave;', | ||
175 | 'Ugrave', | ||
176 | 'Upsilon;', | ||
177 | 'Uuml;', | ||
178 | 'Uuml', | ||
179 | 'Xi;', | ||
180 | 'Yacute;', | ||
181 | 'Yacute', | ||
182 | 'Yuml;', | ||
183 | 'Zeta;', | ||
184 | 'aacute;', | ||
185 | 'aacute', | ||
186 | 'acirc;', | ||
187 | 'acirc', | ||
188 | 'acute;', | ||
189 | 'acute', | ||
190 | 'aelig;', | ||
191 | 'aelig', | ||
192 | 'agrave;', | ||
193 | 'agrave', | ||
194 | 'alefsym;', | ||
195 | 'alpha;', | ||
196 | 'amp;', | ||
197 | 'amp', | ||
198 | 'and;', | ||
199 | 'ang;', | ||
200 | 'apos;', | ||
201 | 'aring;', | ||
202 | 'aring', | ||
203 | 'asymp;', | ||
204 | 'atilde;', | ||
205 | 'atilde', | ||
206 | 'auml;', | ||
207 | 'auml', | ||
208 | 'bdquo;', | ||
209 | 'beta;', | ||
210 | 'brvbar;', | ||
211 | 'brvbar', | ||
212 | 'bull;', | ||
213 | 'cap;', | ||
214 | 'ccedil;', | ||
215 | 'ccedil', | ||
216 | 'cedil;', | ||
217 | 'cedil', | ||
218 | 'cent;', | ||
219 | 'cent', | ||
220 | 'chi;', | ||
221 | 'circ;', | ||
222 | 'clubs;', | ||
223 | 'cong;', | ||
224 | 'copy;', | ||
225 | 'copy', | ||
226 | 'crarr;', | ||
227 | 'cup;', | ||
228 | 'curren;', | ||
229 | 'curren', | ||
230 | 'dArr;', | ||
231 | 'dagger;', | ||
232 | 'darr;', | ||
233 | 'deg;', | ||
234 | 'deg', | ||
235 | 'delta;', | ||
236 | 'diams;', | ||
237 | 'divide;', | ||
238 | 'divide', | ||
239 | 'eacute;', | ||
240 | 'eacute', | ||
241 | 'ecirc;', | ||
242 | 'ecirc', | ||
243 | 'egrave;', | ||
244 | 'egrave', | ||
245 | 'empty;', | ||
246 | 'emsp;', | ||
247 | 'ensp;', | ||
248 | 'epsilon;', | ||
249 | 'equiv;', | ||
250 | 'eta;', | ||
251 | 'eth;', | ||
252 | 'eth', | ||
253 | 'euml;', | ||
254 | 'euml', | ||
255 | 'euro;', | ||
256 | 'exist;', | ||
257 | 'fnof;', | ||
258 | 'forall;', | ||
259 | 'frac12;', | ||
260 | 'frac12', | ||
261 | 'frac14;', | ||
262 | 'frac14', | ||
263 | 'frac34;', | ||
264 | 'frac34', | ||
265 | 'frasl;', | ||
266 | 'gamma;', | ||
267 | 'ge;', | ||
268 | 'gt;', | ||
269 | 'gt', | ||
270 | 'hArr;', | ||
271 | 'harr;', | ||
272 | 'hearts;', | ||
273 | 'hellip;', | ||
274 | 'iacute;', | ||
275 | 'iacute', | ||
276 | 'icirc;', | ||
277 | 'icirc', | ||
278 | 'iexcl;', | ||
279 | 'iexcl', | ||
280 | 'igrave;', | ||
281 | 'igrave', | ||
282 | 'image;', | ||
283 | 'infin;', | ||
284 | 'int;', | ||
285 | 'iota;', | ||
286 | 'iquest;', | ||
287 | 'iquest', | ||
288 | 'isin;', | ||
289 | 'iuml;', | ||
290 | 'iuml', | ||
291 | 'kappa;', | ||
292 | 'lArr;', | ||
293 | 'lambda;', | ||
294 | 'lang;', | ||
295 | 'laquo;', | ||
296 | 'laquo', | ||
297 | 'larr;', | ||
298 | 'lceil;', | ||
299 | 'ldquo;', | ||
300 | 'le;', | ||
301 | 'lfloor;', | ||
302 | 'lowast;', | ||
303 | 'loz;', | ||
304 | 'lrm;', | ||
305 | 'lsaquo;', | ||
306 | 'lsquo;', | ||
307 | 'lt;', | ||
308 | 'lt', | ||
309 | 'macr;', | ||
310 | 'macr', | ||
311 | 'mdash;', | ||
312 | 'micro;', | ||
313 | 'micro', | ||
314 | 'middot;', | ||
315 | 'middot', | ||
316 | 'minus;', | ||
317 | 'mu;', | ||
318 | 'nabla;', | ||
319 | 'nbsp;', | ||
320 | 'nbsp', | ||
321 | 'ndash;', | ||
322 | 'ne;', | ||
323 | 'ni;', | ||
324 | 'not;', | ||
325 | 'not', | ||
326 | 'notin;', | ||
327 | 'nsub;', | ||
328 | 'ntilde;', | ||
329 | 'ntilde', | ||
330 | 'nu;', | ||
331 | 'oacute;', | ||
332 | 'oacute', | ||
333 | 'ocirc;', | ||
334 | 'ocirc', | ||
335 | 'oelig;', | ||
336 | 'ograve;', | ||
337 | 'ograve', | ||
338 | 'oline;', | ||
339 | 'omega;', | ||
340 | 'omicron;', | ||
341 | 'oplus;', | ||
342 | 'or;', | ||
343 | 'ordf;', | ||
344 | 'ordf', | ||
345 | 'ordm;', | ||
346 | 'ordm', | ||
347 | 'oslash;', | ||
348 | 'oslash', | ||
349 | 'otilde;', | ||
350 | 'otilde', | ||
351 | 'otimes;', | ||
352 | 'ouml;', | ||
353 | 'ouml', | ||
354 | 'para;', | ||
355 | 'para', | ||
356 | 'part;', | ||
357 | 'permil;', | ||
358 | 'perp;', | ||
359 | 'phi;', | ||
360 | 'pi;', | ||
361 | 'piv;', | ||
362 | 'plusmn;', | ||
363 | 'plusmn', | ||
364 | 'pound;', | ||
365 | 'pound', | ||
366 | 'prime;', | ||
367 | 'prod;', | ||
368 | 'prop;', | ||
369 | 'psi;', | ||
370 | 'quot;', | ||
371 | 'quot', | ||
372 | 'rArr;', | ||
373 | 'radic;', | ||
374 | 'rang;', | ||
375 | 'raquo;', | ||
376 | 'raquo', | ||
377 | 'rarr;', | ||
378 | 'rceil;', | ||
379 | 'rdquo;', | ||
380 | 'real;', | ||
381 | 'reg;', | ||
382 | 'reg', | ||
383 | 'rfloor;', | ||
384 | 'rho;', | ||
385 | 'rlm;', | ||
386 | 'rsaquo;', | ||
387 | 'rsquo;', | ||
388 | 'sbquo;', | ||
389 | 'scaron;', | ||
390 | 'sdot;', | ||
391 | 'sect;', | ||
392 | 'sect', | ||
393 | 'shy;', | ||
394 | 'shy', | ||
395 | 'sigma;', | ||
396 | 'sigmaf;', | ||
397 | 'sim;', | ||
398 | 'spades;', | ||
399 | 'sub;', | ||
400 | 'sube;', | ||
401 | 'sum;', | ||
402 | 'sup1;', | ||
403 | 'sup1', | ||
404 | 'sup2;', | ||
405 | 'sup2', | ||
406 | 'sup3;', | ||
407 | 'sup3', | ||
408 | 'sup;', | ||
409 | 'supe;', | ||
410 | 'szlig;', | ||
411 | 'szlig', | ||
412 | 'tau;', | ||
413 | 'there4;', | ||
414 | 'theta;', | ||
415 | 'thetasym;', | ||
416 | 'thinsp;', | ||
417 | 'thorn;', | ||
418 | 'thorn', | ||
419 | 'tilde;', | ||
420 | 'times;', | ||
421 | 'times', | ||
422 | 'trade;', | ||
423 | 'uArr;', | ||
424 | 'uacute;', | ||
425 | 'uacute', | ||
426 | 'uarr;', | ||
427 | 'ucirc;', | ||
428 | 'ucirc', | ||
429 | 'ugrave;', | ||
430 | 'ugrave', | ||
431 | 'uml;', | ||
432 | 'uml', | ||
433 | 'upsih;', | ||
434 | 'upsilon;', | ||
435 | 'uuml;', | ||
436 | 'uuml', | ||
437 | 'weierp;', | ||
438 | 'xi;', | ||
439 | 'yacute;', | ||
440 | 'yacute', | ||
441 | 'yen;', | ||
442 | 'yen', | ||
443 | 'yuml;', | ||
444 | 'yuml', | ||
445 | 'zeta;', | ||
446 | 'zwj;', | ||
447 | 'zwnj;' | ||
448 | ); | ||
449 | |||
450 | const PCDATA = 0; | ||
451 | const RCDATA = 1; | ||
452 | const CDATA = 2; | ||
453 | const PLAINTEXT = 3; | ||
454 | |||
455 | const DOCTYPE = 0; | ||
456 | const STARTTAG = 1; | ||
457 | const ENDTAG = 2; | ||
458 | const COMMENT = 3; | ||
459 | const CHARACTR = 4; | ||
460 | const EOF = 5; | ||
461 | |||
/**
 * Stores the input, resets the tokenizer and immediately runs the state
 * machine until the current state becomes null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data)
{
    $this->data = $data;
    $this->char = -1; // so that the first increment lands on offset 0
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;
    $this->state = 'data';

    // A state named "foo" is handled by the method fooState().
    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
476 | |||
/**
 * Delegates to the tree constructor's save() and hands back its result
 * unchanged.
 *
 * @return mixed Whatever HTML5TreeConstructer::save() returns
 *               (presumably the built DOM document -- confirm there).
 */
public function save()
{
    $saved = $this->tree->save();

    return $saved;
}
481 | |||
/**
 * Returns the character at the current pointer position.
 *
 * @return string|false The character, or false once the pointer has
 *                      reached the end-of-input offset.
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
488 | |||
/**
 * Reads from the input without moving the pointer.
 *
 * @param int $s Offset to read from.
 * @param int $l Number of characters to read; 0 means "the single
 *               character at $s".
 * @return string|null The character or substring, or null when $s + $l is
 *                     not strictly below the end-of-input offset.
 */
private function character($s, $l = 0)
{
    if ($s + $l >= $this->EOF) {
        return null;
    }

    return ($l === 0)
        ? $this->data[$s]
        : substr($this->data, $s, $l);
}
499 | |||
/**
 * Returns the longest run of characters, starting at offset $start, that
 * all belong to the given character class.
 *
 * The previous preg_replace() form returned the ENTIRE remaining input when
 * the first character was not in the class (no match means the subject is
 * returned untouched). An empty string is returned in that case now.
 *
 * @param string $char_class Body of a PCRE character class, e.g. 'A-Za-z'.
 *                           Inserted verbatim between [ and ].
 * @param int $start Offset into the input at which to start matching.
 * @return string The matching prefix, or '' when there is none.
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);

    // substr() may yield false/'' when $start is at or past the end.
    if (!is_string($rest) || $rest === '') {
        return '';
    }

    if (preg_match('#^[' . $char_class . ']+#', $rest, $found)) {
        return $found[0];
    }

    return '';
}
504 | |||
/**
 * Data state: consumes the next input character and either switches to
 * another tokenizer state or emits character tokens.
 *
 * Fixes relative to the previous version:
 *  - The "anything else" branch always consumes at least one character.
 *    Before, a '&' or '<' that reached that branch (e.g. '&' while the
 *    content model is CDATA) produced a zero-length run, moved the pointer
 *    back by one and was re-read forever.
 *  - The escape-flag checks look at the characters ENDING at the current
 *    one ("<!--" / "-->"), as the inline spec text describes, instead of a
 *    window shifted by one position.
 *
 * NOTE(review): the bulk-run branch only stops at '<' and '&', so '-' and
 * '>' inside a run never reach the escape-flag checks; they fire only when
 * such a character is the first one read by this state.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $rcdata_or_cdata = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        if ($rcdata_or_cdata && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
            ($rcdata_or_cdata && $this->escape === false))
    ) {
        /* U+003C LESS-THAN SIGN (<)
        PCDATA, or RCDATA/CDATA with the escape flag off: switch to the tag
        open state. Otherwise: treat it as per "anything else" below. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->",
        set the escape flag to false. */
        if ($rcdata_or_cdata && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state. */
        $len = strcspn($this->data, '<&', $this->char);
        if ($len < 1) {
            // The current character is itself '<' or '&' but was routed
            // here as plain text; consume exactly that one character so the
            // pointer still advances.
            $len = 1;
        }
        $char = substr($this->data, $this->char, $len);
        // Leave the pointer on the last consumed character; the next state
        // handler increments before reading.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
616 | |||
/**
 * Entity data state: tries to consume an entity and emits either the
 * decoded text or a literal ampersand, then returns to the data state.
 *
 * Only false, null and the empty string count as "nothing returned".
 * The previous truthiness test (!$entity) also rejected the string "0",
 * so an entity decoding to the digit zero was emitted as '&'.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $nothing_returned = ($entity === false || $entity === null || $entity === '');

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $nothing_returned ? '&' : $entity
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
635 | |||
/**
 * Tag open state: entered after the data state has seen a '<'.
 *
 * In RCDATA/CDATA only "</" is meaningful; anything else emits a literal
 * '<'. In PCDATA the next character selects between markup declaration,
 * close tag, start tag, bogus comment, or literal text. Any other content
 * model leaves the state untouched.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    if ($model == self::RCDATA || $model == self::CDATA) {
        // Peek at the next character; only a solidus starts a close tag.
        if ($this->character($this->char + 1) === '/') {
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        // Not a close tag: the '<' was plain text. Return to the data
        // state, which will process the next input character.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    if ($char === '!') {
        // U+0021 EXCLAMATION MARK: markup declaration open state.
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($char === '/') {
        // U+002F SOLIDUS: close tag open state.
        $this->state = 'closeTagOpen';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $char)) {
        // ASCII letter: begin a start tag token with the lowercased letter
        // as its name. It is not emitted yet; later states fill it in.
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        // U+003E GREATER-THAN SIGN: parse error, "<>" is emitted as text.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if ($char === '?') {
        // U+003F QUESTION MARK: parse error, bogus comment state.
        $this->state = 'bogusComment';
        return;
    }

    // Anything else: parse error. Emit '<' as text and step back so the
    // data state reconsumes the current character.
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
727 | |||
/**
 * Close tag open state: entered after "</" has been seen.
 *
 * In RCDATA/CDATA the upcoming name must equal the nodeName of the last
 * element on the tree constructor's stack and be followed by a delimiter;
 * otherwise "</" is emitted as text. In every other case the next
 * character decides between an end tag token, a silently dropped "</>",
 * EOF handling, or a bogus comment.
 */
private function closeTagOpenState()
{
    // Lowercased run of ASCII letters that follows the pointer.
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $restricted = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    $reject = false;
    if ($restricted) {
        if (!$matches_open) {
            $reject = true;
        } else {
            // Character right after the candidate name: must be one of
            // tab, LF, VT, FF, space, '>' or '/'.
            $follower = $this->character($this->char + 1 + strlen($candidate));
            $reject = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower)
                || $this->EOF === $this->char;
        }
    }

    if ($reject) {
        // Parse error: emit "</" as text and go back to the data state.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->state = 'data';
        return;
    }

    // PCDATA, or a matching name in RCDATA/CDATA: consume the next
    // input character.
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[A-Za-z]$/', $char)) {
        // ASCII letter: begin an end tag token named after the lowercased
        // letter; it is not emitted until later states complete it.
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($char === '>') {
        // "</>": parse error, nothing is emitted.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // EOF: parse error. Emit "</" as text and step back so the data
        // state reconsumes the EOF.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: parse error, bogus comment state.
    $this->state = 'bogusComment';
}
808 | |||
/**
 * Tag name state: accumulates the name of the current tag token until
 * whitespace, '/', '>' or EOF ends it.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Tab, LF, VT, FF or space ends the name; attributes may follow.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: parse error. Emit what we have, then step back so the data
    // state reconsumes the EOF.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // '/': parse error unless it is a permitted slash; either way move on
    // to the before attribute name state.
    if ($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else is part of the (lowercased) tag name.
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
853 | |||
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute name starts, the tag closes, or input ends.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Whitespace (tab, LF, VT, FF, space): keep waiting.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // '/': parse error unless it is a permitted slash; keep waiting.
    if ($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF: parse error. Emit the tag and let the data state reconsume EOF.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute: name is the lowercased
    // character, value starts out as null.
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->state = 'attributeName';
}
903 | |||
/**
 * Attribute name state: accumulates the name of the newest attribute of
 * the current tag token.
 *
 * A '/' now always ends the name and switches to the before attribute name
 * state. Previously a '/' immediately followed by '>' failed the lookahead
 * and fell through to "anything else", so the slash was appended to the
 * attribute name (e.g. `<br foo/>` produced an attribute called "foo/").
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state (which emits the tag on the following '>'). */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
955 | |||
/**
 * After attribute name state: an attribute name has ended with whitespace;
 * decide whether a value, another attribute, or the end of the tag follows.
 *
 * A '/' now always switches to the before attribute name state. Previously
 * a '/' immediately followed by '>' failed the lookahead and fell through
 * to "anything else", which started a bogus attribute named "/"
 * (e.g. for `<br foo />`).
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state (which emits the tag on the
        following '>'). */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the (lowercased) current input character; its value starts
        out as null. Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
1010 | |||
/**
 * Before attribute value state: an '=' has been seen; skip whitespace and
 * decide how the attribute value is delimited.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Whitespace (tab, LF, VT, FF, space): keep waiting for the value.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeValue';
        return;
    }

    // Opening double quote.
    if ($char === '"') {
        $this->state = 'attributeValueDoubleQuoted';
        return;
    }

    // '&' belongs to an unquoted value; step back so that state
    // reconsumes it.
    if ($char === '&') {
        $this->char--;
        $this->state = 'attributeValueUnquoted';
        return;
    }

    // Opening single quote.
    if ($char === '\'') {
        $this->state = 'attributeValueSingleQuoted';
        return;
    }

    // '>' completes the tag without a value.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // Anything else is the first character of an unquoted value of the
    // most recently added attribute.
    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['value'] .= $char;
    $this->state = 'attributeValueUnquoted';
}
1059 | |||
/**
 * Attribute value (double-quoted) state.
 *
 * Reads one character of a double-quoted attribute value and either closes
 * the value, decodes an entity, gives up at EOF, or appends the character
 * to the value of the most recently started attribute.
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // U+0022 QUOTATION MARK ("): the value is complete.
    if ($current === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // U+0026 AMPERSAND (&): try to consume an entity into the value.
    if ($current === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    // EOF: parse error. Emit the tag token as it stands and let the data
    // state reconsume the EOF.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else extends the value of the last attribute; the state
    // remains "attribute value (double-quoted)".
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $current;

    $this->state = 'attributeValueDoubleQuoted';
}
1095 | |||
/**
 * Attribute value (single-quoted) state.
 *
 * Reads one character of a single-quoted attribute value and either closes
 * the value, decodes an entity, gives up at EOF, or appends the character
 * to the value of the most recently started attribute.
 */
private function attributeValueSingleQuotedState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    // U+0027 APOSTROPHE ('): the value is complete.
    if ($current === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // U+0026 AMPERSAND (&): try to consume an entity into the value.
    if ($current === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    // EOF: parse error. Emit the tag token as it stands and let the data
    // state reconsume the EOF.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else extends the value of the last attribute; the state
    // remains "attribute value (single-quoted)".
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $current;

    $this->state = 'attributeValueSingleQuoted';
}
1131 | |||
/**
 * Attribute value (unquoted) state.
 *
 * Reads one character of an unquoted attribute value. Whitespace ends the
 * value, "&" starts an entity, ">" ends the tag, and anything else is
 * appended to the value of the most recently started attribute.
 *
 * The end of the input is handled explicitly: the current tag token is
 * emitted and the EOF is reconsumed in the data state, as the quoted
 * attribute value states do. Without this branch an unterminated unquoted
 * value would keep "appending" past the end of the data and never leave
 * this state.
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. ">=" is used because this state can be
        entered with the position already at the end of the input; the
        position is then pinned just before EOF, as the bogus comment state
        does. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } elseif (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the entity in attribute value state. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1168 | |||
/**
 * Entity in attribute value state.
 *
 * Attempts to consume an entity and appends the result to the value of the
 * most recently started attribute. When no entity could be consumed a
 * literal "&" is appended instead.
 *
 * The quoted attribute value states call this method with an extra argument
 * ('double' / 'single'); it is not declared here and is ignored.
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // entity() returns false when nothing could be consumed. A plain
    // truthiness test is not enough here: an entity that decodes to the
    // string "0" (e.g. &#48;) is falsy in PHP and would wrongly be replaced
    // by an ampersand.
    $char = (is_string($entity) && $entity !== '')
        ? $entity
        : '&';

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
1184 | |||
/**
 * Bogus comment state.
 *
 * Takes every character from the current position up to (but not
 * including) the first ">" or the end of the input, emits it as a single
 * comment token and hands control back to the data state.
 */
private function bogusCommentState()
{
    // Everything that is not a U+003E GREATER-THAN SIGN, starting at the
    // character that caused the switch into this state.
    $text = $this->characters('^>', $this->char);

    $comment = array(
        'data' => $text,
        'type' => self::COMMENT
    );
    $this->emitToken($comment);

    // Skip over what has just been emitted.
    $this->char += strlen($text);

    // Switch to the data state.
    $this->state = 'data';

    // If that took us to the end of the file, step back one position so
    // that the EOF is reconsumed.
    if ($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
1213 | |||
/**
 * Markup declaration open state.
 *
 * Looks ahead without consuming to decide whether a comment, a DOCTYPE or
 * a bogus comment starts here, then consumes the matching prefix.
 */
private function markupDeclarationOpenState()
{
    // Two U+002D HYPHEN-MINUS characters open a real comment: consume them
    // and start a comment token with no data yet.
    if ($this->character($this->char + 1, 2) === '--') {
        $this->char += 2;
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->state = 'comment';
        return;
    }

    // A case-insensitive "DOCTYPE" (seven characters) opens a DOCTYPE:
    // consume the keyword and switch to the DOCTYPE state.
    $keyword = strtolower($this->character($this->char + 1, 7));
    if ($keyword === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Otherwise this is a parse error. The next character consumed, if
    // any, becomes the first character of a bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
1242 | |||
/**
 * Comment state.
 *
 * Consumes one character of comment content: a hyphen may begin the
 * comment terminator, EOF ends the comment prematurely, anything else is
 * added to the comment token's data.
 */
private function commentState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // U+002D HYPHEN-MINUS (-): possibly the start of "-->".
    if ($current === '-') {
        $this->state = 'commentDash';
        return;
    }

    // EOF: parse error. Emit the comment collected so far and reconsume
    // the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else belongs to the comment; stay in the comment state.
    $this->token['data'] .= $current;
}
1269 | |||
/**
 * Comment dash state.
 *
 * Entered after a single "-" inside a comment. A second "-" moves on to the
 * comment end state; any other character means the dash was ordinary
 * comment content.
 */
private function commentDashState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // U+002D HYPHEN-MINUS (-): two dashes in a row.
    if ($current === '-') {
        $this->state = 'commentEnd';
        return;
    }

    // EOF: parse error. Emit the comment collected so far and reconsume
    // the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the pending dash and this character are both comment
    // data. Go back to the comment state.
    $this->token['data'] .= '-' . $current;
    $this->state = 'comment';
}
1297 | |||
/**
 * Comment end state.
 *
 * Entered after "--" inside a comment. ">" finishes the comment, a further
 * "-" is kept as comment data, EOF ends the comment prematurely, and any
 * other character means the two dashes were ordinary comment content.
 */
private function commentEndState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // U+003E GREATER-THAN SIGN (>): emit the comment, back to data.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // U+002D HYPHEN-MINUS (-): keep one dash as data and stay here.
    if ($current === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit what has been collected and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: both pending dashes and this character are comment
    // data. Go back to the comment state.
    $this->token['data'] .= '--' . $current;
    $this->state = 'comment';
}
1321 | |||
/**
 * DOCTYPE state.
 *
 * Consumes the character that follows the "DOCTYPE" keyword. A whitespace
 * character is swallowed; any other character is handed back so that the
 * before DOCTYPE name state sees it.
 */
private function doctypeState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // Not one of tab, LF, VT, FF or space: reconsume it in the next state.
    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->char--;
    }

    // Either way the before DOCTYPE name state comes next.
    $this->state = 'beforeDoctypeName';
}
1336 | |||
/**
 * Before DOCTYPE name state.
 *
 * Skips whitespace in front of the DOCTYPE name and starts a DOCTYPE token
 * with the first name character. Every token created here is flagged as
 * being in error. If ">" or EOF arrives before any name, a nameless DOCTYPE
 * token is emitted instead.
 */
private function beforeDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // Tab, LF, VT, FF or space: stay in the before DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    // Lowercase ASCII letter: begin the name with its uppercase form.
    if (preg_match('/^[a-z]$/', $current)) {
        $this->token = array(
            'name' => strtoupper($current),
            'type' => self::DOCTYPE,
            'error' => true
        );

        $this->state = 'doctypeName';
        return;
    }

    // ">" or EOF before any name: emit a nameless DOCTYPE token and return
    // to the data state.
    if ($current === '>' || $this->char === $this->EOF) {
        $this->emitToken(
            array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            )
        );

        // Only the EOF case steps back, so that the EOF is reconsumed.
        if ($current !== '>') {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Any other character starts the name unchanged.
    $this->token = array(
        'name' => $current,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
1388 | |||
/**
 * DOCTYPE name state.
 *
 * Collects the DOCTYPE name one character at a time (lowercase ASCII
 * letters are upper-cased). After every step the token's error flag is
 * recomputed: the token is considered correct only while its name is
 * exactly "HTML".
 */
private function doctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name. The state name is spelled with a
        // lowercase first letter like every other state name in this
        // tokenizer, so that it matches afterDoctypeNameState() exactly.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        // End of the DOCTYPE: emit it and return to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        // Lowercase ASCII letter: append its uppercase form to the name.
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // EOF: emit the token and reconsume the EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        // Any other character is appended to the name unchanged.
        $this->token['name'] .= $char;
    }

    $this->token['error'] = ($this->token['name'] === 'HTML')
        ? false
        : true;
}
1418 | |||
/**
 * After DOCTYPE name state.
 *
 * Skips whitespace after the DOCTYPE name and waits for the closing ">".
 * Any other content marks the DOCTYPE as being in error and switches to the
 * bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // Tab, LF, VT, FF or space: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        return;
    }

    // U+003E GREATER-THAN SIGN (>): emit the DOCTYPE, back to data.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: emit the DOCTYPE and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else makes the DOCTYPE bogus.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
1442 | |||
/**
 * Bogus DOCTYPE state.
 *
 * Discards characters until the closing ">" or the end of the input, then
 * emits the current DOCTYPE token and returns to the data state.
 */
private function bogusDoctypeState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->char();

    // U+003E GREATER-THAN SIGN (>): emit the DOCTYPE, back to data.
    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: emit the DOCTYPE and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
    }

    // Any other character is ignored; stay in the bogus DOCTYPE state.
}
1462 | |||
/**
 * Attempts to consume an entity (character reference).
 *
 * On entry $this->char is the position of the U+0026 AMPERSAND. On success
 * $this->char is left on the last character that belongs to the entity
 * (including a terminating semicolon when present) and the decoded
 * character is returned. On failure no characters are consumed.
 *
 * Fixes relative to the previous implementation:
 *  - the hexadecimal marker is looked up after the "#" (the old code
 *    re-tested the "#" itself, so "&#x..;" was never treated as hex);
 *  - the numeric entity text is built from the digits actually read rather
 *    than from a substring whose length was an absolute input position;
 *  - the digits of a numeric entity are consumed, as the characters of a
 *    named entity already were;
 *  - named entities use the longest matching identifier, as the comment in
 *    the named-entity branch requires, not the shortest one;
 *  - an identifier that already ends in ";" no longer yields a doubled
 *    semicolon when it is decoded.
 *
 * @return string|false The decoded character(s), or false when no entity
 *                      could be matched.
 */
private function entity()
{
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the identity of the next character (the
    // one immediately after the U+0026 AMPERSAND character):
    if ($this->character($start + 1) === '#') {
        // U+0023 NUMBER SIGN (#): numeric entity. The behaviour further
        // depends on the character after the number sign.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');

        // Hexadecimal entities use 0-9, A-F, a-f; decimal ones just 0-9.
        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';
        $digits_at = $start + ($is_hex ? 3 : 2);

        // Consume as many characters as match the range given above. The
        // result is re-validated so that only a non-empty run of digits is
        // ever accepted.
        $e_name = $this->characters($char_class, $digits_at);

        if (is_string($e_name) && preg_match('/^[' . $char_class . ']+\z/', $e_name)) {
            $entity = '#' . ($is_hex ? 'x' : '') . $e_name;

            // Leave the position on the last digit ...
            $this->char = $digits_at + strlen($e_name) - 1;

            // ... or on the terminating semicolon when there is one.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }
    } else {
        // Anything else: named entity. Consume the maximum number of
        // characters possible, with the consumed characters
        // case-sensitively matching one of the identifiers in the entities
        // table, so candidates are tried from longest to shortest.
        $e_name = (string)$this->characters('0-9A-Za-z;', $start + 1);

        for ($c = strlen($e_name); $c >= 1; $c--) {
            $id = substr($e_name, 0, $c);

            if (in_array($id, $this->entities, true)) {
                $entity = $id;
                $this->char = $start + $c;

                // consume extra semicolon
                if ($id[$c - 1] !== ';' && $c < strlen($e_name) && $e_name[$c] === ';') {
                    $this->char++;
                }
                break;
            }
        }
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return a character token for the character corresponding to the
    // entity. A trailing semicolon is stripped before the entity is
    // rebuilt so that exactly one is present.
    return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
1553 | |||
/**
 * Passes a token to the tree constructor and updates the content model.
 *
 * When the tree constructor answers with an integer, that value becomes the
 * new content model. Otherwise an end tag token resets the content model to
 * PCDATA.
 *
 * @param array $token The token to emit.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
1565 | |||
/**
 * Ends tokenisation: clears the current state and sends an end-of-file
 * token straight to the tree constructor.
 */
private function EOF()
{
    $this->state = null;

    $eof = array(
        'type' => self::EOF
    );
    $this->tree->emitToken($eof);
}
1575 | } | ||
1576 | |||
1577 | class HTML5TreeConstructer | ||
1578 | { | ||
// Stack of open elements; index 0 holds the root "html" element.
public $stack = array();

// Current phase (one of the *_PHASE constants).
private $phase;
// Current insertion mode within the main phase.
private $mode;
// The DOMDocument being built.
private $dom;
private $foster_parent = null;
private $a_formatting = array();

// Set to the "head" element once it has been inserted.
private $head_pointer = null;
private $form_pointer = null;

// Element names belonging to the "scoping" category.
private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');

// Element names belonging to the "formatting" category.
private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr',
    's', 'small', 'strike', 'strong', 'tt', 'u'
);

// Element names belonging to the "special" category.
private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body', 'br',
    'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr',
    'iframe', 'image', 'img', 'input', 'isindex',
    'li', 'link', 'listing', 'menu', 'meta',
    'noembed', 'noframes', 'noscript',
    'ol', 'optgroup', 'option', 'p', 'param', 'plaintext', 'pre',
    'script', 'select', 'spacer', 'style',
    'tbody', 'textarea', 'tfoot', 'thead', 'title', 'tr',
    'ul', 'wbr'
);

// The different phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE = 3;

// The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD = 1;
const AFTER_HEAD = 2;
const IN_BODY = 3;
const IN_TABLE = 4;
const IN_CAPTION = 5;
const IN_CGROUP = 6;
const IN_TBODY = 7;
const IN_ROW = 8;
const IN_CELL = 9;
const IN_SELECT = 10;
const AFTER_BODY = 11;
const IN_FRAME = 12;
const AFTR_FRAME = 13;

// The different types of elements.
const SPECIAL = 0;
const SCOPING = 1;
const FORMATTING = 2;
const PHRASING = 3;

const MARKER = 0;
1699 | |||
/**
 * Starts tree construction in the initial phase with the "before head"
 * insertion mode and creates the target DOM document.
 */
public function __construct()
{
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
}
1711 | |||
/**
 * Processes a token from the tokeniser by routing it to the handler of the
 * current tree construction phase.
 *
 * @param array $token The token to process.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase matches none of the known phases.
 */
public function emitToken($token)
{
    // Each case returns directly, so no break statements are needed.
    switch ($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);

        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);

        case self::MAIN_PHASE:
            return $this->mainPhase($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }
}
1730 | |||
/**
 * Initial phase of tree construction.
 *
 * @param array $token The token to process.
 * @return mixed The result of the root element phase when the token is
 *               forwarded to it, otherwise null.
 */
private function initPhase($token)
{
    $whitespace = '/^[\t\n\x0b\x0c ]+$/';

    // A DOCTYPE token that is marked as being in error.
    $in_error = isset($token['error']) && $token['error'];

    /* A DOCTYPE token in error, a comment token, a start tag token, an end
    tag token, an end-of-file token, or a character token that is not made
    up solely of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), U+000B
    LINE TABULATION, U+000C FORM FEED (FF) or U+0020 SPACE. */
    if ($in_error ||
        in_array(
            $token['type'],
            array(HTML5::COMMENT, HTML5::STARTTAG, HTML5::ENDTAG, HTML5::EOF),
            true
        ) ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
            !preg_match($whitespace, $token['data']))
    ) {
        /* The specification does not define how to handle this case, so
        the token is simply handed over to the root element phase. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if (isset($token['error']) && !$token['error']) {
        /* A DocumentType object is created for the "HTML" doctype.
        NOTE(review): it is never attached to $this->dom in this method. */
        $doctype = new DOMDocumentType(null, null, 'HTML');

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token consisting only of the whitespace characters
    listed above: append it to the Document node. */
    if (isset($token['data']) && preg_match($whitespace, $token['data'])) {
        $this->dom->appendChild(
            $this->dom->createTextNode($token['data'])
        );
    }
}
1785 | |||
/**
 * Root element phase of tree construction.
 *
 * DOCTYPE tokens are ignored, comments and whitespace are appended to the
 * document, and any other character, start tag, end tag or end-of-file
 * token creates the root "html" element and is then reprocessed in the
 * main phase.
 *
 * @param array $token The token to process.
 * @return mixed The result of the main phase when the token is forwarded to
 *               it, otherwise null.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error. Ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node to the Document object with
    the data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->dom->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* A character token made up only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF) or
    U+0020 SPACE: append it to the Document node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->dom->appendChild(
            $this->dom->createTextNode($token['data'])
        );
        return;
    }

    /* Any other character token, a start tag token, an end tag token or an
    end-of-file token. */
    $needs_root = ($type === HTML5::CHARACTR &&
            !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG ||
        $type === HTML5::EOF;

    if ($needs_root) {
        /* Create an "html" element, append it to the Document object and
        push it onto the stack. Switch to the main phase and reprocess the
        current token. */
        $root = $this->dom->createElement('html');
        $this->dom->appendChild($root);
        $this->stack[] = $root;

        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
}
1835 | |||
/**
 * Main phase of tree construction.
 *
 * Handles the tokens that are treated identically in every insertion mode
 * (DOCTYPE, an "html" start tag, end-of-file) and dispatches everything
 * else to the handler of the current insertion mode.
 *
 * The former "case self::END_PHASE" entry of the dispatch switch has been
 * removed: END_PHASE has the same value (3) as IN_BODY, which is listed
 * earlier in the switch, so that entry could never be reached. The trailing
 * end phase is dispatched by emitToken() on $this->phase.
 *
 * @param array $token The token to process.
 * @return mixed The result of the insertion mode handler, otherwise null.
 */
private function mainPhase($token)
{
    /* Tokens in the main phase must be handled as follows: */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

        /* A start tag token with the tag name "html" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

        /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Anything else. */
    } else {
        /* Depends on the insertion mode. Every case returns, so no break
        statements are needed. */
        switch ($this->mode) {
            case self::BEFOR_HEAD:
                return $this->beforeHead($token);
            case self::IN_HEAD:
                return $this->inHead($token);
            case self::AFTER_HEAD:
                return $this->afterHead($token);
            case self::IN_BODY:
                return $this->inBody($token);
            case self::IN_TABLE:
                return $this->inTable($token);
            case self::IN_CAPTION:
                return $this->inCaption($token);
            case self::IN_CGROUP:
                return $this->inColumnGroup($token);
            case self::IN_TBODY:
                return $this->inTableBody($token);
            case self::IN_ROW:
                return $this->inRow($token);
            case self::IN_CELL:
                return $this->inCell($token);
            case self::IN_SELECT:
                return $this->inSelect($token);
            case self::AFTER_BODY:
                return $this->afterBody($token);
            case self::IN_FRAME:
                return $this->inFrameset($token);
            case self::AFTR_FRAME:
                return $this->afterFrameset($token);
        }
    }
}
1916 | |||
/**
 * "Before head" insertion mode.
 *
 * Whitespace and comments are inserted at the current node, a "head" start
 * tag creates the head element, and any token that implies a head makes
 * this method act as if a "head" start tag had been seen before the token
 * is reprocessed by inHead(). Other end tags are ignored.
 *
 * @param array $token The token to process.
 * @return mixed The result of inHead() when the token is reprocessed there,
 *               otherwise null.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    /* A character token made up only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF) or
    U+0020 SPACE: append the character to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data given in the comment token. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head": create the element,
    remember it as the head element pointer and change the insertion mode
    to "in head". */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag token, an end tag with the tag name "html", or a
    character token that is not a single whitespace character. */
    $implies_head = $type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($type === HTML5::CHARACTR && !preg_match(
            '/^[\t\n\x0b\x0c ]$/',
            $token['data']
        ));

    if ($implies_head) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $implied_head = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($implied_head);

        return $this->inHead($token);
    }

    /* Any other end tag: parse error. Ignore the token. */
}
1977 | |||
/**
 * Tree-construction handler for the "in head" insertion mode.
 *
 * Also called from afterHead() and inBody() for tags that belong in the
 * head, so it must cope with $this->head_pointer being null (the
 * innerHTML case, where no head element was ever created).
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character/comment tokens, 'name' for tags.
 *
 * @return mixed HTML5::PCDATA, HTML5::RCDATA or HTML5::CDATA when the
 *               tokeniser's content model flag has to change; the result
 *               of afterHead() when the token is re-dispatched there;
 *               null otherwise.
 */
private function inHead($token)
{
    $type = $token['type'];

    /* Elements whose text content is collected verbatim by this mode. */
    $rawTextElements = array('title', 'style', 'script');

    /* A character token consisting only of U+0009, U+000A, U+000B, U+000C
    or U+0020.

    THIS DIFFERS FROM THE SPEC: if the current node is a title, style or
    script element, the character data is appended to it regardless of
    its content. */
    if ($type === HTML5::CHARACTR && (
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
            in_array(end($this->stack)->nodeName, $rawTextElements, true)
        )
    ) {
        /* Append the character data to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node carrying the token's data. */
        $this->insertComment($token['data']);

    /* An end tag for title, style or script: leave the element and put
    the tokeniser back into the PCDATA content model. */
    } elseif ($type === HTML5::ENDTAG && in_array($token['name'], $rawTextElements, true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag for title, style or script */
    } elseif ($type === HTML5::STARTTAG && in_array($token['name'], $rawTextElements, true)) {
        /* Create an element for the token and append it to the node the
        head element pointer refers to, or, if that is null (innerHTML
        case), to the current node. The script branch used to dereference
        the pointer unconditionally, which is a fatal error when it is
        null; it now shares the fallback title and style already had. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
        } else {
            $this->insertElement($token);
        }

        /* title switches the tokeniser to RCDATA; style and script to
        CDATA. */
        return $token['name'] === 'title' ? HTML5::RCDATA : HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('base', 'link', 'meta'), true)
    ) {
        /* Same placement rule as above; these elements have no content,
        so the element is popped again straight away. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            // NOTE(review): unlike the branch above, nothing is popped
            // here - confirm against insertElement() whether the element
            // is left on the stack of open elements in the innerHTML case.
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is the head element, pop it off the stack
        of open elements. Otherwise (including when there is no head
        element at all) this is a parse error and nothing is popped. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($type === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($type === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is the head element, act as if an end tag
        token with the tag name "head" had been seen (which also switches
        the mode to "after head"). */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))
        ) {
            $this->inHead(
                array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                )
            );

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
2112 | |||
/**
 * Tree-construction handler for the "after head" insertion mode.
 *
 * Whitespace-only character tokens and comments are inserted as-is.
 * A "body" or "frameset" start tag is inserted and selects the IN_BODY
 * or IN_FRAME mode respectively. Head-only start tags (base, link, meta,
 * script, style, title) switch back to IN_HEAD and are reprocessed by
 * inHead(). Every other token implies a body element: a synthetic "body"
 * start tag is processed first and the original token is then handed to
 * inBody().
 *
 * @param array $token Token array. 'type' is always read; 'data' is read
 *                     for character/comment tokens, 'name' for start tags.
 *
 * @return mixed The result of inHead() or inBody() when the token is
 *               re-dispatched, otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];

    /* Character token made up solely of U+0009, U+000A, U+000B, U+000C
    or U+0020: append it to the current node. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        $tagName = $token['name'];

        /* <body>: insert it and change the insertion mode to "in body". */
        if ($tagName === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* <frameset>: insert it and change the insertion mode to
        "in frameset". */
        if ($tagName === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* Head-only elements after </head>: parse error. Switch the
        insertion mode back to "in head" and reprocess the token. */
        $headOnlyTags = array('base', 'link', 'meta', 'script', 'style', 'title');
        if (in_array($tagName, $headOnlyTags)) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a "body" start tag with no attributes had
    been seen, and then reprocess the current token. */
    $syntheticBody = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($syntheticBody);

    return $this->inBody($token);
}
2175 | |||
2176 | private function inBody($token) | ||
2177 | { | ||
2178 | /* Handle the token as follows: */ | ||
2179 | |||
2180 | switch ($token['type']) { | ||
2181 | /* A character token */ | ||
2182 | case HTML5::CHARACTR: | ||
2183 | /* Reconstruct the active formatting elements, if any. */ | ||
2184 | $this->reconstructActiveFormattingElements(); | ||
2185 | |||
2186 | /* Append the token's character to the current node. */ | ||
2187 | $this->insertText($token['data']); | ||
2188 | break; | ||
2189 | |||
2190 | /* A comment token */ | ||
2191 | case HTML5::COMMENT: | ||
2192 | /* Append a Comment node to the current node with the data | ||
2193 | attribute set to the data given in the comment token. */ | ||
2194 | $this->insertComment($token['data']); | ||
2195 | break; | ||
2196 | |||
2197 | case HTML5::STARTTAG: | ||
2198 | switch ($token['name']) { | ||
2199 | /* A start tag token whose tag name is one of: "script", | ||
2200 | "style" */ | ||
2201 | case 'script': | ||
2202 | case 'style': | ||
2203 | /* Process the token as if the insertion mode had been "in | ||
2204 | head". */ | ||
2205 | return $this->inHead($token); | ||
2206 | break; | ||
2207 | |||
2208 | /* A start tag token whose tag name is one of: "base", "link", | ||
2209 | "meta", "title" */ | ||
2210 | case 'base': | ||
2211 | case 'link': | ||
2212 | case 'meta': | ||
2213 | case 'title': | ||
2214 | /* Parse error. Process the token as if the insertion mode | ||
2215 | had been "in head". */ | ||
2216 | return $this->inHead($token); | ||
2217 | break; | ||
2218 | |||
2219 | /* A start tag token with the tag name "body" */ | ||
2220 | case 'body': | ||
2221 | /* Parse error. If the second element on the stack of open | ||
2222 | elements is not a body element, or, if the stack of open | ||
2223 | elements has only one node on it, then ignore the token. | ||
2224 | (innerHTML case) */ | ||
2225 | if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { | ||
2226 | // Ignore | ||
2227 | |||
2228 | /* Otherwise, for each attribute on the token, check to see | ||
2229 | if the attribute is already present on the body element (the | ||
2230 | second element) on the stack of open elements. If it is not, | ||
2231 | add the attribute and its corresponding value to that | ||
2232 | element. */ | ||
2233 | } else { | ||
2234 | foreach ($token['attr'] as $attr) { | ||
2235 | if (!$this->stack[1]->hasAttribute($attr['name'])) { | ||
2236 | $this->stack[1]->setAttribute($attr['name'], $attr['value']); | ||
2237 | } | ||
2238 | } | ||
2239 | } | ||
2240 | break; | ||
2241 | |||
2242 | /* A start tag whose tag name is one of: "address", | ||
2243 | "blockquote", "center", "dir", "div", "dl", "fieldset", | ||
2244 | "listing", "menu", "ol", "p", "ul" */ | ||
2245 | case 'address': | ||
2246 | case 'blockquote': | ||
2247 | case 'center': | ||
2248 | case 'dir': | ||
2249 | case 'div': | ||
2250 | case 'dl': | ||
2251 | case 'fieldset': | ||
2252 | case 'listing': | ||
2253 | case 'menu': | ||
2254 | case 'ol': | ||
2255 | case 'p': | ||
2256 | case 'ul': | ||
2257 | /* If the stack of open elements has a p element in scope, | ||
2258 | then act as if an end tag with the tag name p had been | ||
2259 | seen. */ | ||
2260 | if ($this->elementInScope('p')) { | ||
2261 | $this->emitToken( | ||
2262 | array( | ||
2263 | 'name' => 'p', | ||
2264 | 'type' => HTML5::ENDTAG | ||
2265 | ) | ||
2266 | ); | ||
2267 | } | ||
2268 | |||
2269 | /* Insert an HTML element for the token. */ | ||
2270 | $this->insertElement($token); | ||
2271 | break; | ||
2272 | |||
2273 | /* A start tag whose tag name is "form" */ | ||
2274 | case 'form': | ||
2275 | /* If the form element pointer is not null, ignore the | ||
2276 | token with a parse error. */ | ||
2277 | if ($this->form_pointer !== null) { | ||
2278 | // Ignore. | ||
2279 | |||
2280 | /* Otherwise: */ | ||
2281 | } else { | ||
2282 | /* If the stack of open elements has a p element in | ||
2283 | scope, then act as if an end tag with the tag name p | ||
2284 | had been seen. */ | ||
2285 | if ($this->elementInScope('p')) { | ||
2286 | $this->emitToken( | ||
2287 | array( | ||
2288 | 'name' => 'p', | ||
2289 | 'type' => HTML5::ENDTAG | ||
2290 | ) | ||
2291 | ); | ||
2292 | } | ||
2293 | |||
2294 | /* Insert an HTML element for the token, and set the | ||
2295 | form element pointer to point to the element created. */ | ||
2296 | $element = $this->insertElement($token); | ||
2297 | $this->form_pointer = $element; | ||
2298 | } | ||
2299 | break; | ||
2300 | |||
2301 | /* A start tag whose tag name is "li", "dd" or "dt" */ | ||
2302 | case 'li': | ||
2303 | case 'dd': | ||
2304 | case 'dt': | ||
2305 | /* If the stack of open elements has a p element in scope, | ||
2306 | then act as if an end tag with the tag name p had been | ||
2307 | seen. */ | ||
2308 | if ($this->elementInScope('p')) { | ||
2309 | $this->emitToken( | ||
2310 | array( | ||
2311 | 'name' => 'p', | ||
2312 | 'type' => HTML5::ENDTAG | ||
2313 | ) | ||
2314 | ); | ||
2315 | } | ||
2316 | |||
2317 | $stack_length = count($this->stack) - 1; | ||
2318 | |||
2319 | for ($n = $stack_length; 0 <= $n; $n--) { | ||
2320 | /* 1. Initialise node to be the current node (the | ||
2321 | bottommost node of the stack). */ | ||
2322 | $stop = false; | ||
2323 | $node = $this->stack[$n]; | ||
2324 | $cat = $this->getElementCategory($node->tagName); | ||
2325 | |||
2326 | /* 2. If node is an li, dd or dt element, then pop all | ||
2327 | the nodes from the current node up to node, including | ||
2328 | node, then stop this algorithm. */ | ||
2329 | if ($token['name'] === $node->tagName || ($token['name'] !== 'li' | ||
2330 | && ($node->tagName === 'dd' || $node->tagName === 'dt')) | ||
2331 | ) { | ||
2332 | for ($x = $stack_length; $x >= $n; $x--) { | ||
2333 | array_pop($this->stack); | ||
2334 | } | ||
2335 | |||
2336 | break; | ||
2337 | } | ||
2338 | |||
2339 | /* 3. If node is not in the formatting category, and is | ||
2340 | not in the phrasing category, and is not an address or | ||
2341 | div element, then stop this algorithm. */ | ||
2342 | if ($cat !== self::FORMATTING && $cat !== self::PHRASING && | ||
2343 | $node->tagName !== 'address' && $node->tagName !== 'div' | ||
2344 | ) { | ||
2345 | break; | ||
2346 | } | ||
2347 | } | ||
2348 | |||
2349 | /* Finally, insert an HTML element with the same tag | ||
2350 | name as the token's. */ | ||
2351 | $this->insertElement($token); | ||
2352 | break; | ||
2353 | |||
2354 | /* A start tag token whose tag name is "plaintext" */ | ||
2355 | case 'plaintext': | ||
2356 | /* If the stack of open elements has a p element in scope, | ||
2357 | then act as if an end tag with the tag name p had been | ||
2358 | seen. */ | ||
2359 | if ($this->elementInScope('p')) { | ||
2360 | $this->emitToken( | ||
2361 | array( | ||
2362 | 'name' => 'p', | ||
2363 | 'type' => HTML5::ENDTAG | ||
2364 | ) | ||
2365 | ); | ||
2366 | } | ||
2367 | |||
2368 | /* Insert an HTML element for the token. */ | ||
2369 | $this->insertElement($token); | ||
2370 | |||
2371 | return HTML5::PLAINTEXT; | ||
2372 | break; | ||
2373 | |||
2374 | /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", | ||
2375 | "h5", "h6" */ | ||
2376 | case 'h1': | ||
2377 | case 'h2': | ||
2378 | case 'h3': | ||
2379 | case 'h4': | ||
2380 | case 'h5': | ||
2381 | case 'h6': | ||
2382 | /* If the stack of open elements has a p element in scope, | ||
2383 | then act as if an end tag with the tag name p had been seen. */ | ||
2384 | if ($this->elementInScope('p')) { | ||
2385 | $this->emitToken( | ||
2386 | array( | ||
2387 | 'name' => 'p', | ||
2388 | 'type' => HTML5::ENDTAG | ||
2389 | ) | ||
2390 | ); | ||
2391 | } | ||
2392 | |||
2393 | /* If the stack of open elements has in scope an element whose | ||
2394 | tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then | ||
2395 | this is a parse error; pop elements from the stack until an | ||
2396 | element with one of those tag names has been popped from the | ||
2397 | stack. */ | ||
2398 | while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { | ||
2399 | array_pop($this->stack); | ||
2400 | } | ||
2401 | |||
2402 | /* Insert an HTML element for the token. */ | ||
2403 | $this->insertElement($token); | ||
2404 | break; | ||
2405 | |||
2406 | /* A start tag whose tag name is "a" */ | ||
2407 | case 'a': | ||
2408 | /* If the list of active formatting elements contains | ||
2409 | an element whose tag name is "a" between the end of the | ||
2410 | list and the last marker on the list (or the start of | ||
2411 | the list if there is no marker on the list), then this | ||
2412 | is a parse error; act as if an end tag with the tag name | ||
2413 | "a" had been seen, then remove that element from the list | ||
2414 | of active formatting elements and the stack of open | ||
2415 | elements if the end tag didn't already remove it (it | ||
2416 | might not have if the element is not in table scope). */ | ||
2417 | $leng = count($this->a_formatting); | ||
2418 | |||
2419 | for ($n = $leng - 1; $n >= 0; $n--) { | ||
2420 | if ($this->a_formatting[$n] === self::MARKER) { | ||
2421 | break; | ||
2422 | |||
2423 | } elseif ($this->a_formatting[$n]->nodeName === 'a') { | ||
2424 | $this->emitToken( | ||
2425 | array( | ||
2426 | 'name' => 'a', | ||
2427 | 'type' => HTML5::ENDTAG | ||
2428 | ) | ||
2429 | ); | ||
2430 | break; | ||
2431 | } | ||
2432 | } | ||
2433 | |||
2434 | /* Reconstruct the active formatting elements, if any. */ | ||
2435 | $this->reconstructActiveFormattingElements(); | ||
2436 | |||
2437 | /* Insert an HTML element for the token. */ | ||
2438 | $el = $this->insertElement($token); | ||
2439 | |||
2440 | /* Add that element to the list of active formatting | ||
2441 | elements. */ | ||
2442 | $this->a_formatting[] = $el; | ||
2443 | break; | ||
2444 | |||
2445 | /* A start tag whose tag name is one of: "b", "big", "em", "font", | ||
2446 | "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ | ||
2447 | case 'b': | ||
2448 | case 'big': | ||
2449 | case 'em': | ||
2450 | case 'font': | ||
2451 | case 'i': | ||
2452 | case 'nobr': | ||
2453 | case 's': | ||
2454 | case 'small': | ||
2455 | case 'strike': | ||
2456 | case 'strong': | ||
2457 | case 'tt': | ||
2458 | case 'u': | ||
2459 | /* Reconstruct the active formatting elements, if any. */ | ||
2460 | $this->reconstructActiveFormattingElements(); | ||
2461 | |||
2462 | /* Insert an HTML element for the token. */ | ||
2463 | $el = $this->insertElement($token); | ||
2464 | |||
2465 | /* Add that element to the list of active formatting | ||
2466 | elements. */ | ||
2467 | $this->a_formatting[] = $el; | ||
2468 | break; | ||
2469 | |||
2470 | /* A start tag token whose tag name is "button" */ | ||
2471 | case 'button': | ||
2472 | /* If the stack of open elements has a button element in scope, | ||
2473 | then this is a parse error; act as if an end tag with the tag | ||
2474 | name "button" had been seen, then reprocess the token. (We don't | ||
2475 | do that. Unnecessary.) */ | ||
2476 | if ($this->elementInScope('button')) { | ||
2477 | $this->inBody( | ||
2478 | array( | ||
2479 | 'name' => 'button', | ||
2480 | 'type' => HTML5::ENDTAG | ||
2481 | ) | ||
2482 | ); | ||
2483 | } | ||
2484 | |||
2485 | /* Reconstruct the active formatting elements, if any. */ | ||
2486 | $this->reconstructActiveFormattingElements(); | ||
2487 | |||
2488 | /* Insert an HTML element for the token. */ | ||
2489 | $this->insertElement($token); | ||
2490 | |||
2491 | /* Insert a marker at the end of the list of active | ||
2492 | formatting elements. */ | ||
2493 | $this->a_formatting[] = self::MARKER; | ||
2494 | break; | ||
2495 | |||
2496 | /* A start tag token whose tag name is one of: "marquee", "object" */ | ||
2497 | case 'marquee': | ||
2498 | case 'object': | ||
2499 | /* Reconstruct the active formatting elements, if any. */ | ||
2500 | $this->reconstructActiveFormattingElements(); | ||
2501 | |||
2502 | /* Insert an HTML element for the token. */ | ||
2503 | $this->insertElement($token); | ||
2504 | |||
2505 | /* Insert a marker at the end of the list of active | ||
2506 | formatting elements. */ | ||
2507 | $this->a_formatting[] = self::MARKER; | ||
2508 | break; | ||
2509 | |||
2510 | /* A start tag token whose tag name is "xmp" */ | ||
2511 | case 'xmp': | ||
2512 | /* Reconstruct the active formatting elements, if any. */ | ||
2513 | $this->reconstructActiveFormattingElements(); | ||
2514 | |||
2515 | /* Insert an HTML element for the token. */ | ||
2516 | $this->insertElement($token); | ||
2517 | |||
2518 | /* Switch the content model flag to the CDATA state. */ | ||
2519 | return HTML5::CDATA; | ||
2520 | break; | ||
2521 | |||
2522 | /* A start tag whose tag name is "table" */ | ||
2523 | case 'table': | ||
2524 | /* If the stack of open elements has a p element in scope, | ||
2525 | then act as if an end tag with the tag name p had been seen. */ | ||
2526 | if ($this->elementInScope('p')) { | ||
2527 | $this->emitToken( | ||
2528 | array( | ||
2529 | 'name' => 'p', | ||
2530 | 'type' => HTML5::ENDTAG | ||
2531 | ) | ||
2532 | ); | ||
2533 | } | ||
2534 | |||
2535 | /* Insert an HTML element for the token. */ | ||
2536 | $this->insertElement($token); | ||
2537 | |||
2538 | /* Change the insertion mode to "in table". */ | ||
2539 | $this->mode = self::IN_TABLE; | ||
2540 | break; | ||
2541 | |||
2542 | /* A start tag whose tag name is one of: "area", "basefont", | ||
2543 | "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ | ||
2544 | case 'area': | ||
2545 | case 'basefont': | ||
2546 | case 'bgsound': | ||
2547 | case 'br': | ||
2548 | case 'embed': | ||
2549 | case 'img': | ||
2550 | case 'param': | ||
2551 | case 'spacer': | ||
2552 | case 'wbr': | ||
2553 | /* Reconstruct the active formatting elements, if any. */ | ||
2554 | $this->reconstructActiveFormattingElements(); | ||
2555 | |||
2556 | /* Insert an HTML element for the token. */ | ||
2557 | $this->insertElement($token); | ||
2558 | |||
2559 | /* Immediately pop the current node off the stack of open elements. */ | ||
2560 | array_pop($this->stack); | ||
2561 | break; | ||
2562 | |||
2563 | /* A start tag whose tag name is "hr" */ | ||
2564 | case 'hr': | ||
2565 | /* If the stack of open elements has a p element in scope, | ||
2566 | then act as if an end tag with the tag name p had been seen. */ | ||
2567 | if ($this->elementInScope('p')) { | ||
2568 | $this->emitToken( | ||
2569 | array( | ||
2570 | 'name' => 'p', | ||
2571 | 'type' => HTML5::ENDTAG | ||
2572 | ) | ||
2573 | ); | ||
2574 | } | ||
2575 | |||
2576 | /* Insert an HTML element for the token. */ | ||
2577 | $this->insertElement($token); | ||
2578 | |||
2579 | /* Immediately pop the current node off the stack of open elements. */ | ||
2580 | array_pop($this->stack); | ||
2581 | break; | ||
2582 | |||
2583 | /* A start tag whose tag name is "image" */ | ||
2584 | case 'image': | ||
2585 | /* Parse error. Change the token's tag name to "img" and | ||
2586 | reprocess it. (Don't ask.) */ | ||
2587 | $token['name'] = 'img'; | ||
2588 | return $this->inBody($token); | ||
2589 | break; | ||
2590 | |||
2591 | /* A start tag whose tag name is "input" */ | ||
2592 | case 'input': | ||
2593 | /* Reconstruct the active formatting elements, if any. */ | ||
2594 | $this->reconstructActiveFormattingElements(); | ||
2595 | |||
2596 | /* Insert an input element for the token. */ | ||
2597 | $element = $this->insertElement($token, false); | ||
2598 | |||
2599 | /* If the form element pointer is not null, then associate the | ||
2600 | input element with the form element pointed to by the form | ||
2601 | element pointer. */ | ||
2602 | $this->form_pointer !== null | ||
2603 | ? $this->form_pointer->appendChild($element) | ||
2604 | : end($this->stack)->appendChild($element); | ||
2605 | |||
2606 | /* Pop that input element off the stack of open elements. */ | ||
2607 | array_pop($this->stack); | ||
2608 | break; | ||
2609 | |||
2610 | /* A start tag whose tag name is "isindex" */ | ||
2611 | case 'isindex': | ||
2612 | /* Parse error. */ | ||
2613 | // w/e | ||
2614 | |||
2615 | /* If the form element pointer is not null, | ||
2616 | then ignore the token. */ | ||
2617 | if ($this->form_pointer === null) { | ||
2618 | /* Act as if a start tag token with the tag name "form" had | ||
2619 | been seen. */ | ||
2620 | $this->inBody( | ||
2621 | array( | ||
2622 | 'name' => 'body', | ||
2623 | 'type' => HTML5::STARTTAG, | ||
2624 | 'attr' => array() | ||
2625 | ) | ||
2626 | ); | ||
2627 | |||
2628 | /* Act as if a start tag token with the tag name "hr" had | ||
2629 | been seen. */ | ||
2630 | $this->inBody( | ||
2631 | array( | ||
2632 | 'name' => 'hr', | ||
2633 | 'type' => HTML5::STARTTAG, | ||
2634 | 'attr' => array() | ||
2635 | ) | ||
2636 | ); | ||
2637 | |||
2638 | /* Act as if a start tag token with the tag name "p" had | ||
2639 | been seen. */ | ||
2640 | $this->inBody( | ||
2641 | array( | ||
2642 | 'name' => 'p', | ||
2643 | 'type' => HTML5::STARTTAG, | ||
2644 | 'attr' => array() | ||
2645 | ) | ||
2646 | ); | ||
2647 | |||
2648 | /* Act as if a start tag token with the tag name "label" | ||
2649 | had been seen. */ | ||
2650 | $this->inBody( | ||
2651 | array( | ||
2652 | 'name' => 'label', | ||
2653 | 'type' => HTML5::STARTTAG, | ||
2654 | 'attr' => array() | ||
2655 | ) | ||
2656 | ); | ||
2657 | |||
2658 | /* Act as if a stream of character tokens had been seen. */ | ||
2659 | $this->insertText( | ||
2660 | 'This is a searchable index. ' . | ||
2661 | 'Insert your search keywords here: ' | ||
2662 | ); | ||
2663 | |||
2664 | /* Act as if a start tag token with the tag name "input" | ||
2665 | had been seen, with all the attributes from the "isindex" | ||
2666 | token, except with the "name" attribute set to the value | ||
2667 | "isindex" (ignoring any explicit "name" attribute). */ | ||
2668 | $attr = $token['attr']; | ||
2669 | $attr[] = array('name' => 'name', 'value' => 'isindex'); | ||
2670 | |||
2671 | $this->inBody( | ||
2672 | array( | ||
2673 | 'name' => 'input', | ||
2674 | 'type' => HTML5::STARTTAG, | ||
2675 | 'attr' => $attr | ||
2676 | ) | ||
2677 | ); | ||
2678 | |||
2679 | /* Act as if a stream of character tokens had been seen | ||
2680 | (see below for what they should say). */ | ||
2681 | $this->insertText( | ||
2682 | 'This is a searchable index. ' . | ||
2683 | 'Insert your search keywords here: ' | ||
2684 | ); | ||
2685 | |||
2686 | /* Act as if an end tag token with the tag name "label" | ||
2687 | had been seen. */ | ||
2688 | $this->inBody( | ||
2689 | array( | ||
2690 | 'name' => 'label', | ||
2691 | 'type' => HTML5::ENDTAG | ||
2692 | ) | ||
2693 | ); | ||
2694 | |||
2695 | /* Act as if an end tag token with the tag name "p" had | ||
2696 | been seen. */ | ||
2697 | $this->inBody( | ||
2698 | array( | ||
2699 | 'name' => 'p', | ||
2700 | 'type' => HTML5::ENDTAG | ||
2701 | ) | ||
2702 | ); | ||
2703 | |||
2704 | /* Act as if a start tag token with the tag name "hr" had | ||
2705 | been seen. */ | ||
2706 | $this->inBody( | ||
2707 | array( | ||
2708 | 'name' => 'hr', | ||
2709 | 'type' => HTML5::ENDTAG | ||
2710 | ) | ||
2711 | ); | ||
2712 | |||
2713 | /* Act as if an end tag token with the tag name "form" had | ||
2714 | been seen. */ | ||
2715 | $this->inBody( | ||
2716 | array( | ||
2717 | 'name' => 'form', | ||
2718 | 'type' => HTML5::ENDTAG | ||
2719 | ) | ||
2720 | ); | ||
2721 | } | ||
2722 | break; | ||
2723 | |||
2724 | /* A start tag whose tag name is "textarea" */ | ||
2725 | case 'textarea': | ||
2726 | $this->insertElement($token); | ||
2727 | |||
2728 | /* Switch the tokeniser's content model flag to the | ||
2729 | RCDATA state. */ | ||
2730 | return HTML5::RCDATA; | ||
2731 | break; | ||
2732 | |||
2733 | /* A start tag whose tag name is one of: "iframe", "noembed", | ||
2734 | "noframes" */ | ||
2735 | case 'iframe': | ||
2736 | case 'noembed': | ||
2737 | case 'noframes': | ||
2738 | $this->insertElement($token); | ||
2739 | |||
2740 | /* Switch the tokeniser's content model flag to the CDATA state. */ | ||
2741 | return HTML5::CDATA; | ||
2742 | break; | ||
2743 | |||
2744 | /* A start tag whose tag name is "select" */ | ||
2745 | case 'select': | ||
2746 | /* Reconstruct the active formatting elements, if any. */ | ||
2747 | $this->reconstructActiveFormattingElements(); | ||
2748 | |||
2749 | /* Insert an HTML element for the token. */ | ||
2750 | $this->insertElement($token); | ||
2751 | |||
2752 | /* Change the insertion mode to "in select". */ | ||
2753 | $this->mode = self::IN_SELECT; | ||
2754 | break; | ||
2755 | |||
2756 | /* A start or end tag whose tag name is one of: "caption", "col", | ||
2757 | "colgroup", "frame", "frameset", "head", "option", "optgroup", | ||
2758 | "tbody", "td", "tfoot", "th", "thead", "tr". */ | ||
2759 | case 'caption': | ||
2760 | case 'col': | ||
2761 | case 'colgroup': | ||
2762 | case 'frame': | ||
2763 | case 'frameset': | ||
2764 | case 'head': | ||
2765 | case 'option': | ||
2766 | case 'optgroup': | ||
2767 | case 'tbody': | ||
2768 | case 'td': | ||
2769 | case 'tfoot': | ||
2770 | case 'th': | ||
2771 | case 'thead': | ||
2772 | case 'tr': | ||
2773 | // Parse error. Ignore the token. | ||
2774 | break; | ||
2775 | |||
2776 | /* A start or end tag whose tag name is one of: "event-source", | ||
2777 | "section", "nav", "article", "aside", "header", "footer", | ||
2778 | "datagrid", "command" */ | ||
2779 | case 'event-source': | ||
2780 | case 'section': | ||
2781 | case 'nav': | ||
2782 | case 'article': | ||
2783 | case 'aside': | ||
2784 | case 'header': | ||
2785 | case 'footer': | ||
2786 | case 'datagrid': | ||
2787 | case 'command': | ||
2788 | // Work in progress! | ||
2789 | break; | ||
2790 | |||
2791 | /* A start tag token not covered by the previous entries */ | ||
2792 | default: | ||
2793 | /* Reconstruct the active formatting elements, if any. */ | ||
2794 | $this->reconstructActiveFormattingElements(); | ||
2795 | |||
2796 | $this->insertElement($token, true, true); | ||
2797 | break; | ||
2798 | } | ||
2799 | break; | ||
2800 | |||
2801 | case HTML5::ENDTAG: | ||
2802 | switch ($token['name']) { | ||
2803 | /* An end tag with the tag name "body" */ | ||
2804 | case 'body': | ||
2805 | /* If the second element in the stack of open elements is | ||
2806 | not a body element, this is a parse error. Ignore the token. | ||
2807 | (innerHTML case) */ | ||
2808 | if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { | ||
2809 | // Ignore. | ||
2810 | |||
2811 | /* If the current node is not the body element, then this | ||
2812 | is a parse error. */ | ||
2813 | } elseif (end($this->stack)->nodeName !== 'body') { | ||
2814 | // Parse error. | ||
2815 | } | ||
2816 | |||
2817 | /* Change the insertion mode to "after body". */ | ||
2818 | $this->mode = self::AFTER_BODY; | ||
2819 | break; | ||
2820 | |||
2821 | /* An end tag with the tag name "html" */ | ||
2822 | case 'html': | ||
2823 | /* Act as if an end tag with tag name "body" had been seen, | ||
2824 | then, if that token wasn't ignored, reprocess the current | ||
2825 | token. */ | ||
2826 | $this->inBody( | ||
2827 | array( | ||
2828 | 'name' => 'body', | ||
2829 | 'type' => HTML5::ENDTAG | ||
2830 | ) | ||
2831 | ); | ||
2832 | |||
2833 | return $this->afterBody($token); | ||
2834 | break; | ||
2835 | |||
2836 | /* An end tag whose tag name is one of: "address", "blockquote", | ||
2837 | "center", "dir", "div", "dl", "fieldset", "listing", "menu", | ||
2838 | "ol", "pre", "ul" */ | ||
2839 | case 'address': | ||
2840 | case 'blockquote': | ||
2841 | case 'center': | ||
2842 | case 'dir': | ||
2843 | case 'div': | ||
2844 | case 'dl': | ||
2845 | case 'fieldset': | ||
2846 | case 'listing': | ||
2847 | case 'menu': | ||
2848 | case 'ol': | ||
2849 | case 'pre': | ||
2850 | case 'ul': | ||
2851 | /* If the stack of open elements has an element in scope | ||
2852 | with the same tag name as that of the token, then generate | ||
2853 | implied end tags. */ | ||
2854 | if ($this->elementInScope($token['name'])) { | ||
2855 | $this->generateImpliedEndTags(); | ||
2856 | |||
2857 | /* Now, if the current node is not an element with | ||
2858 | the same tag name as that of the token, then this | ||
2859 | is a parse error. */ | ||
2860 | // w/e | ||
2861 | |||
2862 | /* If the stack of open elements has an element in | ||
2863 | scope with the same tag name as that of the token, | ||
2864 | then pop elements from this stack until an element | ||
2865 | with that tag name has been popped from the stack. */ | ||
2866 | for ($n = count($this->stack) - 1; $n >= 0; $n--) { | ||
2867 | if ($this->stack[$n]->nodeName === $token['name']) { | ||
2868 | $n = -1; | ||
2869 | } | ||
2870 | |||
2871 | array_pop($this->stack); | ||
2872 | } | ||
2873 | } | ||
2874 | break; | ||
2875 | |||
2876 | /* An end tag whose tag name is "form" */ | ||
2877 | case 'form': | ||
2878 | /* If the stack of open elements has an element in scope | ||
2879 | with the same tag name as that of the token, then generate | ||
2880 | implied end tags. */ | ||
2881 | if ($this->elementInScope($token['name'])) { | ||
2882 | $this->generateImpliedEndTags(); | ||
2883 | |||
2884 | } | ||
2885 | |||
2886 | if (end($this->stack)->nodeName !== $token['name']) { | ||
2887 | /* Now, if the current node is not an element with the | ||
2888 | same tag name as that of the token, then this is a parse | ||
2889 | error. */ | ||
2890 | // w/e | ||
2891 | |||
2892 | } else { | ||
2893 | /* Otherwise, if the current node is an element with | ||
2894 | the same tag name as that of the token pop that element | ||
2895 | from the stack. */ | ||
2896 | array_pop($this->stack); | ||
2897 | } | ||
2898 | |||
2899 | /* In any case, set the form element pointer to null. */ | ||
2900 | $this->form_pointer = null; | ||
2901 | break; | ||
2902 | |||
2903 | /* An end tag whose tag name is "p" */ | ||
2904 | case 'p': | ||
2905 | /* If the stack of open elements has a p element in scope, | ||
2906 | then generate implied end tags, except for p elements. */ | ||
2907 | if ($this->elementInScope('p')) { | ||
2908 | $this->generateImpliedEndTags(array('p')); | ||
2909 | |||
2910 | /* If the current node is not a p element, then this is | ||
2911 | a parse error. */ | ||
2912 | // k | ||
2913 | |||
2914 | /* If the stack of open elements has a p element in | ||
2915 | scope, then pop elements from this stack until the stack | ||
2916 | no longer has a p element in scope. */ | ||
2917 | for ($n = count($this->stack) - 1; $n >= 0; $n--) { | ||
2918 | if ($this->elementInScope('p')) { | ||
2919 | array_pop($this->stack); | ||
2920 | |||
2921 | } else { | ||
2922 | break; | ||
2923 | } | ||
2924 | } | ||
2925 | } | ||
2926 | break; | ||
2927 | |||
2928 | /* An end tag whose tag name is "dd", "dt", or "li" */ | ||
2929 | case 'dd': | ||
2930 | case 'dt': | ||
2931 | case 'li': | ||
2932 | /* If the stack of open elements has an element in scope | ||
2933 | whose tag name matches the tag name of the token, then | ||
2934 | generate implied end tags, except for elements with the | ||
2935 | same tag name as the token. */ | ||
2936 | if ($this->elementInScope($token['name'])) { | ||
2937 | $this->generateImpliedEndTags(array($token['name'])); | ||
2938 | |||
2939 | /* If the current node is not an element with the same | ||
2940 | tag name as the token, then this is a parse error. */ | ||
2941 | // w/e | ||
2942 | |||
2943 | /* If the stack of open elements has an element in scope | ||
2944 | whose tag name matches the tag name of the token, then | ||
2945 | pop elements from this stack until an element with that | ||
2946 | tag name has been popped from the stack. */ | ||
2947 | for ($n = count($this->stack) - 1; $n >= 0; $n--) { | ||
2948 | if ($this->stack[$n]->nodeName === $token['name']) { | ||
2949 | $n = -1; | ||
2950 | } | ||
2951 | |||
2952 | array_pop($this->stack); | ||
2953 | } | ||
2954 | } | ||
2955 | break; | ||
2956 | |||
2957 | /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", | ||
2958 | "h5", "h6" */ | ||
2959 | case 'h1': | ||
2960 | case 'h2': | ||
2961 | case 'h3': | ||
2962 | case 'h4': | ||
2963 | case 'h5': | ||
2964 | case 'h6': | ||
2965 | $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); | ||
2966 | |||
2967 | /* If the stack of open elements has in scope an element whose | ||
2968 | tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then | ||
2969 | generate implied end tags. */ | ||
2970 | if ($this->elementInScope($elements)) { | ||
2971 | $this->generateImpliedEndTags(); | ||
2972 | |||
2973 | /* Now, if the current node is not an element with the same | ||
2974 | tag name as that of the token, then this is a parse error. */ | ||
2975 | // w/e | ||
2976 | |||
2977 | /* If the stack of open elements has in scope an element | ||
2978 | whose tag name is one of "h1", "h2", "h3", "h4", "h5", or | ||
2979 | "h6", then pop elements from the stack until an element | ||
2980 | with one of those tag names has been popped from the stack. */ | ||
2981 | while ($this->elementInScope($elements)) { | ||
2982 | array_pop($this->stack); | ||
2983 | } | ||
2984 | } | ||
2985 | break; | ||
2986 | |||
2987 | /* An end tag whose tag name is one of: "a", "b", "big", "em", | ||
2988 | "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ | ||
2989 | case 'a': | ||
2990 | case 'b': | ||
2991 | case 'big': | ||
2992 | case 'em': | ||
2993 | case 'font': | ||
2994 | case 'i': | ||
2995 | case 'nobr': | ||
2996 | case 's': | ||
2997 | case 'small': | ||
2998 | case 'strike': | ||
2999 | case 'strong': | ||
3000 | case 'tt': | ||
3001 | case 'u': | ||
3002 | /* 1. Let the formatting element be the last element in | ||
3003 | the list of active formatting elements that: | ||
3004 | * is between the end of the list and the last scope | ||
3005 | marker in the list, if any, or the start of the list | ||
3006 | otherwise, and | ||
3007 | * has the same tag name as the token. | ||
3008 | */ | ||
3009 | while (true) { | ||
3010 | for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) { | ||
3011 | if ($this->a_formatting[$a] === self::MARKER) { | ||
3012 | break; | ||
3013 | |||
3014 | } elseif ($this->a_formatting[$a]->tagName === $token['name']) { | ||
3015 | $formatting_element = $this->a_formatting[$a]; | ||
3016 | $in_stack = in_array($formatting_element, $this->stack, true); | ||
3017 | $fe_af_pos = $a; | ||
3018 | break; | ||
3019 | } | ||
3020 | } | ||
3021 | |||
3022 | /* If there is no such node, or, if that node is | ||
3023 | also in the stack of open elements but the element | ||
3024 | is not in scope, then this is a parse error. Abort | ||
3025 | these steps. The token is ignored. */ | ||
3026 | if (!isset($formatting_element) || ($in_stack && | ||
3027 | !$this->elementInScope($token['name'])) | ||
3028 | ) { | ||
3029 | break; | ||
3030 | |||
3031 | /* Otherwise, if there is such a node, but that node | ||
3032 | is not in the stack of open elements, then this is a | ||
3033 | parse error; remove the element from the list, and | ||
3034 | abort these steps. */ | ||
3035 | } elseif (isset($formatting_element) && !$in_stack) { | ||
3036 | unset($this->a_formatting[$fe_af_pos]); | ||
3037 | $this->a_formatting = array_merge($this->a_formatting); | ||
3038 | break; | ||
3039 | } | ||
3040 | |||
3041 | /* 2. Let the furthest block be the topmost node in the | ||
3042 | stack of open elements that is lower in the stack | ||
3043 | than the formatting element, and is not an element in | ||
3044 | the phrasing or formatting categories. There might | ||
3045 | not be one. */ | ||
3046 | $fe_s_pos = array_search($formatting_element, $this->stack, true); | ||
3047 | $length = count($this->stack); | ||
3048 | |||
3049 | for ($s = $fe_s_pos + 1; $s < $length; $s++) { | ||
3050 | $category = $this->getElementCategory($this->stack[$s]->nodeName); | ||
3051 | |||
3052 | if ($category !== self::PHRASING && $category !== self::FORMATTING) { | ||
3053 | $furthest_block = $this->stack[$s]; | ||
3054 | } | ||
3055 | } | ||
3056 | |||
3057 | /* 3. If there is no furthest block, then the UA must | ||
3058 | skip the subsequent steps and instead just pop all | ||
3059 | the nodes from the bottom of the stack of open | ||
3060 | elements, from the current node up to the formatting | ||
3061 | element, and remove the formatting element from the | ||
3062 | list of active formatting elements. */ | ||
3063 | if (!isset($furthest_block)) { | ||
3064 | for ($n = $length - 1; $n >= $fe_s_pos; $n--) { | ||
3065 | array_pop($this->stack); | ||
3066 | } | ||
3067 | |||
3068 | unset($this->a_formatting[$fe_af_pos]); | ||
3069 | $this->a_formatting = array_merge($this->a_formatting); | ||
3070 | break; | ||
3071 | } | ||
3072 | |||
3073 | /* 4. Let the common ancestor be the element | ||
3074 | immediately above the formatting element in the stack | ||
3075 | of open elements. */ | ||
3076 | $common_ancestor = $this->stack[$fe_s_pos - 1]; | ||
3077 | |||
3078 | /* 5. If the furthest block has a parent node, then | ||
3079 | remove the furthest block from its parent node. */ | ||
3080 | if ($furthest_block->parentNode !== null) { | ||
3081 | $furthest_block->parentNode->removeChild($furthest_block); | ||
3082 | } | ||
3083 | |||
3084 | /* 6. Let a bookmark note the position of the | ||
3085 | formatting element in the list of active formatting | ||
3086 | elements relative to the elements on either side | ||
3087 | of it in the list. */ | ||
3088 | $bookmark = $fe_af_pos; | ||
3089 | |||
3090 | /* 7. Let node and last node be the furthest block. | ||
3091 | Follow these steps: */ | ||
3092 | $node = $furthest_block; | ||
3093 | $last_node = $furthest_block; | ||
3094 | |||
3095 | while (true) { | ||
3096 | for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { | ||
3097 | /* 7.1 Let node be the element immediately | ||
3098 | prior to node in the stack of open elements. */ | ||
3099 | $node = $this->stack[$n]; | ||
3100 | |||
3101 | /* 7.2 If node is not in the list of active | ||
3102 | formatting elements, then remove node from | ||
3103 | the stack of open elements and then go back | ||
3104 | to step 1. */ | ||
3105 | if (!in_array($node, $this->a_formatting, true)) { | ||
3106 | unset($this->stack[$n]); | ||
3107 | $this->stack = array_merge($this->stack); | ||
3108 | |||
3109 | } else { | ||
3110 | break; | ||
3111 | } | ||
3112 | } | ||
3113 | |||
3114 | /* 7.3 Otherwise, if node is the formatting | ||
3115 | element, then go to the next step in the overall | ||
3116 | algorithm. */ | ||
3117 | if ($node === $formatting_element) { | ||
3118 | break; | ||
3119 | |||
3120 | /* 7.4 Otherwise, if last node is the furthest | ||
3121 | block, then move the aforementioned bookmark to | ||
3122 | be immediately after the node in the list of | ||
3123 | active formatting elements. */ | ||
3124 | } elseif ($last_node === $furthest_block) { | ||
3125 | $bookmark = array_search($node, $this->a_formatting, true) + 1; | ||
3126 | } | ||
3127 | |||
3128 | /* 7.5 If node has any children, perform a | ||
3129 | shallow clone of node, replace the entry for | ||
3130 | node in the list of active formatting elements | ||
3131 | with an entry for the clone, replace the entry | ||
3132 | for node in the stack of open elements with an | ||
3133 | entry for the clone, and let node be the clone. */ | ||
3134 | if ($node->hasChildNodes()) { | ||
3135 | $clone = $node->cloneNode(); | ||
3136 | $s_pos = array_search($node, $this->stack, true); | ||
3137 | $a_pos = array_search($node, $this->a_formatting, true); | ||
3138 | |||
3139 | $this->stack[$s_pos] = $clone; | ||
3140 | $this->a_formatting[$a_pos] = $clone; | ||
3141 | $node = $clone; | ||
3142 | } | ||
3143 | |||
3144 | /* 7.6 Insert last node into node, first removing | ||
3145 | it from its previous parent node if any. */ | ||
3146 | if ($last_node->parentNode !== null) { | ||
3147 | $last_node->parentNode->removeChild($last_node); | ||
3148 | } | ||
3149 | |||
3150 | $node->appendChild($last_node); | ||
3151 | |||
3152 | /* 7.7 Let last node be node. */ | ||
3153 | $last_node = $node; | ||
3154 | } | ||
3155 | |||
3156 | /* 8. Insert whatever last node ended up being in | ||
3157 | the previous step into the common ancestor node, | ||
3158 | first removing it from its previous parent node if | ||
3159 | any. */ | ||
3160 | if ($last_node->parentNode !== null) { | ||
3161 | $last_node->parentNode->removeChild($last_node); | ||
3162 | } | ||
3163 | |||
3164 | $common_ancestor->appendChild($last_node); | ||
3165 | |||
3166 | /* 9. Perform a shallow clone of the formatting | ||
3167 | element. */ | ||
3168 | $clone = $formatting_element->cloneNode(); | ||
3169 | |||
3170 | /* 10. Take all of the child nodes of the furthest | ||
3171 | block and append them to the clone created in the | ||
3172 | last step. */ | ||
3173 | while ($furthest_block->hasChildNodes()) { | ||
3174 | $child = $furthest_block->firstChild; | ||
3175 | $furthest_block->removeChild($child); | ||
3176 | $clone->appendChild($child); | ||
3177 | } | ||
3178 | |||
3179 | /* 11. Append that clone to the furthest block. */ | ||
3180 | $furthest_block->appendChild($clone); | ||
3181 | |||
3182 | /* 12. Remove the formatting element from the list | ||
3183 | of active formatting elements, and insert the clone | ||
3184 | into the list of active formatting elements at the | ||
3185 | position of the aforementioned bookmark. */ | ||
3186 | $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); | ||
3187 | unset($this->a_formatting[$fe_af_pos]); | ||
3188 | $this->a_formatting = array_merge($this->a_formatting); | ||
3189 | |||
3190 | $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); | ||
3191 | $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); | ||
3192 | $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); | ||
3193 | |||
3194 | /* 13. Remove the formatting element from the stack | ||
3195 | of open elements, and insert the clone into the stack | ||
3196 | of open elements immediately after (i.e. in a more | ||
3197 | deeply nested position than) the position of the | ||
3198 | furthest block in that stack. */ | ||
3199 | $fe_s_pos = array_search($formatting_element, $this->stack, true); | ||
3200 | $fb_s_pos = array_search($furthest_block, $this->stack, true); | ||
3201 | unset($this->stack[$fe_s_pos]); | ||
3202 | |||
3203 | $s_part1 = array_slice($this->stack, 0, $fb_s_pos); | ||
3204 | $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); | ||
3205 | $this->stack = array_merge($s_part1, array($clone), $s_part2); | ||
3206 | |||
3207 | /* 14. Jump back to step 1 in this series of steps. */ | ||
3208 | unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); | ||
3209 | } | ||
3210 | break; | ||
3211 | |||
3212 | /* An end tag token whose tag name is one of: "button", | ||
3213 | "marquee", "object" */ | ||
3214 | case 'button': | ||
3215 | case 'marquee': | ||
3216 | case 'object': | ||
3217 | /* If the stack of open elements has an element in scope whose | ||
3218 | tag name matches the tag name of the token, then generate implied | ||
3219 | tags. */ | ||
3220 | if ($this->elementInScope($token['name'])) { | ||
3221 | $this->generateImpliedEndTags(); | ||
3222 | |||
3223 | /* Now, if the current node is not an element with the same | ||
3224 | tag name as the token, then this is a parse error. */ | ||
3225 | // k | ||
3226 | |||
3227 | /* Now, if the stack of open elements has an element in scope | ||
3228 | whose tag name matches the tag name of the token, then pop | ||
3229 | elements from the stack until that element has been popped from | ||
3230 | the stack, and clear the list of active formatting elements up | ||
3231 | to the last marker. */ | ||
3232 | for ($n = count($this->stack) - 1; $n >= 0; $n--) { | ||
3233 | if ($this->stack[$n]->nodeName === $token['name']) { | ||
3234 | $n = -1; | ||
3235 | } | ||
3236 | |||
3237 | array_pop($this->stack); | ||
3238 | } | ||
3239 | |||
3240 | $marker = end(array_keys($this->a_formatting, self::MARKER, true)); | ||
3241 | |||
3242 | for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) { | ||
3243 | array_pop($this->a_formatting); | ||
3244 | } | ||
3245 | } | ||
3246 | break; | ||
3247 | |||
3248 | /* Or an end tag whose tag name is one of: "area", "basefont", | ||
3249 | "bgsound", "br", "embed", "hr", "iframe", "image", "img", | ||
3250 | "input", "isindex", "noembed", "noframes", "param", "select", | ||
3251 | "spacer", "table", "textarea", "wbr" */ | ||
3252 | case 'area': | ||
3253 | case 'basefont': | ||
3254 | case 'bgsound': | ||
3255 | case 'br': | ||
3256 | case 'embed': | ||
3257 | case 'hr': | ||
3258 | case 'iframe': | ||
3259 | case 'image': | ||
3260 | case 'img': | ||
3261 | case 'input': | ||
3262 | case 'isindex': | ||
3263 | case 'noembed': | ||
3264 | case 'noframes': | ||
3265 | case 'param': | ||
3266 | case 'select': | ||
3267 | case 'spacer': | ||
3268 | case 'table': | ||
3269 | case 'textarea': | ||
3270 | case 'wbr': | ||
3271 | // Parse error. Ignore the token. | ||
3272 | break; | ||
3273 | |||
3274 | /* An end tag token not covered by the previous entries */ | ||
3275 | default: | ||
3276 | for ($n = count($this->stack) - 1; $n >= 0; $n--) { | ||
3277 | /* Initialise node to be the current node (the bottommost | ||
3278 | node of the stack). */ | ||
3279 | $node = end($this->stack); | ||
3280 | |||
3281 | /* If node has the same tag name as the end tag token, | ||
3282 | then: */ | ||
3283 | if ($token['name'] === $node->nodeName) { | ||
3284 | /* Generate implied end tags. */ | ||
3285 | $this->generateImpliedEndTags(); | ||
3286 | |||
3287 | /* If the tag name of the end tag token does not | ||
3288 | match the tag name of the current node, this is a | ||
3289 | parse error. */ | ||
3290 | // k | ||
3291 | |||
3292 | /* Pop all the nodes from the current node up to | ||
3293 | node, including node, then stop this algorithm. */ | ||
3294 | for ($x = count($this->stack) - $n; $x >= $n; $x--) { | ||
3295 | array_pop($this->stack); | ||
3296 | } | ||
3297 | |||
3298 | } else { | ||
3299 | $category = $this->getElementCategory($node); | ||
3300 | |||
3301 | if ($category !== self::SPECIAL && $category !== self::SCOPING) { | ||
3302 | /* Otherwise, if node is in neither the formatting | ||
3303 | category nor the phrasing category, then this is a | ||
3304 | parse error. Stop this algorithm. The end tag token | ||
3305 | is ignored. */ | ||
3306 | return false; | ||
3307 | } | ||
3308 | } | ||
3309 | } | ||
3310 | break; | ||
3311 | } | ||
3312 | break; | ||
3313 | } | ||
3314 | } | ||
3315 | |||
/**
 * Handles a token while the insertion mode is "in table".
 *
 * Table-structure tags (caption, colgroup, col, tbody/tfoot/thead,
 * td/th/tr, table) are handled here or forwarded to the matching
 * insertion-mode handler. Any other token is a parse error and is
 * processed by inBody(), after a foster parent has been selected when the
 * current node is a table, tbody, tfoot, thead or tr element.
 *
 * @param array $token Token array. 'type' is always read; 'name' and
 *                     'data' are read for the token types that carry them.
 *
 * @return mixed False when an end tag "table" (real or implied) is ignored;
 *               the result of the handler the token was forwarded to
 *               (inTableBody(), mainPhase() or inBody()) when it is
 *               forwarded; null otherwise.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'caption'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'col'
    ) {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
            $token['name'],
            array('tbody', 'tfoot', 'thead')
        )
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'))
    ) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'table'
    ) {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $result = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end tag branch below returns false when it ignores the
        token. In that case neither the stack nor the insertion mode has
        changed, so reprocessing would re-enter this branch with the same
        state and recurse without end. Ignore the start tag as well. */
        if ($result === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'body',
                'caption',
                'col',
                'colgroup',
                'html',
                'tbody',
                'td',
                'tfoot',
                'th',
                'thead',
                'tr'
            )
        )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr')
        )
        ) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if (!isset($table)) {
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* Only an element parent qualifies; a null or non-element
                parent falls through to the branches below. */
                $this->foster_parent = $table->parentNode;

            } elseif ($n > 0) {
                /* $n still indexes the table found by the loop above. */
                $this->foster_parent = $this->stack[$n - 1];

            } else {
                /* The table is the first stack entry, so there is no
                element before it; keep whatever parent node it has. */
                $this->foster_parent = $table->parentNode;
            }
        }

        /* inBody() can return a tokeniser content model flag (for example
        HTML5::RCDATA for a "textarea" start tag); pass it on to the caller
        instead of discarding it. */
        return $this->inBody($token);
    }
}
3527 | |||
/**
 * Handles a token while the insertion mode is "in caption".
 *
 * An end tag "caption" closes the caption and switches back to the
 * "in table" insertion mode. Table-structure start tags and an end tag
 * "table" imply an end tag "caption" and are then reprocessed by
 * inTable(). A fixed list of stray end tags is ignored, and every other
 * token is processed by inBody().
 *
 * @param array $token Token array; 'type' is always read and 'name' is
 *                     read for start and end tag tokens.
 *
 * @return mixed The result of inTable() or inBody() when the token is
 *               forwarded to one of them, null otherwise.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array(
                    'caption',
                    'col',
                    'colgroup',
                    'tbody',
                    'td',
                    'tfoot',
                    'th',
                    'thead',
                    'tr'
                )
            )) || ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        // NOTE(review): the token is reprocessed even when the implied end
        // tag was ignored; the "if that token wasn't ignored" condition is
        // not enforced here. Confirm against elementInScope() before
        // tightening this.
        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'body',
                'col',
                'colgroup',
                'html',
                'tbody',
                'td',
                'tfoot',
                'th',
                'thead',
                'tr'
            )
        )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body".
        inBody() can return a tokeniser content model flag (for example
        HTML5::RCDATA for a "textarea" start tag); pass it on to the caller
        instead of discarding it. */
        return $this->inBody($token);
    }
}
3622 | |||
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens or 'name' for tags.
 * @return mixed Whatever inTable() returns when the token is handed over
 *               to it; null in every other case.
 */
private function inColumnGroup($token)
{
    $kind = $token['type'];

    /* A character token consisting only of U+0009, U+000A, U+000B,
    U+000C or U+0020: append it to the current node. */
    if ($kind === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($kind === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    $isStartTag = ($kind === HTML5::STARTTAG);
    $isEndTag = ($kind === HTML5::ENDTAG);

    /* <col>: insert the element, then pop it straight off the stack of
    open elements again. */
    if ($isStartTag && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* </colgroup>: ignored when the current node is the root html element
    (innerHTML case); otherwise pop the current node and go back to
    "in table". */
    if ($isEndTag && $token['name'] === 'colgroup') {
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* </col>: parse error, the token is ignored. */
    if ($isEndTag && $token['name'] === 'col') {
        return;
    }

    /* Anything else: behave as though </colgroup> had been seen, then
    reprocess the current token with the "in table" rules. */
    $this->inColumnGroup(
        array(
            'type' => HTML5::ENDTAG,
            'name' => 'colgroup'
        )
    );

    return $this->inTable($token);
}
3684 | |||
/**
 * Processes one token while the insertion mode is "in table body".
 *
 * @param array $token Token array; always has 'type', plus 'name' for tags.
 * @return mixed Result of the handler the token was reprocessed with
 *               (inRow() or mainPhase()), otherwise null.
 */
private function inTableBody($token)
{
    // Element names that terminate "clear the stack back to a table body context".
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        // The list previously contained the typo 'tfoor', so <tfoot> never
        // matched this branch.
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'),
        true
    )) ||
        // "table" must be an END tag here; the start-tag case belongs to
        // the "anything else" branch below.
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if ($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3783 | |||
/**
 * Processes one token while the insertion mode is "in row".
 *
 * @param array $token Token array; always has 'type', plus 'name' for tags.
 * @return mixed Result of inTableBody() when the token is reprocessed after
 *               an implied </tr>, otherwise null.
 */
private function inRow($token)
{
    // Element names that terminate "clear the stack back to a table row context".
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        // The implied </tr> is ignored exactly when no tr is in table scope
        // (see the branch above), so test that up front.
        if ($this->elementInScope('tr', true)) {
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            // The row is closed and the mode is now "in table body", so the
            // token is reprocessed there. Handing it to inCell() dropped it,
            // because no td/th is in scope at this point.
            return $this->inTableBody($token);
        }

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            // Reprocess with the "in table body" rules (see note above).
            return $this->inTableBody($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
3881 | |||
/**
 * Processes one token while the insertion mode is "in cell".
 *
 * @param array $token Token array; always has 'type', plus 'name' for tags.
 * @return mixed Result of inRow() when the cell was closed and the token
 *               reprocessed, otherwise null.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            do {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($node !== $token['name']);

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr" */
        // An identical second copy of this branch used to follow; it could
        // never be reached and has been removed.
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if ($this->elementInScope(array('td', 'th'), true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'),
        true
    )
    ) {
        /* Parse error. Ignore the token. */

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'),
        true
    )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if ($this->elementInScope($token['name'], true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
4020 | |||
/**
 * Processes one token while the insertion mode is "in select".
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens or 'name' for tags.
 * @return void
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    // Synthetic end tags reused by several branches below.
    $optionEnd = array('name' => 'option', 'type' => HTML5::ENDTAG);
    $optgroupEnd = array('name' => 'optgroup', 'type' => HTML5::ENDTAG);
    $selectEnd = array('name' => 'select', 'type' => HTML5::ENDTAG);

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($optionEnd);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect($optionEnd);
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect($optgroupEnd);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $depth = count($this->stack);

        // Guard the [$depth - 2] lookup so a one-element stack cannot
        // trigger an undefined-offset access.
        if ($depth >= 2 &&
            $this->stack[$depth - 1]->nodeName === 'option' &&
            $this->stack[$depth - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect($optionEnd);
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        // Re-read the current node: the implied </option> above may have
        // shrunk the stack. The previous code compared the node object
        // itself to the string 'optgroup', which is never true, so the
        // optgroup was never closed.
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

        /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

        /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if ($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            do {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while ($current !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

        /* A start tag whose tag name is "select" */
        // The type is checked before 'name' so tokens without a 'name' key
        // never have it read.
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect($selectEnd);

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        ),
        true
    )
    ) {
        /* Parse error. */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect($selectEnd);

            $this->mainPhase($token);
        }

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4201 | |||
/**
 * Processes one token while the insertion mode is "after body".
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens or 'name' for tags.
 * @return mixed Result of inBody() when the token forces a switch back to
 *               "in body"; null otherwise.
 */
private function afterBody($token)
{
    $kind = $token['type'];

    /* Whitespace-only character token (U+0009, U+000A, U+000B, U+000C,
    U+0020): handled exactly as it would be "in body". */
    if ($kind === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the Comment node goes onto the first element in the
    stack of open elements (the html element). */
    if ($kind === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* </html>: would be a parse error in the innerHTML case; otherwise
    switch to the trailing end phase. */
    if ($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else is a parse error: fall back to "in body" and
    reprocess the token there. */
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
4242 | |||
/**
 * Processes one token while the insertion mode is "in frameset".
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens or 'name' for tags.
 * @return void
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    // Only tag tokens are inspected by name, so 'name' is read only once
    // the type is known to be a tag. The previous code read it first, which
    // raised an undefined-index notice for non-whitespace character tokens.
    $isStartTag = ($token['type'] === HTML5::STARTTAG);
    $isEndTag = ($token['type'] === HTML5::ENDTAG);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    // NOTE(review): the original comment also listed U+000D CARRIAGE
    // RETURN, but the pattern has never matched "\r"; left unchanged.
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
    } elseif ($isStartTag && $token['name'] === 'frameset') {
        $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
    } elseif ($isEndTag && $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". */
            // When an outer frameset is still open (nested framesets) the
            // mode must stay "in frameset" so its remaining children are
            // still processed.
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

        /* A start tag with the tag name "frame" */
    } elseif ($isStartTag && $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
    } elseif ($isStartTag && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4311 | |||
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens or 'name' for tags.
 * @return void
 */
private function afterFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    // NOTE(review): the original comment also listed U+000D CARRIAGE
    // RETURN, but the pattern has never matched "\r"; left unchanged.
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
        // The type is tested before 'name': non-whitespace character tokens
        // reach this point and carry no 'name' key, so reading it first
        // raised an undefined-index notice.
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html'
    ) {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
4350 | |||
/**
 * Processes one token during the trailing end phase, i.e. after the main
 * phase has finished.
 *
 * @param array $token Token array; always has 'type', plus 'data' for
 *                     character/comment tokens.
 * @return mixed Result of mainPhase() when the token forces a switch back
 *               to the main phase, otherwise null.
 */
private function trailingEndPhase($token)
{
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
    } elseif ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
        // Whitespace-only character tokens were consumed by the branch
        // above, so any character token arriving here is non-whitespace.
        // The previous code repeated the positive whitespace match, which
        // could never succeed here and silently dropped such text.
    } elseif ($token['type'] === HTML5::CHARACTR ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

        /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* Nothing left to do: parsing is complete. */
    }
}
4393 | |||
/**
 * Creates a DOM element for a start-tag token, attaches it through
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Start-tag token with 'name' and an 'attr' list of
 *                      arrays holding 'name' and 'value'.
 * @param bool  $append Not used by this method.
 * @param bool  $check  When true, the tag name is sanitised first.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep ASCII letters, digits and hyphens only, then strip any
        // leading hyphens and digits.
        $tagName = ltrim(
            preg_replace('/[^a-z0-9-]/i', '', $tagName),
            '-0..9'
        );

        // In theory this should never be needed, but just in case:
        // fall back to an arbitrary generic element.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4422 | |||
/**
 * Creates a text node holding $data and hands it to appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4428 | |||
/**
 * Creates a comment node holding $data and hands it to appendToRealParent().
 *
 * @param string $data Text of the new comment node.
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4434 | |||
/**
 * Inserts $node at its effective parent.
 *
 * Without a foster parent the node is appended to the current node (the
 * last entry of the stack of open elements). With a foster parent set, the
 * node is placed relative to it and the foster parent is then reset to null.
 *
 * @param mixed $node Node to insert.
 */
private function appendToRealParent($node)
{
    // Ordinary case: no foster parenting requested.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last table
    element in the stack of open elements, the new node is inserted
    immediately before that table; otherwise it is appended to the foster
    parent element. */
    $lastTable = null;
    $index = count($this->stack);

    // Walk the stack from the current node downwards, looking for the
    // nearest table that is attached to a parent.
    while ($index-- > 0) {
        $candidate = $this->stack[$index];

        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    $insertBeforeTable = $lastTable !== null
        && $this->foster_parent->isSameNode($lastTable->parentNode);

    if ($insertBeforeTable) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    // Foster parenting applies to a single insertion only.
    $this->foster_parent = null;
}
4465 | |||
/**
 * Implements the "has an element in scope" and "has an element in table
 * scope" checks against the stack of open elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true as soon as any one of them is in scope).
 * @param bool         $table True selects the "in table scope" variant,
 *                            where only table and the root element end the
 *                            search; otherwise the plain "in scope" variant
 *                            is used, where caption, td, th, button, marquee
 *                            and object end it as well.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    // Elements that act as a scope barrier in the plain "in scope" variant.
    $scopeBarriers = array(
        'caption',
        'td',
        'th',
        'button',
        'marquee',
        'object'
    );

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;
        }

        if ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a
            failure state. */
            return false;
        }

        if ($table !== true && in_array($node->tagName, $scopeBarriers, true)) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the barrier elements, terminate in a failure
            state. The barrier must NOT apply in the table-scope variant,
            otherwise e.g. a tr can never be found from inside a td. */
            return false;
        }

        if ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element),
            terminate in a failure state. */
            return false;
        }

        /* Otherwise, move on to the previous entry in the stack of open
        elements and return to step 2. */
    }

    // Stack exhausted without a match (e.g. empty stack): not in scope.
    return false;
}
4525 | |||
/**
 * Reconstructs the active formatting elements: every entry after the last
 * marker / still-open element in the list of active formatting elements is
 * shallow-cloned, appended to the current node, pushed onto the stack of
 * open elements, and substituted for the original entry in the list.
 *
 * Assumes $this->a_formatting is indexed 0..n-1, as the index arithmetic
 * below requires — TODO confirm against the code that removes entries.
 *
 * @return false|null False when there was nothing to reconstruct; no
 *                    explicit value otherwise.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    // Whether step 7 (advance to the next entry) must run before cloning.
    // It defaults to true: when the rewind loop below stops on a marker or
    // an element that is still open, that entry itself must NOT be cloned;
    // cloning starts with the entry after it. Only when the rewind reaches
    // the very first entry does the algorithm jump straight to step 8.
    $step_seven = true;

    for ($a = $formatting_elements - 1; $a >= 0; true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. Otherwise fall through to
        step 7. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven === true) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) !== $clone) {
            $step_seven = true;
        } else {
            break;
        }
    }
}
4597 | |||
/**
 * Clears the list of active formatting elements up to and including the
 * last marker.
 *
 * Entries are removed from the end of the list until a marker has been
 * removed. If the list holds no marker at all, the whole list is emptied
 * and the method returns instead of looping forever.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    /* When the steps below require the UA to clear the list of active
    formatting elements up to the last marker, the UA must perform the
    following steps: */

    // Bounded by the list itself: popping an empty array only yields null,
    // which would never compare identical to the marker.
    while (!empty($this->a_formatting)) {
        /* 1. Let entry be the last (most recently added) entry in the list
        of active formatting elements.
        2. Remove entry from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4619 | |||
/**
 * Generates implied end tags by popping the current node off the stack of
 * open elements for as long as it is a dd, dt, li, p, td, th or tr element
 * that is not listed in $exclude.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array())
{
    // Candidate tag names, minus the ones the caller wants left open.
    $implied = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    for (;;) {
        $current = end($this->stack);

        if (!in_array($current->nodeName, $implied)) {
            break;
        }

        array_pop($this->stack);
    }
}
4634 | |||
/**
 * Classifies a node by tag name, checking the special, scoping and
 * formatting name lists in that order.
 *
 * @param mixed $node Node whose tagName is looked up.
 * @return mixed self::SPECIAL, self::SCOPING, self::FORMATTING, or
 *               self::PHRASING when the name is in none of the lists.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    return in_array($tag, $this->formatting)
        ? self::FORMATTING
        : self::PHRASING;
}
4648 | |||
/**
 * Clears the stack of open elements back to a table context: pops nodes
 * until the current node's name is one of $elements.
 *
 * @param array $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements)
{
    /* While the current node is not one of the context elements, pop
    elements from the stack of open elements. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
4666 | |||
/**
 * Resets the insertion mode appropriately: walks the stack of open elements
 * from the current node towards the first node and sets $this->mode from
 * the first node whose name determines a mode.
 */
private function resetInsertionMode()
{
    /* 1. Let last be false. */
    $last = false;

    /* 2. Let node be the last node in the stack of open elements, then
    work back towards the first one. */
    for ($position = count($this->stack) - 1; $position >= 0; $position--) {
        $current = $this->stack[$position];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($current)) {
            $last = true;
        }

        switch ($current->nodeName) {
            /* 4. select: "in select". (innerHTML case) */
            case 'select':
                $this->mode = self::IN_SELECT;
                return;

            /* 5. td or th: "in cell". */
            case 'td':
            case 'th':
                $this->mode = self::IN_CELL;
                return;

            /* 6. tr: "in row". */
            case 'tr':
                $this->mode = self::IN_ROW;
                return;

            /* 7. tbody, thead or tfoot: "in table body". */
            case 'tbody':
            case 'thead':
            case 'tfoot':
                $this->mode = self::IN_TBODY;
                return;

            /* 8. caption: "in caption". */
            case 'caption':
                $this->mode = self::IN_CAPTION;
                return;

            /* 9. colgroup: "in column group". (innerHTML case) */
            case 'colgroup':
                $this->mode = self::IN_CGROUP;
                return;

            /* 10. table: "in table". */
            case 'table':
                $this->mode = self::IN_TABLE;
                return;

            /* 11. head: "in body" ("in body"! not "in head"!).
            (innerHTML case)
            12. body: "in body". */
            case 'head':
            case 'body':
                $this->mode = self::IN_BODY;
                return;

            /* 13. frameset: "in frameset". (innerHTML case) */
            case 'frameset':
                $this->mode = self::IN_FRAME;
                return;

            /* 14. html: "before head" if the head element pointer is null,
            otherwise "after head". (innerHTML case) */
            case 'html':
                if ($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }
                return;

            /* 15. If last is true, then set the insertion mode to "in body"
            and abort these steps. (innerHTML case) Otherwise carry on with
            the previous node in the stack. */
            default:
                if ($last) {
                    $this->mode = self::IN_BODY;
                    return;
                }
        }
    }
}
4765 | |||
/**
 * Closes the open table cell: for the first of td / th that
 * elementInScope() reports in table scope, feeds a synthetic end tag token
 * with that name to inCell().
 */
private function closeCell()
{
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    foreach (array('td', 'th') as $cell) {
        if (!$this->elementInScope($cell, true)) {
            continue;
        }

        $endTag = array(
            'name' => $cell,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        // Only one cell is closed.
        break;
    }
}
4783 | |||
/**
 * Returns the document held in $this->dom.
 *
 * @return mixed The value of $this->dom.
 */
public function save()
{
    return $this->dom;
}
4788 | } | ||