diff options
Diffstat (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php')
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php | 280 |
1 files changed, 280 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 00000000..b13e6c55 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php | |||
@@ -0,0 +1,280 @@ | |||
1 | <?php | ||
2 | |||
3 | /** | ||
4 | * Parser that uses PHP 5's DOM extension (part of the core). | ||
5 | * | ||
6 | * In PHP 5, the DOM XML extension was revamped into DOM and added to the core. | ||
7 | * It gives us a forgiving HTML parser, which we use to transform the HTML | ||
8 | * into a DOM, and then into the tokens. It is blazingly fast (for large | ||
9 | * documents, it performs twenty times faster than | ||
10 | * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5. | ||
11 | * | ||
12 | * @note Any empty elements will have empty tokens associated with them, even if | ||
13 | * this is prohibited by the spec. This cannot be fixed until the spec | ||
14 | * comes into play. | ||
15 | * | ||
16 | * @note PHP's DOM extension does not actually parse any entities, we use | ||
17 | * our own function to do that. | ||
18 | * | ||
19 | * @warning DOM tends to drop whitespace, which may wreak havoc on indenting. | ||
20 | * If this is a huge problem, due to the fact that HTML is hand | ||
21 | * edited and you are unable to get a parser cache that caches the | ||
22 | * output of HTML Purifier while keeping the original HTML lying | ||
23 | * around, you may want to run Tidy on the resulting output or use | ||
24 | * HTMLPurifier_Lexer_DirectLex. | ||
25 | */ | ||
26 | |||
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token this lexer emits.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    /**
     * Runs the parent constructor and sets up the token factory.
     */
    public function __construct()
    {
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML fragment into a flat list of tokens by letting
     * DOMDocument parse it and then walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Step 1: protect ampersands inside comments so that step 3
            // can tell our own "&lt;" apart from one the author wrote.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Step 2: escape every "<" that is not followed by a letter,
            // "!" or "/". The pattern consumes the following character, so
            // runs such as "<<3" need several passes; loop until stable.
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // Step 3: comments must stay verbatim, so undo steps 1 and 2
            // inside them.
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html);
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // libxml reports malformed markup through PHP warnings; silence
        // them for the duration of the parse only.
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $root = $doc->getElementsByTagName('html')->item(0); // <html>
        $body = $root->getElementsByTagName('body')->item(0); // <body>
        $div = $body->getElementsByTagName('div')->item(0);   // <div>

        $tokens = array();
        $this->tokenizeDOM($div, $tokens);

        // wrapHTML() puts the fragment in exactly one <div>. If that <div>
        // has a following sibling, a premature </div> in the input closed
        // the wrapper early. Drop the part already tokenized and tokenize
        // whatever is left in <body>, so no content is silently lost.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens);
        }

        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * The root node itself produces no start/end tokens; only its
     * descendants do. Nothing is returned: tokens are appended to $tokens.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     */
    protected function tokenizeDOM($node, &$tokens)
    {
        $level = 0;
        // One FIFO queue of pending nodes per depth level.
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // Elements, per depth level, that still need an end token.
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // Level 0 is the wrapper element, which is never emitted.
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children are handled before the
                    // remaining siblings of the current node.
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            // Current level exhausted: step back up and close the parent.
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token that opens (or fully represents) a node.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param bool $collect Whether start/empty tokens are emitted for an
     *                      element. Set to false for the top-level node
     *                      because that is the implicit wrapper element.
     * @return bool Whether the node needs a matching end token.
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start &&
                ($last->name === 'script' || $last->name === 'style')
            ) {
                // Strip the legacy "<!-- ... -->" wrapper around the
                // script/style body, if there is one.
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        }

        if ($collect) {
            $tokens[] = $this->factory->createStart($node->tagName, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for an element.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes (name => value).
     */
    protected function transformAttrToAssoc($node_map)
    {
        // NamedNodeMap is poorly documented, so we rely on undocumented
        // features, namely, the fact that it implements Iterator and
        // has a ->length attribute
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments. Reverses both callbackArmorCommentEntities() and the
     * "<" escaping done in tokenizeHTML().
     *
     * @param array $matches Regex match: [1] comment body, [2] terminator.
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        // The array form of strtr() makes a single pass, so an armored
        // "&amp;lt;" comes back as the author's original "&lt;".
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param array $matches Regex match: [1] comment body, [2] terminator.
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     *
     * The result is a full document: an optional doctype taken from the
     * HTML definition, a UTF-8 content-type <meta>, and the fragment
     * inside a single <div> in <body>.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    protected function wrapHTML($html, $config, $context)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // A stray </div> inside $html closes this wrapper early;
        // tokenizeHTML() compensates by also tokenizing whatever ends up
        // after the wrapper <div> in <body>.
        $ret .= '</head><body><div>' . $html . '</div></body></html>';
        return $ret;
    }
}
279 | |||
280 | // vim: et sw=4 sts=4 | ||