[github/wallabag/wallabag.git] / inc / 3rdparty / htmlpurifier / HTMLPurifier / Lexer / DOMLex.php

<?php\r
\r
/**\r
 * Parser that uses PHP 5's DOM extension (part of the core).\r
 *\r
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.\r
 * It gives us a forgiving HTML parser, which we use to transform the HTML\r
 * into a DOM, and then into the tokens.  It is blazingly fast (for large\r
 * documents, it performs twenty times faster than\r
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.\r
 *\r
 * @note Any empty elements will have empty tokens associated with them, even if\r
 * this is prohibited by the spec. This cannot be fixed until the spec\r
 * comes into play.\r
 *\r
 * @note PHP's DOM extension does not actually parse any entities, we use\r
 *       our own function to do that.\r
 *\r
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.\r
 *          If this is a huge problem, due to the fact that HTML is hand\r
 *          edited and you are unable to get a parser cache that caches the\r
 *          output of HTML Purifier while keeping the original HTML lying\r
 *          around, you may want to run Tidy on the resulting output or use\r
 *          HTMLPurifier_Lexer_DirectLex.\r
 */\r
\r
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token this lexer emits.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    /**
     * Runs the parent constructor and sets up the token factory.
     */
    public function __construct()
    {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by parsing it with DOMDocument and walking
     * the resulting tree.
     *
     * The fragment is normalized, optionally has stray "<" characters
     * armored (Core.AggressivelyFixLt), is wrapped in a full document by
     * wrapHTML(), and is then loaded with DOMDocument::loadHTML() while all
     * PHP errors are muted. The contents of the wrapper <div> are converted
     * into tokens; the wrapper itself never produces tokens.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // protect ampersands inside comments first, so that the undo
            // pass below can tell our substitutions from original text
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // repeat until stable: a single pass cannot catch runs such
            // as "<<<", because each match consumes the following character
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // silence everything raised while parsing the (possibly broken) markup
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  //   <body>
        $div = $body->getElementsByTagName('div')->item(0);   //     <div>

        $tokens = array();
        $this->tokenizeDOM($div, $tokens);

        // wrapHTML() makes the wrapper <div> the only child of <body>. If
        // it has a following sibling, the fragment presumably contained a
        // premature </div> that closed the wrapper early. Drop the part
        // already tokenized and tokenize what is left of <body>, so the
        // trailing content is not lost. <body> is passed (rather than the
        // sibling) because the node handed to tokenizeDOM() emits no
        // start/end tokens of its own, and the sibling's tags must be kept.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens);
        }
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * The tree is walked depth-first using one queue of pending nodes per
     * nesting level. The node passed in (level 0) contributes no start or
     * end token of its own; only its descendants do.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens;
     *                                       new tokens are appended to it.
     * @return void
     */
    protected function tokenizeDOM($node, &$tokens)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // the root node (level 0) is only a container
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // descend: the children get processed before the
                    // remaining siblings of the current level
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            // the current level is exhausted: go back up and close the
            // element whose children were just emitted (never level 0)
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token that opens (or fully represents) a single DOM node.
     *
     * Text, CDATA and comment nodes become text/comment tokens. Element
     * nodes become an empty token when they have no children and a start
     * token otherwise. Any other node type is ignored.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
     * @param bool $collect  Says whether or start and close are collected, set to
     *                    false at first recursion because it's the implicit DIV
     *                    tag you're dealing with.
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip an enclosing "<!-- ... -->" from the contents; the
                // data is only trimmed when that opening marker is present
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it gets dropped
                    $attr
                );
            }
            // true even when $collect is false; the caller decides whether
            // the matching end token is actually emitted
            return true;
        }
    }

    /**
     * Appends the end token that closes the given element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes (name => value).
     */
    protected function transformAttrToAssoc($node_map)
    {
        // NamedNodeMap is not documented very well, so we're using
        // undocumented features, namely, the fact that it implements
        // Iterator and has a ->length attribute
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors.
     *
     * The body is intentionally empty: it is installed around
     * DOMDocument::loadHTML() in tokenizeHTML() so that nothing raised
     * while parsing is reported.
     *
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments. Turns "&amp;" back into "&" and "&lt;" back into "<"
     * inside the comment body.
     *
     * @param array $matches Match of the comment regex: [1] is the comment
     *                       body, [2] is the terminator ("-->" or empty).
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param array $matches Match of the comment regex: [1] is the comment
     *                       body, [2] is the terminator ("-->" or empty).
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML: an optional DOCTYPE
     * built from the configured HTML definition, a <head> declaring UTF-8,
     * and a <body> holding the fragment inside a single <div>.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    protected function wrapHTML($html, $config, $context)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // A stray </div> in $html closes the wrapper early; tokenizeHTML()
        // compensates by also tokenizing whatever follows the wrapper.
        $ret .= '</head><body><div>' . $html . '</div></body></html>';
        return $ret;
    }
}
\r
// vim: et sw=4 sts=4\r
Commit	Line	Data
d4949327 NL	1	<?php\r
	2	\r
	3	/**\r
	4	* Parser that uses PHP 5's DOM extension (part of the core).\r
	5	*\r
	6	* In PHP 5, the DOM XML extension was revamped into DOM and added to the core.\r
	7	* It gives us a forgiving HTML parser, which we use to transform the HTML\r
	8	* into a DOM, and then into the tokens. It is blazingly fast (for large\r
	9	* documents, it performs twenty times faster than\r
	10	* HTMLPurifier_Lexer_DirectLex,and is the default choice for PHP 5.\r
	11	*\r
	12	* @note Any empty elements will have empty tokens associated with them, even if\r
	13	* this is prohibited by the spec. This is cannot be fixed until the spec\r
	14	* comes into play.\r
	15	*\r
	16	* @note PHP's DOM extension does not actually parse any entities, we use\r
	17	* our own function to do that.\r
	18	*\r
	19	* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.\r
	20	* If this is a huge problem, due to the fact that HTML is hand\r
	21	* edited and you are unable to get a parser cache that caches the\r
	22	* the output of HTML Purifier while keeping the original HTML lying\r
	23	* around, you may want to run Tidy on the resulting output or use\r
	24	* HTMLPurifier_DirectLex\r
	25	*/\r
	26	\r
	27	class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer\r
	28	{\r
	29	\r
	30	/**\r
	31	* @type HTMLPurifier_TokenFactory\r
	32	*/\r
	33	private $factory;\r
	34	\r
	35	public function __construct()\r
	36	{\r
	37	// setup the factory\r
	38	parent::__construct();\r
	39	$this->factory = new HTMLPurifier_TokenFactory();\r
	40	}\r
	41	\r
	42	/**\r
	43	* @param string $html\r
	44	* @param HTMLPurifier_Config $config\r
	45	* @param HTMLPurifier_Context $context\r
	46	* @return HTMLPurifier_Token[]\r
	47	*/\r
	48	public function tokenizeHTML($html, $config, $context)\r
	49	{\r
	50	$html = $this->normalize($html, $config, $context);\r
	51	\r
	52	// attempt to armor stray angled brackets that cannot possibly\r
	53	// form tags and thus are probably being used as emoticons\r
	54	if ($config->get('Core.AggressivelyFixLt')) {\r
	55	$char = '[^a-z!\/]';\r
	56	$comment = "/<!--(.*?)(-->|\z)/is";\r
	57	$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);\r
	58	do {\r
	59	$old = $html;\r
	60	$html = preg_replace("/<($char)/i", '&lt;\\1', $html);\r
	61	} while ($html !== $old);\r
	62	$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments\r
	63	}\r
	64	\r
65	// preprocess html, essential for UTF-8\r
66	$html = $this->wrapHTML($html, $config, $context);\r
67	\r
68	$doc = new DOMDocument();\r
69	$doc->encoding = 'UTF-8'; // theoretically, the above has this covered\r
70	\r
71	set_error_handler(array($this, 'muteErrorHandler'));\r
72	$doc->loadHTML($html);\r
73	restore_error_handler();\r
74	\r
75	$tokens = array();\r
76	$this->tokenizeDOM(\r
77	$doc->getElementsByTagName('html')->item(0)-> // <html>\r
78	getElementsByTagName('body')->item(0)-> // <body>\r
79	getElementsByTagName('div')->item(0), // <div>\r
80	$tokens\r
81	);\r
82	return $tokens;\r
83	}\r
84	\r
85	/**\r
86	* Iterative function that tokenizes a node, putting it into an accumulator.\r
87	* To iterate is human, to recurse divine - L. Peter Deutsch\r
88	* @param DOMNode $node DOMNode to be tokenized.\r
89	* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.\r
90	* @return HTMLPurifier_Token of node appended to previously passed tokens.\r
91	*/\r
92	protected function tokenizeDOM($node, &$tokens)\r
93	{\r
94	$level = 0;\r
95	$nodes = array($level => new HTMLPurifier_Queue(array($node)));\r
96	$closingNodes = array();\r
97	do {\r
98	while (!$nodes[$level]->isEmpty()) {\r
99	$node = $nodes[$level]->shift(); // FIFO\r
100	$collect = $level > 0 ? true : false;\r
101	$needEndingTag = $this->createStartNode($node, $tokens, $collect);\r
102	if ($needEndingTag) {\r
103	$closingNodes[$level][] = $node;\r
104	}\r
105	if ($node->childNodes && $node->childNodes->length) {\r
106	$level++;\r
107	$nodes[$level] = new HTMLPurifier_Queue();\r
108	foreach ($node->childNodes as $childNode) {\r
109	$nodes[$level]->push($childNode);\r
110	}\r
111	}\r
112	}\r
113	$level--;\r
114	if ($level && isset($closingNodes[$level])) {\r
115	while ($node = array_pop($closingNodes[$level])) {\r
116	$this->createEndNode($node, $tokens);\r
117	}\r
118	}\r
119	} while ($level > 0);\r
120	}\r
121	\r
122	/**\r
123	* @param DOMNode $node DOMNode to be tokenized.\r
124	* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.\r
125	* @param bool $collect Says whether or start and close are collected, set to\r
126	* false at first recursion because it's the implicit DIV\r
127	* tag you're dealing with.\r
128	* @return bool if the token needs an endtoken\r
129	* @todo data and tagName properties don't seem to exist in DOMNode?\r
130	*/\r
131	protected function createStartNode($node, &$tokens, $collect)\r
132	{\r
133	// intercept non element nodes. WE MUST catch all of them,\r
134	// but we're not getting the character reference nodes because\r
135	// those should have been preprocessed\r
136	if ($node->nodeType === XML_TEXT_NODE) {\r
137	$tokens[] = $this->factory->createText($node->data);\r
138	return false;\r
139	} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {\r
140	// undo libxml's special treatment of <script> and <style> tags\r
141	$last = end($tokens);\r
142	$data = $node->data;\r
143	// (note $node->tagname is already normalized)\r
	144	if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {\r
145	$new_data = trim($data);\r
146	if (substr($new_data, 0, 4) === '<!--') {\r
147	$data = substr($new_data, 4);\r
148	if (substr($data, -3) === '-->') {\r
149	$data = substr($data, 0, -3);\r
150	} else {\r
151	// Highly suspicious! Not sure what to do...\r
152	}\r
153	}\r
154	}\r
155	$tokens[] = $this->factory->createText($this->parseData($data));\r
156	return false;\r
157	} elseif ($node->nodeType === XML_COMMENT_NODE) {\r
158	// this is code is only invoked for comments in script/style in versions\r
159	// of libxml pre-2.6.28 (regular comments, of course, are still\r
160	// handled regularly)\r
161	$tokens[] = $this->factory->createComment($node->data);\r
162	return false;\r
163	} elseif ($node->nodeType !== XML_ELEMENT_NODE) {\r
164	// not-well tested: there may be other nodes we have to grab\r
165	return false;\r
166	}\r
167	\r
168	$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();\r
169	\r
170	// We still have to make sure that the element actually IS empty\r
171	if (!$node->childNodes->length) {\r
172	if ($collect) {\r
173	$tokens[] = $this->factory->createEmpty($node->tagName, $attr);\r
174	}\r
175	return false;\r
176	} else {\r
177	if ($collect) {\r
178	$tokens[] = $this->factory->createStart(\r
179	$tag_name = $node->tagName, // somehow, it get's dropped\r
180	$attr\r
181	);\r
182	}\r
183	return true;\r
184	}\r
185	}\r
186	\r
187	/**\r
188	* @param DOMNode $node\r
189	* @param HTMLPurifier_Token[] $tokens\r
190	*/\r
191	protected function createEndNode($node, &$tokens)\r
192	{\r
193	$tokens[] = $this->factory->createEnd($node->tagName);\r
194	}\r
195	\r
196	\r
197	/**\r
198	* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.\r
199	*\r
200	* @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.\r
201	* @return array Associative array of attributes.\r
202	*/\r
203	protected function transformAttrToAssoc($node_map)\r
204	{\r
205	// NamedNodeMap is documented very well, so we're using undocumented\r
206	// features, namely, the fact that it implements Iterator and\r
207	// has a ->length attribute\r
208	if ($node_map->length === 0) {\r
209	return array();\r
210	}\r
211	$array = array();\r
212	foreach ($node_map as $attr) {\r
213	$array[$attr->name] = $attr->value;\r
214	}\r
215	return $array;\r
216	}\r
217	\r
218	/**\r
219	* An error handler that mutes all errors\r
220	* @param int $errno\r
221	* @param string $errstr\r
222	*/\r
223	public function muteErrorHandler($errno, $errstr)\r
224	{\r
225	}\r
226	\r
227	/**\r
228	* Callback function for undoing escaping of stray angled brackets\r
229	* in comments\r
230	* @param array $matches\r
231	* @return string\r
232	*/\r
233	public function callbackUndoCommentSubst($matches)\r
234	{\r
	235	return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];\r
236	}\r
237	\r
238	/**\r
239	* Callback function that entity-izes ampersands in comments so that\r
240	* callbackUndoCommentSubst doesn't clobber them\r
241	* @param array $matches\r
242	* @return string\r
243	*/\r
244	public function callbackArmorCommentEntities($matches)\r
245	{\r
	246	return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];\r
247	}\r
248	\r
249	/**\r
250	* Wraps an HTML fragment in the necessary HTML\r
251	* @param string $html\r
252	* @param HTMLPurifier_Config $config\r
253	* @param HTMLPurifier_Context $context\r
254	* @return string\r
255	*/\r
256	protected function wrapHTML($html, $config, $context)\r
257	{\r
258	$def = $config->getDefinition('HTML');\r
259	$ret = '';\r
260	\r
	261	if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {\r
262	$ret .= '<!DOCTYPE html ';\r
263	if (!empty($def->doctype->dtdPublic)) {\r
264	$ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';\r
265	}\r
266	if (!empty($def->doctype->dtdSystem)) {\r
267	$ret .= '"' . $def->doctype->dtdSystem . '" ';\r
268	}\r
269	$ret .= '>';\r
270	}\r
271	\r
272	$ret .= '<html><head>';\r
273	$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';\r
274	// No protection if $html contains a stray </div>!\r
275	$ret .= '</head><body><div>' . $html . '</div></body></html>';\r
276	return $ret;\r
277	}\r
278	}\r
279	\r
280	// vim: et sw=4 sts=4\r