1 files changed, 280 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php
new file mode 100644
index 00000000..b13e6c55
--- /dev/null
+++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php
@@ -0,0 +1,280 @@
+<?php
+/**
+ * Parser that uses PHP 5's DOM extension (part of the core).
+ *
+ * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
+ * It gives us a forgiving HTML parser, which we use to transform the HTML
+ * into a DOM, and then into the tokens.  It is blazingly fast (for large
+ * documents, it performs twenty times faster than
+ * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
+ *
+ * @note Any empty elements will have empty tokens associated with them, even if
+ * this is prohibited by the spec. This cannot be fixed until the spec
+ * comes into play.
+ *
+ * @note PHP's DOM extension does not actually parse any entities, we use
+ *       our own function to do that.
+ *
+ * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
+ *          If this is a huge problem, due to the fact that HTML is hand
+ *          edited and you are unable to get a parser cache that caches the
+ *          output of HTML Purifier while keeping the original HTML lying
+ *          around, you may want to run Tidy on the resulting output or use
+ *          HTMLPurifier_Lexer_DirectLex.
+ */
+class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
+{
+    /**
+     * Factory used to build every token this lexer emits.
+     * @type HTMLPurifier_TokenFactory
+     */
+    private $factory;
+    /**
+     * Runs the parent constructor, then creates the token factory.
+     */
+    public function __construct()
+    {
+        // setup the factory
+        parent::__construct();
+        $this->factory = new HTMLPurifier_TokenFactory();
+    }
+    /**
+     * Turns an HTML fragment into a flat list of tokens.
+     *
+     * The fragment is normalized, optionally has its stray '<' characters
+     * armored (Core.AggressivelyFixLt), is wrapped into a complete document
+     * by wrapHTML(), parsed with DOMDocument::loadHTML() while PHP errors are
+     * muted, and finally the wrapper <div> is walked by tokenizeDOM().
+     *
+     * @param string $html
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return HTMLPurifier_Token[]
+     */
+    public function tokenizeHTML($html, $config, $context)
+    {
+        $html = $this->normalize($html, $config, $context);
+        // attempt to armor stray angled brackets that cannot possibly
+        // form tags and thus are probably being used as emoticons
+        if ($config->get('Core.AggressivelyFixLt')) {
+            // a '<' is "stray" when the next character is not a letter, '!' or '/'
+            $char = '[^a-z!\/]';
+            // matches a comment, including one left unterminated at end of input
+            $comment = "/<!--(.*?)(-->|\z)/is";
+            // step 1: escape '&' inside comments so step 3 can be undone exactly
+            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
+            // step 2: escape stray '<' everywhere; repeat until nothing changes
+            do {
+                $old = $html;
+                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
+            } while ($html !== $old);
+            // step 3: restore the original text inside comments
+            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
+        }
+        // preprocess html, essential for UTF-8
+        $html = $this->wrapHTML($html, $config, $context);
+        $doc = new DOMDocument();
+        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
+        // parse problems are reported as PHP errors; discard them while parsing
+        set_error_handler(array($this, 'muteErrorHandler'));
+        $doc->loadHTML($html);
+        restore_error_handler();
+        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
+            getElementsByTagName('body')->item(0); //           <body>
+        $div = $body->getElementsByTagName('div')->item(0); //  <div> added by wrapHTML()
+        $tokens = array();
+        $this->tokenizeDOM($div, $tokens);
+        // If the wrapper <div> has a sibling, the input contained a premature
+        // </div> that closed the wrapper early, and the remaining content was
+        // parsed as further children of <body>. Drop the part that has already
+        // been tokenized and tokenize what is left of <body>, so that the rest
+        // of the input is not silently discarded.
+        if ($div->nextSibling) {
+            $body->removeChild($div);
+            $this->tokenizeDOM($body, $tokens);
+        }
+        return $tokens;
+    }
+    /**
+     * Iterative function that tokenizes a node, putting it into an accumulator.
+     * To iterate is human, to recurse divine - L. Peter Deutsch
+     *
+     * One queue of pending nodes is kept per nesting level. When a node has
+     * children they are queued one level deeper and handled before the node's
+     * remaining siblings; once a level's queue is drained, the end tokens for
+     * the elements recorded one level up are appended. The node passed in
+     * (level 0) is only a container: no start or end token is produced for it.
+     *
+     * @param DOMNode $node DOMNode to be tokenized.
+     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens;
+     *                    new tokens are appended to it by reference.
+     * @return void
+     */
+    protected function tokenizeDOM($node, &$tokens)
+    {
+        $level = 0;
+        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
+        $closingNodes = array();
+        do {
+            while (!$nodes[$level]->isEmpty()) {
+                $node = $nodes[$level]->shift(); // FIFO
+                // the level-0 node is the implicit container, never emitted
+                $collect = $level > 0;
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                if ($needEndingTag) {
+                    $closingNodes[$level][] = $node;
+                }
+                if ($node->childNodes && $node->childNodes->length) {
+                    // descend: the loop condition now reads the children's queue
+                    $level++;
+                    $nodes[$level] = new HTMLPurifier_Queue();
+                    foreach ($node->childNodes as $childNode) {
+                        $nodes[$level]->push($childNode);
+                    }
+                }
+            }
+            $level--;
+            // "$level &&" skips level 0, so the container gets no end token
+            if ($level && isset($closingNodes[$level])) {
+                while ($node = array_pop($closingNodes[$level])) {
+                    $this->createEndNode($node, $tokens);
+                }
+            }
+        } while ($level > 0);
+    }
+    /**
+     * Appends the token that opens (or fully represents) a single DOM node.
+     *
+     * Text, CDATA and comment nodes become text/comment tokens; any other
+     * non-element node is ignored. An element without children becomes an
+     * empty token, an element with children becomes a start token.
+     *
+     * @param DOMNode $node DOMNode to be tokenized.
+     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
+     * @param bool $collect  Says whether or not start and close are collected, set to
+     *                    false at first recursion because it's the implicit DIV
+     *                    tag you're dealing with.
+     * @return bool if the token needs an endtoken
+     * @todo data and tagName properties don't seem to exist in DOMNode?
+     */
+    protected function createStartNode($node, &$tokens, $collect)
+    {
+        // intercept non element nodes. WE MUST catch all of them,
+        // but we're not getting the character reference nodes because
+        // those should have been preprocessed
+        if ($node->nodeType === XML_TEXT_NODE) {
+            $tokens[] = $this->factory->createText($node->data);
+            return false;
+        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
+            // undo libxml's special treatment of <script> and <style> tags
+            $last = end($tokens);
+            $data = $node->data;
+            // (note $node->tagname is already normalized)
+            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
+                // strip a surrounding <!-- ... --> wrapper from the contents
+                $new_data = trim($data);
+                if (substr($new_data, 0, 4) === '<!--') {
+                    $data = substr($new_data, 4);
+                    if (substr($data, -3) === '-->') {
+                        $data = substr($data, 0, -3);
+                    } else {
+                        // Highly suspicious! Not sure what to do...
+                    }
+                }
+            }
+            $tokens[] = $this->factory->createText($this->parseData($data));
+            return false;
+        } elseif ($node->nodeType === XML_COMMENT_NODE) {
+            // this code is only invoked for comments in script/style in versions
+            // of libxml pre-2.6.28 (regular comments, of course, are still
+            // handled regularly)
+            $tokens[] = $this->factory->createComment($node->data);
+            return false;
+        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
+            // not-well tested: there may be other nodes we have to grab
+            return false;
+        }
+        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
+        // We still have to make sure that the element actually IS empty
+        if (!$node->childNodes->length) {
+            if ($collect) {
+                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
+            }
+            return false;
+        } else {
+            if ($collect) {
+                $tokens[] = $this->factory->createStart(
+                    // NOTE(review): the local assignment is kept from the original,
+                    // whose comment said the name "somehow gets dropped" otherwise
+                    $tag_name = $node->tagName,
+                    $attr
+                );
+            }
+            // an end token is required even when the start was not collected;
+            // tokenizeDOM() decides whether it is actually emitted
+            return true;
+        }
+    }
+    /**
+     * Appends the end token for an element node.
+     *
+     * @param DOMNode $node
+     * @param HTMLPurifier_Token[] $tokens
+     */
+    protected function createEndNode($node, &$tokens)
+    {
+        $tokens[] = $this->factory->createEnd($node->tagName);
+    }
+    /**
+     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
+     *
+     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
+     * @return array Associative array of attributes (name => value).
+     */
+    protected function transformAttrToAssoc($node_map)
+    {
+        // NamedNodeMap isn't documented very well, so we're using undocumented
+        // features, namely, the fact that it implements Iterator and
+        // has a ->length attribute
+        if ($node_map->length === 0) {
+            return array();
+        }
+        $array = array();
+        foreach ($node_map as $attr) {
+            $array[$attr->name] = $attr->value;
+        }
+        return $array;
+    }
+    /**
+     * An error handler that mutes all errors.
+     *
+     * The body is intentionally empty: tokenizeHTML() installs this handler
+     * around DOMDocument::loadHTML() so that parse warnings are discarded.
+     *
+     * @param int $errno
+     * @param string $errstr
+     */
+    public function muteErrorHandler($errno, $errstr)
+    {
+    }
+    /**
+     * Callback function for undoing escaping of stray angled brackets
+     * in comments. Reverses both the '&' armoring and the '<' escaping
+     * inside the comment body; the terminator is passed through as matched.
+     *
+     * @param array $matches [1] is the comment body, [2] is '-->' or empty
+     * @return string
+     */
+    public function callbackUndoCommentSubst($matches)
+    {
+        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
+    }
+    /**
+     * Callback function that entity-izes ampersands in comments so that
+     * callbackUndoCommentSubst doesn't clobber them
+     *
+     * @param array $matches [1] is the comment body, [2] is '-->' or empty
+     * @return string
+     */
+    public function callbackArmorCommentEntities($matches)
+    {
+        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
+    }
+    /**
+     * Wraps an HTML fragment in the necessary HTML: an optional DOCTYPE built
+     * from the HTML definition's doctype, a <head> declaring UTF-8, and a
+     * <body><div> container around the fragment.
+     *
+     * @param string $html
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return string
+     */
+    protected function wrapHTML($html, $config, $context)
+    {
+        $def = $config->getDefinition('HTML');
+        $ret = '';
+        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
+            $ret .= '<!DOCTYPE html ';
+            if (!empty($def->doctype->dtdPublic)) {
+                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
+            }
+            if (!empty($def->doctype->dtdSystem)) {
+                $ret .= '"' . $def->doctype->dtdSystem . '" ';
+            }
+            $ret .= '>';
+        }
+        $ret .= '<html><head>';
+        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
+        // A stray </div> in $html closes this wrapper early; tokenizeHTML()
+        // detects that case and also tokenizes the content that follows it.
+        $ret .= '</head><body><div>' . $html . '</div></body></html>';
+        return $ret;
+    }
+}
+// vim: et sw=4 sts=4

diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 00000000..b13e6c55 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php
@@ -0,0 +1,280 @@
	1	<?php
	2
	3	/**
	4	* Parser that uses PHP 5's DOM extension (part of the core).
	5	*
	6	* In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
	7	* It gives us a forgiving HTML parser, which we use to transform the HTML
	8	* into a DOM, and then into the tokens. It is blazingly fast (for large
	9	* documents, it performs twenty times faster than
	10	* HTMLPurifier_Lexer_DirectLex,and is the default choice for PHP 5.
	11	*
	12	* @note Any empty elements will have empty tokens associated with them, even if
	13	* this is prohibited by the spec. This is cannot be fixed until the spec
	14	* comes into play.
	15	*
	16	* @note PHP's DOM extension does not actually parse any entities, we use
	17	* our own function to do that.
	18	*
	19	* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
	20	* If this is a huge problem, due to the fact that HTML is hand
	21	* edited and you are unable to get a parser cache that caches the
	22	* the output of HTML Purifier while keeping the original HTML lying
	23	* around, you may want to run Tidy on the resulting output or use
	24	* HTMLPurifier_DirectLex
	25	*/
	26
	27	class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
	28	{
	29
	30	/**
	31	* @type HTMLPurifier_TokenFactory
	32	*/
	33	private $factory;
	34
	35	public function __construct()
	36	{
	37	// setup the factory
	38	parent::__construct();
	39	$this->factory = new HTMLPurifier_TokenFactory();
	40	}
	41
	42	/**
	43	* @param string $html
	44	* @param HTMLPurifier_Config $config
	45	* @param HTMLPurifier_Context $context
	46	* @return HTMLPurifier_Token[]
	47	*/
	48	public function tokenizeHTML($html, $config, $context)
	49	{
	50	$html = $this->normalize($html, $config, $context);
	51
	52	// attempt to armor stray angled brackets that cannot possibly
	53	// form tags and thus are probably being used as emoticons
	54	if ($config->get('Core.AggressivelyFixLt')) {
	55	$char = '[^a-z!\/]';
	56	$comment = "/<!--(.*?)(-->\|\z)/is";
	57	$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
	58	do {
	59	$old = $html;
	60	$html = preg_replace("/<($char)/i", '<\\1', $html);
	61	} while ($html !== $old);
	62	$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
	63	}
	64
	65	// preprocess html, essential for UTF-8
	66	$html = $this->wrapHTML($html, $config, $context);
	67
	68	$doc = new DOMDocument();
	69	$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
	70
	71	set_error_handler(array($this, 'muteErrorHandler'));
	72	$doc->loadHTML($html);
	73	restore_error_handler();
	74
	75	$tokens = array();
	76	$this->tokenizeDOM(
	77	$doc->getElementsByTagName('html')->item(0)-> // <html>
	78	getElementsByTagName('body')->item(0)-> // <body>
	79	getElementsByTagName('div')->item(0), // <div>
	80	$tokens
	81	);
	82	return $tokens;
	83	}
	84
	85	/**
	86	* Iterative function that tokenizes a node, putting it into an accumulator.
	87	* To iterate is human, to recurse divine - L. Peter Deutsch
	88	* @param DOMNode $node DOMNode to be tokenized.
	89	* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
	90	* @return HTMLPurifier_Token of node appended to previously passed tokens.
	91	*/
	92	protected function tokenizeDOM($node, &$tokens)
	93	{
	94	$level = 0;
	95	$nodes = array($level => new HTMLPurifier_Queue(array($node)));
	96	$closingNodes = array();
	97	do {
	98	while (!$nodes[$level]->isEmpty()) {
	99	$node = $nodes[$level]->shift(); // FIFO
	100	$collect = $level > 0 ? true : false;
	101	$needEndingTag = $this->createStartNode($node, $tokens, $collect);
	102	if ($needEndingTag) {
	103	$closingNodes[$level][] = $node;
	104	}
	105	if ($node->childNodes && $node->childNodes->length) {
	106	$level++;
	107	$nodes[$level] = new HTMLPurifier_Queue();
	108	foreach ($node->childNodes as $childNode) {
	109	$nodes[$level]->push($childNode);
	110	}
	111	}
	112	}
	113	$level--;
	114	if ($level && isset($closingNodes[$level])) {
	115	while ($node = array_pop($closingNodes[$level])) {
	116	$this->createEndNode($node, $tokens);
	117	}
	118	}
	119	} while ($level > 0);
	120	}
	121
	122	/**
	123	* @param DOMNode $node DOMNode to be tokenized.
	124	* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
	125	* @param bool $collect Says whether or start and close are collected, set to
	126	* false at first recursion because it's the implicit DIV
	127	* tag you're dealing with.
	128	* @return bool if the token needs an endtoken
	129	* @todo data and tagName properties don't seem to exist in DOMNode?
	130	*/
	131	protected function createStartNode($node, &$tokens, $collect)
	132	{
	133	// intercept non element nodes. WE MUST catch all of them,
	134	// but we're not getting the character reference nodes because
	135	// those should have been preprocessed
	136	if ($node->nodeType === XML_TEXT_NODE) {
	137	$tokens[] = $this->factory->createText($node->data);
	138	return false;
	139	} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
	140	// undo libxml's special treatment of <script> and <style> tags
	141	$last = end($tokens);
	142	$data = $node->data;
	143	// (note $node->tagname is already normalized)
	144	if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' \|\| $last->name == 'style')) {
	145	$new_data = trim($data);
	146	if (substr($new_data, 0, 4) === '<!--') {
	147	$data = substr($new_data, 4);
	148	if (substr($data, -3) === '-->') {
	149	$data = substr($data, 0, -3);
	150	} else {
	151	// Highly suspicious! Not sure what to do...
	152	}
	153	}
	154	}
	155	$tokens[] = $this->factory->createText($this->parseData($data));
	156	return false;
	157	} elseif ($node->nodeType === XML_COMMENT_NODE) {
	158	// this is code is only invoked for comments in script/style in versions
	159	// of libxml pre-2.6.28 (regular comments, of course, are still
	160	// handled regularly)
	161	$tokens[] = $this->factory->createComment($node->data);
	162	return false;
	163	} elseif ($node->nodeType !== XML_ELEMENT_NODE) {
	164	// not-well tested: there may be other nodes we have to grab
	165	return false;
	166	}
	167
	168	$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
	169
	170	// We still have to make sure that the element actually IS empty
	171	if (!$node->childNodes->length) {
	172	if ($collect) {
	173	$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
	174	}
	175	return false;
	176	} else {
	177	if ($collect) {
	178	$tokens[] = $this->factory->createStart(
	179	$tag_name = $node->tagName, // somehow, it get's dropped
	180	$attr
	181	);
	182	}
	183	return true;
	184	}
	185	}
	186
	187	/**
	188	* @param DOMNode $node
	189	* @param HTMLPurifier_Token[] $tokens
	190	*/
	191	protected function createEndNode($node, &$tokens)
	192	{
	193	$tokens[] = $this->factory->createEnd($node->tagName);
	194	}
	195
	196
	197	/**
	198	* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
	199	*
	200	* @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
	201	* @return array Associative array of attributes.
	202	*/
	203	protected function transformAttrToAssoc($node_map)
	204	{
	205	// NamedNodeMap is documented very well, so we're using undocumented
	206	// features, namely, the fact that it implements Iterator and
	207	// has a ->length attribute
	208	if ($node_map->length === 0) {
	209	return array();
	210	}
	211	$array = array();
	212	foreach ($node_map as $attr) {
	213	$array[$attr->name] = $attr->value;
	214	}
	215	return $array;
	216	}
	217
	218	/**
	219	* An error handler that mutes all errors
	220	* @param int $errno
	221	* @param string $errstr
	222	*/
	223	public function muteErrorHandler($errno, $errstr)
	224	{
	225	}
	226
	227	/**
	228	* Callback function for undoing escaping of stray angled brackets
	229	* in comments
	230	* @param array $matches
	231	* @return string
	232	*/
	233	public function callbackUndoCommentSubst($matches)
	234	{
	235	return '<!--' . strtr($matches[1], array('&' => '&', '<' => '<')) . $matches[2];
	236	}
	237
	238	/**
	239	* Callback function that entity-izes ampersands in comments so that
	240	* callbackUndoCommentSubst doesn't clobber them
	241	* @param array $matches
	242	* @return string
	243	*/
	244	public function callbackArmorCommentEntities($matches)
	245	{
	246	return '<!--' . str_replace('&', '&', $matches[1]) . $matches[2];
	247	}
	248
	249	/**
	250	* Wraps an HTML fragment in the necessary HTML
	251	* @param string $html
	252	* @param HTMLPurifier_Config $config
	253	* @param HTMLPurifier_Context $context
	254	* @return string
	255	*/
	256	protected function wrapHTML($html, $config, $context)
	257	{
	258	$def = $config->getDefinition('HTML');
	259	$ret = '';
	260
	261	if (!empty($def->doctype->dtdPublic) \|\| !empty($def->doctype->dtdSystem)) {
	262	$ret .= '<!DOCTYPE html ';
	263	if (!empty($def->doctype->dtdPublic)) {
	264	$ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
	265	}
	266	if (!empty($def->doctype->dtdSystem)) {
	267	$ret .= '"' . $def->doctype->dtdSystem . '" ';
	268	}
	269	$ret .= '>';
	270	}
	271
	272	$ret .= '<html><head>';
	273	$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
	274	// No protection if $html contains a stray </div>!
	275	$ret .= '</head><body><div>' . $html . '</div></body></html>';
	276	return $ret;
	277	}
	278	}
	279
	280	// vim: et sw=4 sts=4