From d4949327efa15b492cab1bef3fe074290a328a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Fri, 21 Feb 2014 15:43:14 +0100 Subject: [add] HTML Purifier added to clean code --- .../htmlpurifier/HTMLPurifier/Lexer/DOMLex.php | 280 +++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php') diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php new file mode 100644 index 00000000..b13e6c55 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer/DOMLex.php @@ -0,0 +1,280 @@ +factory = new HTMLPurifier_TokenFactory(); + } + + /** + * @param string $html + * @param HTMLPurifier_Config $config + * @param HTMLPurifier_Context $context + * @return HTMLPurifier_Token[] + */ + public function tokenizeHTML($html, $config, $context) + { + $html = $this->normalize($html, $config, $context); + + // attempt to armor stray angled brackets that cannot possibly + // form tags and thus are probably being used as emoticons + if ($config->get('Core.AggressivelyFixLt')) { + $char = '[^a-z!\/]'; + $comment = "/|\z)/is"; + $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); + do { + $old = $html; + $html = preg_replace("/<($char)/i", '<\\1', $html); + } while ($html !== $old); + $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments + } + + // preprocess html, essential for UTF-8 + $html = $this->wrapHTML($html, $config, $context); + + $doc = new DOMDocument(); + $doc->encoding = 'UTF-8'; // theoretically, the above has this covered + + set_error_handler(array($this, 'muteErrorHandler')); + $doc->loadHTML($html); + restore_error_handler(); + + $tokens = array(); + $this->tokenizeDOM( + $doc->getElementsByTagName('html')->item(0)-> // + getElementsByTagName('body')->item(0)-> // + getElementsByTagName('div')->item(0), //
+ $tokens + ); + return $tokens; + } + + /** + * Iterative function that tokenizes a node, putting it into an accumulator. + * To iterate is human, to recurse divine - L. Peter Deutsch + * @param DOMNode $node DOMNode to be tokenized. + * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @return HTMLPurifier_Token of node appended to previously passed tokens. + */ + protected function tokenizeDOM($node, &$tokens) + { + $level = 0; + $nodes = array($level => new HTMLPurifier_Queue(array($node))); + $closingNodes = array(); + do { + while (!$nodes[$level]->isEmpty()) { + $node = $nodes[$level]->shift(); // FIFO + $collect = $level > 0 ? true : false; + $needEndingTag = $this->createStartNode($node, $tokens, $collect); + if ($needEndingTag) { + $closingNodes[$level][] = $node; + } + if ($node->childNodes && $node->childNodes->length) { + $level++; + $nodes[$level] = new HTMLPurifier_Queue(); + foreach ($node->childNodes as $childNode) { + $nodes[$level]->push($childNode); + } + } + } + $level--; + if ($level && isset($closingNodes[$level])) { + while ($node = array_pop($closingNodes[$level])) { + $this->createEndNode($node, $tokens); + } + } + } while ($level > 0); + } + + /** + * @param DOMNode $node DOMNode to be tokenized. + * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. + * @param bool $collect Says whether or start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @return bool if the token needs an endtoken + * @todo data and tagName properties don't seem to exist in DOMNode? + */ + protected function createStartNode($node, &$tokens, $collect) + { + // intercept non element nodes. WE MUST catch all of them, + // but we're not getting the character reference nodes because + // those should have been preprocessed + if ($node->nodeType === XML_TEXT_NODE) { + $tokens[] = $this->factory->createText($node->data); + return false; + } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { + // undo libxml's special treatment of