Merge pull request #481 from wallabag/dev1.5.2

1.5.2
author: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-02-21 15:57:10 +0100
committer: Nicolas Lœuillet <nicolas@loeuillet.org> 2014-02-21 15:57:10 +0100
commit: 99679d06884120c57f43b44e55e03595f1f87bed (patch)
tree: a3f2a1aa1afdaeca1386d0c6e8a75344fd2241fb /inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
parent: 655214ab30ee84884dc408488b85586f36263fcb (diff)
parent: d3b47e94705e17b3ba3529cbb1dc6efe69c5d2b7 (diff)
download: wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.gz
wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.zst
wallabag-99679d06884120c57f43b44e55e03595f1f87bed.zip
1 files changed, 357 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
new file mode 100644
index 00000000..2a9a9d62
--- /dev/null
+++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
@@ -0,0 +1,357 @@
+<?php
+/**
+ * Forgivingly lexes HTML (SGML-style) markup into tokens.
+ *
+ * A lexer parses a string of SGML-style markup and converts them into
+ * corresponding tokens.  It doesn't check for well-formedness, although its
+ * internal mechanism may make this automatic (such as the case of
+ * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
+ * from.
+ *
+ * A lexer is HTML-oriented: it might work with XML, but it's not
+ * recommended, as we adhere to a subset of the specification for optimization
+ * reasons. This might change in the future. Also, most tokenizers are not
+ * expected to handle DTDs or PIs.
+ *
+ * This class should not be directly instantiated, but you may use create() to
+ * retrieve a default copy of the lexer.  Being a supertype, this class
+ * does not actually define any implementation, but offers commonly used
+ * convenience functions for subclasses.
+ *
+ * @note The unit tests will instantiate this class for testing purposes, as
+ *       many of the utility functions require a class to be instantiated.
+ *       This means that, even though this class is not runnable, it will
+ *       not be declared abstract.
+ *
+ * @par
+ *
+ * @note
+ * We use tokens rather than create a DOM representation because DOM would:
+ *
+ * @par
+ *  -# Require more processing and memory to create,
+ *  -# Is not streamable, and
+ *  -# Has the entire document structure (html and body not needed).
+ *
+ * @par
+ * However, DOM is helpful in that it makes it easy to move around nodes
+ * without a lot of lookaheads to see when a tag is closed. This is a
+ * limitation of the token system and some workarounds would be nice.
+ */
+class HTMLPurifier_Lexer
+{
+    /**
+     * Whether or not this lexer implements line-number/column-number tracking.
+     * If it does, set to true.  create() reads this flag and throws when
+     * Core.MaintainLineNumbers or Core.CollectErrors is enabled but the
+     * selected lexer leaves it false.
+     * @type bool
+     */
+    public $tracksLineNumbers = false;
+    // -- STATIC ----------------------------------------------------------
+    /**
+     * Retrieves or sets the default Lexer as a Prototype Factory.
+     *
+     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
+     * a few exceptions involving special features that only DirectLex
+     * implements.
+     *
+     * Selection order for the value of %Core.LexerImpl:
+     *  -# an object is returned as-is;
+     *  -# a string must be one of 'DOMLex', 'DirectLex' or 'PH5P';
+     *  -# null triggers auto-detection: DirectLex when line numbers or
+     *     error collection are requested or DOM is unusable, DOMLex otherwise.
+     *
+     * @note The behavior of this class has changed, rather than accepting
+     *       a prototype object, it now accepts a configuration object.
+     *       To specify your own prototype, set %Core.LexerImpl to it.
+     *       This change in behavior de-singletonizes the lexer object.
+     *
+     * @note Passing a prototype directly is still tolerated: it raises an
+     *       E_USER_WARNING and the argument is used as the lexer
+     *       implementation.  There is no configuration to consult in that
+     *       case, so line-number tracking is treated as not requested.
+     *
+     * @param HTMLPurifier_Config $config
+     * @return HTMLPurifier_Lexer
+     * @throws HTMLPurifier_Exception if the lexer name is not recognized, or
+     *         if tracking is required and the lexer does not support it.
+     */
+    public static function create($config)
+    {
+        if ($config instanceof HTMLPurifier_Config) {
+            $lexer = $config->get('Core.LexerImpl');
+            $needs_tracking =
+                $config->get('Core.MaintainLineNumbers') ||
+                $config->get('Core.CollectErrors');
+        } else {
+            // Deprecated calling convention: the argument is the prototype
+            // itself.  It must not be queried like a configuration object
+            // (doing so used to be a fatal error on this path).
+            $lexer = $config;
+            $needs_tracking = false;
+            trigger_error(
+                "Passing a prototype to
+                HTMLPurifier_Lexer::create() is deprecated, please instead
+                use %Core.LexerImpl",
+                E_USER_WARNING
+            );
+        }
+        $inst = null;
+        if (is_object($lexer)) {
+            $inst = $lexer;
+        } else {
+            if (is_null($lexer)) {
+                // auto-detection algorithm: only DirectLex can track line
+                // numbers, so it wins whenever tracking is needed.
+                // Otherwise check for DOM support, because while it's part
+                // of the core, it can be disabled compile time. Also, the
+                // PECL domxml extension overrides the default DOM, and we
+                // do not support it.
+                if (!$needs_tracking &&
+                    class_exists('DOMDocument') &&
+                    method_exists('DOMDocument', 'loadHTML') &&
+                    !extension_loaded('domxml')
+                ) {
+                    $lexer = 'DOMLex';
+                } else {
+                    $lexer = 'DirectLex';
+                }
+            }
+            // instantiate recognized string names
+            switch ($lexer) {
+                case 'DOMLex':
+                    $inst = new HTMLPurifier_Lexer_DOMLex();
+                    break;
+                case 'DirectLex':
+                    $inst = new HTMLPurifier_Lexer_DirectLex();
+                    break;
+                case 'PH5P':
+                    $inst = new HTMLPurifier_Lexer_PH5P();
+                    break;
+                default:
+                    throw new HTMLPurifier_Exception(
+                        "Cannot instantiate unrecognized Lexer type " .
+                        htmlspecialchars($lexer)
+                    );
+            }
+        }
+        // defensive: every branch above either assigns $inst or throws
+        if (!$inst) {
+            throw new HTMLPurifier_Exception('No lexer was instantiated');
+        }
+        // once PHP DOM implements native line numbers, or we
+        // hack out something using XSLT, remove this stipulation
+        if ($needs_tracking && !$inst->tracksLineNumbers) {
+            throw new HTMLPurifier_Exception(
+                'Cannot use lexer that does not support line numbers with ' .
+                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
+            );
+        }
+        return $inst;
+    }
+    // -- CONVENIENCE MEMBERS ---------------------------------------------
+    /**
+     * Entity parser used by parseData() and normalize().  Declared
+     * explicitly so the constructor does not create it as a dynamic
+     * property, which PHP 8.2 and later report as deprecated.
+     * @type HTMLPurifier_EntityParser
+     */
+    protected $_entity_parser;
+    /**
+     * Creates the entity parser that the convenience methods rely on.
+     */
+    public function __construct()
+    {
+        $this->_entity_parser = new HTMLPurifier_EntityParser();
+    }
+    /**
+     * Most common entity to raw value conversion table for special entities.
+     * parseData() passes this table to strtr() as its fast path; the
+     * apostrophe appears in three numeric spellings (decimal, zero-padded
+     * decimal and hexadecimal).
+     * @type array
+     */
+    protected $_special_entity2str =
+        array(
+            '&quot;' => '"',
+            '&amp;' => '&',
+            '&lt;' => '<',
+            '&gt;' => '>',
+            '&#39;' => "'",
+            '&#039;' => "'",
+            '&#x27;' => "'"
+        );
+    /**
+     * Decodes the special entities found in a run of character data.
+     *
+     * The common spellings listed in $_special_entity2str are replaced with
+     * a single strtr() call.  Only when ampersands that may still start an
+     * entity remain afterwards is the entity parser asked to finish the job.
+     *
+     * @warning
+     * You should be able to treat the output of this function as
+     * completely parsed, but that's only because all other entities should
+     * have been handled previously in substituteNonSpecialEntities()
+     *
+     * @param string $string String character data to be parsed.
+     * @return string Parsed character data.
+     */
+    public function parseData($string)
+    {
+        // the last-character checks below need at least one character
+        if ($string === '') {
+            return '';
+        }
+        // an ampersand followed by a space, or one that ends the string,
+        // cannot possibly start an entity
+        $candidates = substr_count($string, '&') - substr_count($string, '& ');
+        if (substr($string, -1) === '&') {
+            $candidates--;
+        }
+        if ($candidates == 0) {
+            // nothing that could be an entity
+            return $string;
+        }
+        // each &amp; legitimately leaves one bare ampersand behind
+        $escaped = substr_count($string, '&amp;');
+        $decoded = strtr($string, $this->_special_entity2str);
+        // same count as above, this time on the decoded text
+        $remaining = substr_count($decoded, '&') - substr_count($decoded, '& ');
+        if (substr($decoded, -1) === '&') {
+            $remaining--;
+        }
+        if ($remaining > $escaped) {
+            // some uncommon entities are left over: use the callback
+            return $this->_entity_parser->substituteSpecialEntities($decoded);
+        }
+        return $decoded;
+    }
+    /**
+     * Lexes an HTML string into tokens.
+     *
+     * This supertype has no implementation of its own: calling it raises
+     * an E_USER_ERROR and returns nothing.
+     *
+     * @param string $string HTML to lex.
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return HTMLPurifier_Token[] array representation of HTML.
+     */
+    public function tokenizeHTML($string, $config, $context)
+    {
+        $message = 'Call to abstract class';
+        trigger_error($message, E_USER_ERROR);
+    }
+    /**
+     * Replaces every CDATA section with its contents, HTML-escaped, so that
+     * the text reads as ordinary character data.
+     * @param string $string HTML string to process.
+     * @return string HTML with CDATA sections escaped.
+     */
+    protected static function escapeCDATA($string)
+    {
+        // group 1 is the (non-empty) inside of the section; /s lets it span lines
+        $pattern = '/<!\[CDATA\[(.+?)\]\]>/s';
+        $escaper = array('HTMLPurifier_Lexer', 'CDATACallback');
+        return preg_replace_callback($pattern, $escaper, $string);
+    }
+    /**
+     * Handles the convoluted comment-wrapped CDATA idiom used inside
+     * <script>: the wrapper is dropped and its contents are HTML-escaped.
+     * @param string $string HTML string to process.
+     * @return string HTML with CDATA sections escaped.
+     */
+    protected static function escapeCommentedCDATA($string)
+    {
+        // group 1 is whatever sits between the opening and closing wrappers
+        $pattern = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
+        $escaper = array('HTMLPurifier_Lexer', 'CDATACallback');
+        return preg_replace_callback($pattern, $escaper, $string);
+    }
+    /**
+     * Strips Internet Explorer conditional comments, including everything
+     * between the opening "[if ...]" and the closing "[endif]" markers.
+     * @param string $string HTML string to process.
+     * @return string HTML with conditional comments removed.
+     */
+    protected static function removeIEConditional($string)
+    {
+        // probably should generalize for all strings
+        $conditional = '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si';
+        return preg_replace($conditional, '', $string);
+    }
+    /**
+     * Callback used by escapeCDATA() and escapeCommentedCDATA(): escapes
+     * the captured inside of a CDATA section.
+     *
+     * @warning Intended only as a preg_replace_callback() target; calling
+     *          it directly is not recommended.
+     * @param array $matches PCRE matches array, with index 0 the entire match
+     *                  and 1 the inside of the CDATA section.
+     * @return string Escaped internals of the CDATA section.
+     */
+    protected static function CDATACallback($matches)
+    {
+        $inside = $matches[1];
+        // the charset is passed explicitly so the result does not depend on
+        // the runtime's default encoding
+        return htmlspecialchars($inside, ENT_COMPAT, 'UTF-8');
+    }
+    /**
+     * Prepares raw HTML for lexing.  Depending on configuration this
+     * normalizes newlines, escapes CDATA sections, removes IE conditional
+     * comments, extracts the body of a full document, expands non-special
+     * entities, cleans the result into well-formed UTF-8 and strips
+     * processing instructions, in that order.
+     * @param string $html HTML.
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return string
+     * @todo Consider making protected
+     */
+    public function normalize($html, $config, $context)
+    {
+        // collapse CRLF first, then any remaining bare CR, into \n
+        if ($config->get('Core.NormalizeNewlines')) {
+            $html = str_replace(array("\r\n", "\r"), "\n", $html);
+        }
+        // the convoluted, comment-wrapped CDATA form is only unwrapped
+        // when HTML.Trusted is set
+        if ($config->get('HTML.Trusted')) {
+            $html = $this->escapeCommentedCDATA($html);
+        }
+        // plain CDATA is escaped before IE conditional comments are dropped
+        $html = $this->removeIEConditional($this->escapeCDATA($html));
+        // extract body from document if applicable
+        if ($config->get('Core.ConvertDocumentToFragment')) {
+            $collector = false;
+            if ($config->get('Core.CollectErrors')) {
+                $collector =& $context->get('ErrorCollector');
+            }
+            $body = $this->extractBody($html);
+            // only report when the extraction actually changed something
+            if ($collector && $body != $html) {
+                $collector->send(E_WARNING, 'Lexer: Extracted body');
+            }
+            $html = $body;
+        }
+        // expand entities that aren't the big five, then clean into a
+        // well-formed UTF-8 string for an SGML context: cleaning has to
+        // come after entity expansion because the entities sometimes
+        // represent non-SGML characters
+        $html = HTMLPurifier_Encoder::cleanUTF8(
+            $this->_entity_parser->substituteNonSpecialEntities($html)
+        );
+        // if processing instructions are to be removed, remove them now
+        if ($config->get('Core.RemoveProcessingInstructions')) {
+            $html = preg_replace('#<\?.+?\?>#s', '', $html);
+        }
+        return $html;
+    }
+    /**
+     * Takes a string of HTML (fragment or document) and returns the content
+     * found between the first <body ...> tag and the last </body> tag.
+     * Input without such a pair is returned unchanged.
+     * @param string $html HTML fragment or document.
+     * @return string Body contents, or the input itself.
+     * @todo Consider making protected
+     */
+    public function extractBody($html)
+    {
+        $captured = array();
+        // case-insensitive; /s lets the greedy group span line breaks
+        if (preg_match('!<body[^>]*>(.*)</body>!is', $html, $captured)) {
+            return $captured[1];
+        }
+        return $html;
+    }
+}
+// vim: et sw=4 sts=4
author	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-02-21 15:57:10 +0100
committer	Nicolas Lœuillet <nicolas@loeuillet.org>	2014-02-21 15:57:10 +0100
commit	99679d06884120c57f43b44e55e03595f1f87bed (patch)
tree	a3f2a1aa1afdaeca1386d0c6e8a75344fd2241fb /inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
parent	655214ab30ee84884dc408488b85586f36263fcb (diff)
parent	d3b47e94705e17b3ba3529cbb1dc6efe69c5d2b7 (diff)
download	wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.gz wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.zst wallabag-99679d06884120c57f43b44e55e03595f1f87bed.zip

diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
new file mode 100644
index 00000000..2a9a9d62
--- /dev/null
+++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
@@ -0,0 +1,357 @@
	1	<?php
	2
	3	/**
	4	* Forgivingly lexes HTML (SGML-style) markup into tokens.
	5	*
	6	* A lexer parses a string of SGML-style markup and converts them into
	7	* corresponding tokens. It doesn't check for well-formedness, although its
	8	* internal mechanism may make this automatic (such as the case of
	9	* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
	10	* from.
	11	*
	12	* A lexer is HTML-oriented: it might work with XML, but it's not
	13	* recommended, as we adhere to a subset of the specification for optimization
	14	* reasons. This might change in the future. Also, most tokenizers are not
	15	* expected to handle DTDs or PIs.
	16	*
	17	* This class should not be directly instantiated, but you may use create() to
	18	* retrieve a default copy of the lexer. Being a supertype, this class
	19	* does not actually define any implementation, but offers commonly used
	20	* convenience functions for subclasses.
	21	*
	22	* @note The unit tests will instantiate this class for testing purposes, as
	23	* many of the utility functions require a class to be instantiated.
	24	* This means that, even though this class is not runnable, it will
	25	* not be declared abstract.
	26	*
	27	* @par
	28	*
	29	* @note
	30	* We use tokens rather than create a DOM representation because DOM would:
	31	*
	32	* @par
	33	* -# Require more processing and memory to create,
	34	* -# Is not streamable, and
	35	* -# Has the entire document structure (html and body not needed).
	36	*
	37	* @par
	38	* However, DOM is helpful in that it makes it easy to move around nodes
	39	* without a lot of lookaheads to see when a tag is closed. This is a
	40	* limitation of the token system and some workarounds would be nice.
	41	*/
	42	class HTMLPurifier_Lexer
	43	{
	44
	45	/**
	46	* Whether or not this lexer implements line-number/column-number tracking.
	47	* If it does, set to true.
	48	*/
	49	public $tracksLineNumbers = false;
	50
	51	// -- STATIC ----------------------------------------------------------
	52
	53	/**
	54	* Retrieves or sets the default Lexer as a Prototype Factory.
	55	*
	56	* By default HTMLPurifier_Lexer_DOMLex will be returned. There are
	57	* a few exceptions involving special features that only DirectLex
	58	* implements.
	59	*
	60	* @note The behavior of this class has changed, rather than accepting
	61	* a prototype object, it now accepts a configuration object.
	62	* To specify your own prototype, set %Core.LexerImpl to it.
	63	* This change in behavior de-singletonizes the lexer object.
	64	*
	65	* @param HTMLPurifier_Config $config
	66	* @return HTMLPurifier_Lexer
	67	* @throws HTMLPurifier_Exception
	68	*/
	69	public static function create($config)
	70	{
	71	if (!($config instanceof HTMLPurifier_Config)) {
	72	$lexer = $config;
	73	trigger_error(
	74	"Passing a prototype to
	75	HTMLPurifier_Lexer::create() is deprecated, please instead
	76	use %Core.LexerImpl",
	77	E_USER_WARNING
	78	);
	79	} else {
	80	$lexer = $config->get('Core.LexerImpl');
	81	}
	82
	83	$needs_tracking =
	84	$config->get('Core.MaintainLineNumbers') ||
	85	$config->get('Core.CollectErrors');
	86
	87	$inst = null;
	88	if (is_object($lexer)) {
	89	$inst = $lexer;
	90	} else {
	91	if (is_null($lexer)) {
	92	do {
	93	// auto-detection algorithm
	94	if ($needs_tracking) {
	95	$lexer = 'DirectLex';
	96	break;
	97	}
	98
	99	if (class_exists('DOMDocument') &&
	100	method_exists('DOMDocument', 'loadHTML') &&
	101	!extension_loaded('domxml')
	102	) {
	103	// check for DOM support, because while it's part of the
	104	// core, it can be disabled compile time. Also, the PECL
	105	// domxml extension overrides the default DOM, and is evil
	106	// and nasty and we shan't bother to support it
	107	$lexer = 'DOMLex';
	108	} else {
	109	$lexer = 'DirectLex';
	110	}
	111	} while (0);
	112	} // do..while so we can break
	113
	114	// instantiate recognized string names
	115	switch ($lexer) {
	116	case 'DOMLex':
	117	$inst = new HTMLPurifier_Lexer_DOMLex();
	118	break;
	119	case 'DirectLex':
	120	$inst = new HTMLPurifier_Lexer_DirectLex();
	121	break;
	122	case 'PH5P':
	123	$inst = new HTMLPurifier_Lexer_PH5P();
	124	break;
	125	default:
	126	throw new HTMLPurifier_Exception(
	127	"Cannot instantiate unrecognized Lexer type " .
	128	htmlspecialchars($lexer)
	129	);
	130	}
	131	}
	132
	133	if (!$inst) {
	134	throw new HTMLPurifier_Exception('No lexer was instantiated');
	135	}
	136
	137	// once PHP DOM implements native line numbers, or we
	138	// hack out something using XSLT, remove this stipulation
	139	if ($needs_tracking && !$inst->tracksLineNumbers) {
	140	throw new HTMLPurifier_Exception(
	141	'Cannot use lexer that does not support line numbers with ' .
	142	'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
	143	);
	144	}
	145
	146	return $inst;
	147
	148	}
	149
	150	// -- CONVENIENCE MEMBERS ---------------------------------------------
	151
	152	public function __construct()
	153	{
	154	$this->_entity_parser = new HTMLPurifier_EntityParser();
	155	}
	156
	157	/**
	158	* Most common entity to raw value conversion table for special entities.
	159	* @type array
	160	*/
	161	protected $_special_entity2str =
	162	array(
	163	'&quot;' => '"',
	164	'&amp;' => '&',
	165	'&lt;' => '<',
	166	'&gt;' => '>',
	167	'&#39;' => "'",
	168	'&#039;' => "'",
	169	'&#x27;' => "'"
	170	);
	171
	172	/**
	173	* Parses special entities into the proper characters.
	174	*
	175	* This string will translate escaped versions of the special characters
	176	* into the correct ones.
	177	*
	178	* @warning
	179	* You should be able to treat the output of this function as
	180	* completely parsed, but that's only because all other entities should
	181	* have been handled previously in substituteNonSpecialEntities()
	182	*
	183	* @param string $string String character data to be parsed.
	184	* @return string Parsed character data.
	185	*/
	186	public function parseData($string)
	187	{
	188	// following functions require at least one character
	189	if ($string === '') {
	190	return '';
	191	}
	192
	193	// subtracts amps that cannot possibly be escaped
	194	$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
	195	($string[strlen($string) - 1] === '&' ? 1 : 0);
	196
	197	if (!$num_amp) {
	198	return $string;
	199	} // abort if no entities
	200	$num_esc_amp = substr_count($string, '&amp;');
	201	$string = strtr($string, $this->_special_entity2str);
	202
	203	// code duplication for sake of optimization, see above
	204	$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
	205	($string[strlen($string) - 1] === '&' ? 1 : 0);
	206
	207	if ($num_amp_2 <= $num_esc_amp) {
	208	return $string;
	209	}
	210
	211	// hmm... now we have some uncommon entities. Use the callback.
	212	$string = $this->_entity_parser->substituteSpecialEntities($string);
	213	return $string;
	214	}
	215
	216	/**
	217	* Lexes an HTML string into tokens.
	218	* @param $string String HTML.
	219	* @param HTMLPurifier_Config $config
	220	* @param HTMLPurifier_Context $context
	221	* @return HTMLPurifier_Token[] array representation of HTML.
	222	*/
	223	public function tokenizeHTML($string, $config, $context)
	224	{
	225	trigger_error('Call to abstract class', E_USER_ERROR);
	226	}
	227
	228	/**
	229	* Translates CDATA sections into regular sections (through escaping).
	230	* @param string $string HTML string to process.
	231	* @return string HTML with CDATA sections escaped.
	232	*/
	233	protected static function escapeCDATA($string)
	234	{
	235	return preg_replace_callback(
	236	'/<!\[CDATA\[(.+?)\]\]>/s',
	237	array('HTMLPurifier_Lexer', 'CDATACallback'),
	238	$string
	239	);
	240	}
	241
	242	/**
	243	* Special CDATA case that is especially convoluted for <script>
	244	* @param string $string HTML string to process.
	245	* @return string HTML with CDATA sections escaped.
	246	*/
	247	protected static function escapeCommentedCDATA($string)
	248	{
	249	return preg_replace_callback(
	250	'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
	251	array('HTMLPurifier_Lexer', 'CDATACallback'),
	252	$string
	253	);
	254	}
	255
	256	/**
	257	* Special Internet Explorer conditional comments should be removed.
	258	* @param string $string HTML string to process.
	259	* @return string HTML with conditional comments removed.
	260	*/
	261	protected static function removeIEConditional($string)
	262	{
	263	return preg_replace(
	264	'#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
	265	'',
	266	$string
	267	);
	268	}
	269
	270	/**
	271	* Callback function for escapeCDATA() that does the work.
	272	*
	273	* @warning Though this is public in order to let the callback happen,
	274	* calling it directly is not recommended.
	275	* @param array $matches PCRE matches array, with index 0 the entire match
	276	* and 1 the inside of the CDATA section.
	277	* @return string Escaped internals of the CDATA section.
	278	*/
	279	protected static function CDATACallback($matches)
	280	{
	281	// not exactly sure why the character set is needed, but whatever
	282	return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
	283	}
	284
	285	/**
	286	* Takes a piece of HTML and normalizes it by converting entities, fixing
	287	* encoding, extracting bits, and other good stuff.
	288	* @param string $html HTML.
	289	* @param HTMLPurifier_Config $config
	290	* @param HTMLPurifier_Context $context
	291	* @return string
	292	* @todo Consider making protected
	293	*/
	294	public function normalize($html, $config, $context)
	295	{
	296	// normalize newlines to \n
	297	if ($config->get('Core.NormalizeNewlines')) {
	298	$html = str_replace("\r\n", "\n", $html);
	299	$html = str_replace("\r", "\n", $html);
	300	}
	301
	302	if ($config->get('HTML.Trusted')) {
	303	// escape convoluted CDATA
	304	$html = $this->escapeCommentedCDATA($html);
	305	}
	306
	307	// escape CDATA
	308	$html = $this->escapeCDATA($html);
	309
	310	$html = $this->removeIEConditional($html);
	311
	312	// extract body from document if applicable
	313	if ($config->get('Core.ConvertDocumentToFragment')) {
	314	$e = false;
	315	if ($config->get('Core.CollectErrors')) {
	316	$e =& $context->get('ErrorCollector');
	317	}
	318	$new_html = $this->extractBody($html);
	319	if ($e && $new_html != $html) {
	320	$e->send(E_WARNING, 'Lexer: Extracted body');
	321	}
	322	$html = $new_html;
	323	}
	324
	325	// expand entities that aren't the big five
	326	$html = $this->_entity_parser->substituteNonSpecialEntities($html);
	327
	328	// clean into wellformed UTF-8 string for an SGML context: this has
	329	// to be done after entity expansion because the entities sometimes
	330	// represent non-SGML characters (horror, horror!)
	331	$html = HTMLPurifier_Encoder::cleanUTF8($html);
	332
	333	// if processing instructions are to removed, remove them now
	334	if ($config->get('Core.RemoveProcessingInstructions')) {
	335	$html = preg_replace('#<\?.+?\?>#s', '', $html);
	336	}
	337
	338	return $html;
	339	}
	340
	341	/**
	342	* Takes a string of HTML (fragment or document) and returns the content
	343	* @todo Consider making protected
	344	*/
	345	public function extractBody($html)
	346	{
	347	$matches = array();
	348	$result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
	349	if ($result) {
	350	return $matches[1];
	351	} else {
	352	return $html;
	353	}
	354	}
	355	}
	356
	357	// vim: et sw=4 sts=4