[github/wallabag/wallabag.git] / inc / 3rdparty / htmlpurifier / HTMLPurifier / Lexer.php

<?php\r
\r
/**\r
 * Forgivingly lexes HTML (SGML-style) markup into tokens.\r
 *\r
 * A lexer parses a string of SGML-style markup and converts them into\r
 * corresponding tokens.  It doesn't check for well-formedness, although its\r
 * internal mechanism may make this automatic (such as the case of\r
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose\r
 * from.\r
 *\r
 * A lexer is HTML-oriented: it might work with XML, but it's not\r
 * recommended, as we adhere to a subset of the specification for optimization\r
 * reasons. This might change in the future. Also, most tokenizers are not\r
 * expected to handle DTDs or PIs.\r
 *\r
 * This class should not be directly instantiated, but you may use create() to\r
 * retrieve a default copy of the lexer.  Being a supertype, this class\r
 * does not actually define any implementation, but offers commonly used\r
 * convenience functions for subclasses.\r
 *\r
 * @note The unit tests will instantiate this class for testing purposes, as\r
 *       many of the utility functions require a class to be instantiated.\r
 *       This means that, even though this class is not runnable, it will\r
 *       not be declared abstract.\r
 *\r
 * @par\r
 *\r
 * @note\r
 * We use tokens rather than create a DOM representation because DOM would:\r
 *\r
 * @par\r
 *  -# Require more processing and memory to create,\r
 *  -# Is not streamable, and\r
 *  -# Has the entire document structure (html and body not needed).\r
 *\r
 * @par\r
 * However, DOM is helpful in that it makes it easy to move around nodes\r
 * without a lot of lookaheads to see when a tag is closed. This is a\r
 * limitation of the token system and some workarounds would be nice.\r
 */\r
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor.
     *
     * Declared explicitly so that the constructor does not create a dynamic
     * property (deprecated as of PHP 8.2). It is public only because the
     * previously undeclared property was implicitly public; treat it as
     * internal.
     *
     * @type HTMLPurifier_EntityParser
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @note Passing a prototype instead of a configuration object is still
     *       accepted but raises an E_USER_WARNING. Since a prototype carries
     *       no configuration, line-number tracking is never required for it.
     *
     * @param HTMLPurifier_Config $config Configuration object (or, deprecated,
     *        a lexer prototype / lexer name).
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, or
     *         if the chosen lexer cannot track line numbers although the
     *         configuration requires it.
     */
    public static function create($config)
    {
        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // a prototype is not a configuration object, so there are no
            // directives to consult; calling get() on it would be fatal
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    // only DirectLex is selected when tracking is needed
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser used by the convenience methods below.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param string $string String character data to be parsed.
     * @return string Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped: those followed by
        // a space and one sitting at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities

        // every '&amp;' leaves one literal '&' behind after translation, so
        // remember how many of those to expect
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // all remaining ampersands are accounted for by unescaped '&amp;'
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @note This method is protected; it is passed as a callback only from
     *       escapeCDATA() and escapeCommentedCDATA() in this class.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     *
     * The steps run in a fixed order: newline normalization, CDATA escaping,
     * IE conditional comment removal, body extraction, non-special entity
     * expansion, UTF-8 cleaning and finally processing instruction removal.
     * Most steps are gated by a configuration directive.
     *
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // only report when extraction actually changed the markup
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, or the input unchanged when no complete
     * <body>...</body> pair is found.
     *
     * The match is greedy, so the content runs up to the last </body>.
     *
     * @param string $html HTML fragment or document.
     * @return string Contents of the body element, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        }
        return $html;
    }
}
\r
// vim: et sw=4 sts=4\r
Commit	Line	Data
d4949327 NL	1	<?php\r
	2	\r
	3	/**\r
	4	* Forgivingly lexes HTML (SGML-style) markup into tokens.\r
	5	*\r
	6	* A lexer parses a string of SGML-style markup and converts them into\r
	7	* corresponding tokens. It doesn't check for well-formedness, although its\r
	8	* internal mechanism may make this automatic (such as the case of\r
	9	* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose\r
	10	* from.\r
	11	*\r
	12	* A lexer is HTML-oriented: it might work with XML, but it's not\r
	13	* recommended, as we adhere to a subset of the specification for optimization\r
	14	* reasons. This might change in the future. Also, most tokenizers are not\r
	15	* expected to handle DTDs or PIs.\r
	16	*\r
	17	* This class should not be directly instantiated, but you may use create() to\r
	18	* retrieve a default copy of the lexer. Being a supertype, this class\r
	19	* does not actually define any implementation, but offers commonly used\r
	20	* convenience functions for subclasses.\r
	21	*\r
	22	* @note The unit tests will instantiate this class for testing purposes, as\r
	23	* many of the utility functions require a class to be instantiated.\r
	24	* This means that, even though this class is not runnable, it will\r
	25	* not be declared abstract.\r
	26	*\r
	27	* @par\r
	28	*\r
	29	* @note\r
	30	* We use tokens rather than create a DOM representation because DOM would:\r
	31	*\r
	32	* @par\r
	33	* -# Require more processing and memory to create,\r
	34	* -# Is not streamable, and\r
	35	* -# Has the entire document structure (html and body not needed).\r
	36	*\r
	37	* @par\r
	38	* However, DOM is helpful in that it makes it easy to move around nodes\r
	39	* without a lot of lookaheads to see when a tag is closed. This is a\r
	40	* limitation of the token system and some workarounds would be nice.\r
	41	*/\r
	42	class HTMLPurifier_Lexer\r
	43	{\r
	44	\r
	45	/**\r
	46	* Whether or not this lexer implements line-number/column-number tracking.\r
	47	* If it does, set to true.\r
	48	*/\r
	49	public $tracksLineNumbers = false;\r
	50	\r
	51	// -- STATIC ----------------------------------------------------------\r
	52	\r
	53	/**\r
	54	* Retrieves or sets the default Lexer as a Prototype Factory.\r
	55	*\r
	56	* By default HTMLPurifier_Lexer_DOMLex will be returned. There are\r
	57	* a few exceptions involving special features that only DirectLex\r
	58	* implements.\r
	59	*\r
	60	* @note The behavior of this class has changed, rather than accepting\r
	61	* a prototype object, it now accepts a configuration object.\r
	62	* To specify your own prototype, set %Core.LexerImpl to it.\r
	63	* This change in behavior de-singletonizes the lexer object.\r
	64	*\r
65	* @param HTMLPurifier_Config $config\r
66	* @return HTMLPurifier_Lexer\r
67	* @throws HTMLPurifier_Exception\r
68	*/\r
69	public static function create($config)\r
70	{\r
71	if (!($config instanceof HTMLPurifier_Config)) {\r
72	$lexer = $config;\r
73	trigger_error(\r
74	"Passing a prototype to\r
75	HTMLPurifier_Lexer::create() is deprecated, please instead\r
76	use %Core.LexerImpl",\r
77	E_USER_WARNING\r
78	);\r
79	} else {\r
80	$lexer = $config->get('Core.LexerImpl');\r
81	}\r
82	\r
83	$needs_tracking =\r
84	$config->get('Core.MaintainLineNumbers') ||\r
85	$config->get('Core.CollectErrors');\r
86	\r
87	$inst = null;\r
88	if (is_object($lexer)) {\r
89	$inst = $lexer;\r
90	} else {\r
91	if (is_null($lexer)) {\r
92	do {\r
93	// auto-detection algorithm\r
94	if ($needs_tracking) {\r
95	$lexer = 'DirectLex';\r
96	break;\r
97	}\r
98	\r
99	if (class_exists('DOMDocument') &&\r
100	method_exists('DOMDocument', 'loadHTML') &&\r
101	!extension_loaded('domxml')\r
102	) {\r
103	// check for DOM support, because while it's part of the\r
104	// core, it can be disabled compile time. Also, the PECL\r
105	// domxml extension overrides the default DOM, and is evil\r
106	// and nasty and we shan't bother to support it\r
107	$lexer = 'DOMLex';\r
108	} else {\r
109	$lexer = 'DirectLex';\r
110	}\r
111	} while (0);\r
112	} // do..while so we can break\r
113	\r
114	// instantiate recognized string names\r
115	switch ($lexer) {\r
116	case 'DOMLex':\r
117	$inst = new HTMLPurifier_Lexer_DOMLex();\r
118	break;\r
119	case 'DirectLex':\r
120	$inst = new HTMLPurifier_Lexer_DirectLex();\r
121	break;\r
122	case 'PH5P':\r
123	$inst = new HTMLPurifier_Lexer_PH5P();\r
124	break;\r
125	default:\r
126	throw new HTMLPurifier_Exception(\r
127	"Cannot instantiate unrecognized Lexer type " .\r
128	htmlspecialchars($lexer)\r
129	);\r
130	}\r
131	}\r
132	\r
133	if (!$inst) {\r
134	throw new HTMLPurifier_Exception('No lexer was instantiated');\r
135	}\r
136	\r
137	// once PHP DOM implements native line numbers, or we\r
138	// hack out something using XSLT, remove this stipulation\r
139	if ($needs_tracking && !$inst->tracksLineNumbers) {\r
140	throw new HTMLPurifier_Exception(\r
141	'Cannot use lexer that does not support line numbers with ' .\r
142	'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'\r
143	);\r
144	}\r
145	\r
146	return $inst;\r
147	\r
148	}\r
149	\r
150	// -- CONVENIENCE MEMBERS ---------------------------------------------\r
151	\r
152	public function __construct()\r
153	{\r
154	$this->_entity_parser = new HTMLPurifier_EntityParser();\r
155	}\r
156	\r
157	/**\r
158	* Most common entity to raw value conversion table for special entities.\r
159	* @type array\r
160	*/\r
161	protected $_special_entity2str =\r
162	array(\r
163	'&quot;' => '"',\r
164	'&amp;' => '&',\r
165	'&lt;' => '<',\r
166	'&gt;' => '>',\r
167	'&#39;' => "'",\r
168	'&#039;' => "'",\r
169	'&#x27;' => "'"\r
170	);\r
171	\r
172	/**\r
173	* Parses special entities into the proper characters.\r
174	*\r
175	* This string will translate escaped versions of the special characters\r
176	* into the correct ones.\r
177	*\r
178	* @warning\r
179	* You should be able to treat the output of this function as\r
180	* completely parsed, but that's only because all other entities should\r
181	* have been handled previously in substituteNonSpecialEntities()\r
182	*\r
183	* @param string $string String character data to be parsed.\r
184	* @return string Parsed character data.\r
185	*/\r
186	public function parseData($string)\r
187	{\r
188	// following functions require at least one character\r
189	if ($string === '') {\r
190	return '';\r
191	}\r
192	\r
193	// subtracts amps that cannot possibly be escaped\r
194	$num_amp = substr_count($string, '&') - substr_count($string, '& ') -\r
195	($string[strlen($string) - 1] === '&' ? 1 : 0);\r
196	\r
197	if (!$num_amp) {\r
198	return $string;\r
199	} // abort if no entities\r
200	$num_esc_amp = substr_count($string, '&amp;');\r
201	$string = strtr($string, $this->_special_entity2str);\r
202	\r
203	// code duplication for sake of optimization, see above\r
204	$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -\r
205	($string[strlen($string) - 1] === '&' ? 1 : 0);\r
206	\r
207	if ($num_amp_2 <= $num_esc_amp) {\r
208	return $string;\r
209	}\r
210	\r
211	// hmm... now we have some uncommon entities. Use the callback.\r
212	$string = $this->_entity_parser->substituteSpecialEntities($string);\r
213	return $string;\r
214	}\r
215	\r
216	/**\r
217	* Lexes an HTML string into tokens.\r
218	* @param $string String HTML.\r
219	* @param HTMLPurifier_Config $config\r
220	* @param HTMLPurifier_Context $context\r
221	* @return HTMLPurifier_Token[] array representation of HTML.\r
222	*/\r
223	public function tokenizeHTML($string, $config, $context)\r
224	{\r
225	trigger_error('Call to abstract class', E_USER_ERROR);\r
226	}\r
227	\r
228	/**\r
229	* Translates CDATA sections into regular sections (through escaping).\r
230	* @param string $string HTML string to process.\r
231	* @return string HTML with CDATA sections escaped.\r
232	*/\r
233	protected static function escapeCDATA($string)\r
234	{\r
235	return preg_replace_callback(\r
236	'/<!\[CDATA\[(.+?)\]\]>/s',\r
237	array('HTMLPurifier_Lexer', 'CDATACallback'),\r
238	$string\r
239	);\r
240	}\r
241	\r
242	/**\r
243	* Special CDATA case that is especially convoluted for <script>\r
244	* @param string $string HTML string to process.\r
245	* @return string HTML with CDATA sections escaped.\r
246	*/\r
247	protected static function escapeCommentedCDATA($string)\r
248	{\r
249	return preg_replace_callback(\r
250	'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',\r
251	array('HTMLPurifier_Lexer', 'CDATACallback'),\r
252	$string\r
253	);\r
254	}\r
255	\r
256	/**\r
257	* Special Internet Explorer conditional comments should be removed.\r
258	* @param string $string HTML string to process.\r
259	* @return string HTML with conditional comments removed.\r
260	*/\r
261	protected static function removeIEConditional($string)\r
262	{\r
263	return preg_replace(\r
264	'#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings\r
265	'',\r
266	$string\r
267	);\r
268	}\r
269	\r
270	/**\r
271	* Callback function for escapeCDATA() that does the work.\r
272	*\r
273	* @warning Though this is public in order to let the callback happen,\r
274	* calling it directly is not recommended.\r
275	* @param array $matches PCRE matches array, with index 0 the entire match\r
276	* and 1 the inside of the CDATA section.\r
277	* @return string Escaped internals of the CDATA section.\r
278	*/\r
279	protected static function CDATACallback($matches)\r
280	{\r
281	// not exactly sure why the character set is needed, but whatever\r
282	return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');\r
283	}\r
284	\r
285	/**\r
286	* Takes a piece of HTML and normalizes it by converting entities, fixing\r
287	* encoding, extracting bits, and other good stuff.\r
288	* @param string $html HTML.\r
289	* @param HTMLPurifier_Config $config\r
290	* @param HTMLPurifier_Context $context\r
291	* @return string\r
292	* @todo Consider making protected\r
293	*/\r
294	public function normalize($html, $config, $context)\r
295	{\r
296	// normalize newlines to \n\r
297	if ($config->get('Core.NormalizeNewlines')) {\r
298	$html = str_replace("\r\n", "\n", $html);\r
299	$html = str_replace("\r", "\n", $html);\r
300	}\r
301	\r
302	if ($config->get('HTML.Trusted')) {\r
303	// escape convoluted CDATA\r
304	$html = $this->escapeCommentedCDATA($html);\r
305	}\r
306	\r
307	// escape CDATA\r
308	$html = $this->escapeCDATA($html);\r
309	\r
310	$html = $this->removeIEConditional($html);\r
311	\r
312	// extract body from document if applicable\r
313	if ($config->get('Core.ConvertDocumentToFragment')) {\r
314	$e = false;\r
315	if ($config->get('Core.CollectErrors')) {\r
316	$e =& $context->get('ErrorCollector');\r
317	}\r
318	$new_html = $this->extractBody($html);\r
319	if ($e && $new_html != $html) {\r
320	$e->send(E_WARNING, 'Lexer: Extracted body');\r
321	}\r
322	$html = $new_html;\r
323	}\r
324	\r
325	// expand entities that aren't the big five\r
326	$html = $this->_entity_parser->substituteNonSpecialEntities($html);\r
327	\r
328	// clean into wellformed UTF-8 string for an SGML context: this has\r
329	// to be done after entity expansion because the entities sometimes\r
330	// represent non-SGML characters (horror, horror!)\r
331	$html = HTMLPurifier_Encoder::cleanUTF8($html);\r
332	\r
333	// if processing instructions are to removed, remove them now\r
334	if ($config->get('Core.RemoveProcessingInstructions')) {\r
335	$html = preg_replace('#<\?.+?\?>#s', '', $html);\r
336	}\r
337	\r
338	return $html;\r
339	}\r
340	\r
341	/**\r
342	* Takes a string of HTML (fragment or document) and returns the content\r
343	* @todo Consider making protected\r
344	*/\r
345	public function extractBody($html)\r
346	{\r
347	$matches = array();\r
348	$result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);\r
349	if ($result) {\r
350	return $matches[1];\r
351	} else {\r
352	return $html;\r
353	}\r
354	}\r
355	}\r
356	\r
357	// vim: et sw=4 sts=4\r