]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php
remove autoload section in composer.json
[github/wallabag/wallabag.git] / inc / 3rdparty / htmlpurifier / HTMLPurifier / Lexer.php
CommitLineData
d4949327
NL
1<?php\r
2\r
3/**\r
4 * Forgivingly lexes HTML (SGML-style) markup into tokens.\r
5 *\r
6 * A lexer parses a string of SGML-style markup and converts it into\r
7 * corresponding tokens. It doesn't check for well-formedness, although its\r
8 * internal mechanism may make this automatic (such as the case of\r
9 * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose\r
10 * from.\r
11 *\r
12 * A lexer is HTML-oriented: it might work with XML, but it's not\r
13 * recommended, as we adhere to a subset of the specification for optimization\r
14 * reasons. This might change in the future. Also, most tokenizers are not\r
15 * expected to handle DTDs or PIs.\r
16 *\r
17 * This class should not be directly instantiated, but you may use create() to\r
18 * retrieve a default copy of the lexer. Being a supertype, this class\r
19 * does not actually define any implementation, but offers commonly used\r
20 * convenience functions for subclasses.\r
21 *\r
22 * @note The unit tests will instantiate this class for testing purposes, as\r
23 * many of the utility functions require a class to be instantiated.\r
24 * This means that, even though this class is not runnable, it will\r
25 * not be declared abstract.\r
26 *\r
27 * @par\r
28 *\r
29 * @note\r
30 * We use tokens rather than create a DOM representation because DOM would:\r
31 *\r
32 * @par\r
33 * -# Require more processing and memory to create,\r
34 * -# Not be streamable, and\r
35 * -# Carry the entire document structure (html and body not needed).\r
36 *\r
37 * @par\r
38 * However, DOM is helpful in that it makes it easy to move around nodes\r
39 * without a lot of lookaheads to see when a tag is closed. This is a\r
40 * limitation of the token system and some workarounds would be nice.\r
41 */\r
42class HTMLPurifier_Lexer\r
43{\r
44\r
/**
 * Advertises whether this lexer records line and column numbers on the
 * tokens it emits. create() reads this flag and refuses a lexer that
 * leaves it false when line tracking is required by the configuration.
 * Subclasses that track positions set it to true.
 * @type bool
 */
public $tracksLineNumbers = false;
50\r
51 // -- STATIC ----------------------------------------------------------\r
52\r
/**
 * Retrieves the Lexer to use for a configuration (Prototype Factory).
 *
 * When %Core.LexerImpl is unset the implementation is auto-detected:
 * DirectLex if line numbers must be tracked, otherwise DOMLex when the
 * DOM extension is usable, otherwise DirectLex.
 *
 * @note The behavior of this method has changed: rather than accepting
 *       a prototype object, it now accepts a configuration object.
 *       To specify your own prototype, set %Core.LexerImpl to it.
 *       This change in behavior de-singletonizes the lexer object.
 *       Passing a prototype is still tolerated but raises an
 *       E_USER_WARNING; since no configuration is available in that
 *       case, line-number tracking is treated as not requested.
 *
 * @param HTMLPurifier_Config $config
 * @return HTMLPurifier_Lexer
 * @throws HTMLPurifier_Exception If the lexer name is unrecognized, no
 *         lexer could be instantiated, or the chosen lexer cannot track
 *         line numbers although the configuration requires it.
 */
public static function create($config)
{
    if ($config instanceof HTMLPurifier_Config) {
        $lexer = $config->get('Core.LexerImpl');
        $needs_tracking =
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors');
    } else {
        // Deprecated calling convention: the argument is the lexer
        // prototype itself, so there is no configuration to query.
        // Calling ->get() on it (as the code used to) is a fatal error.
        $lexer = $config;
        $needs_tracking = false;
        trigger_error(
            "Passing a prototype to
            HTMLPurifier_Lexer::create() is deprecated, please instead
            use %Core.LexerImpl",
            E_USER_WARNING
        );
    }

    $inst = null;
    if (is_object($lexer)) {
        $inst = $lexer;
    } else {
        if (is_null($lexer)) {
            // auto-detection algorithm
            if ($needs_tracking) {
                $lexer = 'DirectLex';
            } elseif (class_exists('DOMDocument') &&
                method_exists('DOMDocument', 'loadHTML') &&
                !extension_loaded('domxml')
            ) {
                // check for DOM support, because while it's part of the
                // core, it can be disabled compile time. Also, the PECL
                // domxml extension overrides the default DOM, and is evil
                // and nasty and we shan't bother to support it
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                $inst = new HTMLPurifier_Lexer_DOMLex();
                break;
            case 'DirectLex':
                $inst = new HTMLPurifier_Lexer_DirectLex();
                break;
            case 'PH5P':
                $inst = new HTMLPurifier_Lexer_PH5P();
                break;
            default:
                throw new HTMLPurifier_Exception(
                    "Cannot instantiate unrecognized Lexer type " .
                    htmlspecialchars($lexer)
                );
        }
    }

    if (!$inst) {
        throw new HTMLPurifier_Exception('No lexer was instantiated');
    }

    // once PHP DOM implements native line numbers, or we
    // hack out something using XSLT, remove this stipulation
    if ($needs_tracking && !$inst->tracksLineNumbers) {
        throw new HTMLPurifier_Exception(
            'Cannot use lexer that does not support line numbers with ' .
            'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
        );
    }

    return $inst;
}
149\r
150 // -- CONVENIENCE MEMBERS ---------------------------------------------\r
151\r
/**
 * Entity parser used by parseData() and normalize().
 *
 * Declared explicitly so the constructor does not create a dynamic
 * property, which is deprecated as of PHP 8.2.
 * @type HTMLPurifier_EntityParser
 */
protected $_entity_parser;

/**
 * Creates the entity parser that the convenience methods rely on.
 */
public function __construct()
{
    $this->_entity_parser = new HTMLPurifier_EntityParser();
}
156\r
/**
 * Lookup table from the most frequently seen special entities to the raw
 * character each one stands for. parseData() feeds it to strtr() as a
 * fast path before falling back to the entity parser.
 * @type array
 */
protected $_special_entity2str = array(
    '&quot;' => '"',
    '&amp;' => '&',
    '&lt;' => '<',
    '&gt;' => '>',
    '&#39;' => "'",
    '&#039;' => "'",
    '&#x27;' => "'"
);
171\r
/**
 * Decodes the special entities in a piece of character data.
 *
 * Escaped forms of the special characters are turned back into the
 * characters themselves. The common entities are handled with a cheap
 * table lookup; the entity parser is only consulted when ampersands that
 * could still start an entity remain afterwards.
 *
 * @warning
 *     You should be able to treat the output of this function as
 *     completely parsed, but that's only because all other entities should
 *     have been handled previously in substituteNonSpecialEntities()
 *
 * @param string $string String character data to be parsed.
 * @return string Parsed character data.
 */
public function parseData($string)
{
    // everything below needs at least one character to look at
    if ($string === '') {
        return '';
    }

    // Count ampersands that could begin an entity: one followed by a
    // space, or one sitting at the very end of the string, cannot.
    $candidates = substr_count($string, '&') - substr_count($string, '& ');
    if (substr($string, -1) === '&') {
        $candidates--;
    }
    if (!$candidates) {
        // no entities at all
        return $string;
    }

    // Every '&amp;' leaves a literal '&' behind once it is decoded, so
    // remember how many of those to expect in the second count.
    $escaped_amps = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // same count as above, repeated on the decoded string
    $remaining = substr_count($string, '&') - substr_count($string, '& ');
    if (substr($string, -1) === '&') {
        $remaining--;
    }
    if ($remaining <= $escaped_amps) {
        return $string;
    }

    // uncommon entities are left over; let the entity parser handle them
    return $this->_entity_parser->substituteSpecialEntities($string);
}
215\r
/**
 * Lexes an HTML string into tokens.
 *
 * This base implementation does no lexing: it only raises an
 * E_USER_ERROR, so concrete lexers are expected to override it.
 *
 * @param string $string HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[] array representation of HTML.
 */
public function tokenizeHTML($string, $config, $context)
{
    trigger_error('Call to abstract class', E_USER_ERROR);
}
227\r
/**
 * Translates CDATA sections into regular sections (through escaping).
 *
 * The section body is matched with a lazy `.*?` so that an empty section
 * (`<![CDATA[]]>`) is matched on its own and simply dropped. Requiring at
 * least one character would make the match skip the section's own `]]>`
 * and run on to a later one, escaping any markup in between.
 *
 * @param string $string HTML string to process.
 * @return string HTML with CDATA sections escaped.
 */
protected static function escapeCDATA($string)
{
    return preg_replace_callback(
        '/<!\[CDATA\[(.*?)\]\]>/s',
        array('HTMLPurifier_Lexer', 'CDATACallback'),
        $string
    );
}
241\r
/**
 * Special CDATA case that is especially convoluted for <script>: the
 * section markers are themselves wrapped in script and HTML comments.
 *
 * The section body is matched with a lazy `.*?` so that an empty section
 * is matched on its own instead of the match running on to the closing
 * marker of a later section.
 *
 * @param string $string HTML string to process.
 * @return string HTML with CDATA sections escaped.
 */
protected static function escapeCommentedCDATA($string)
{
    return preg_replace_callback(
        '#<!--//--><!\[CDATA\[//><!--(.*?)//--><!\]\]>#s',
        array('HTMLPurifier_Lexer', 'CDATACallback'),
        $string
    );
}
255\r
/**
 * Strips Internet Explorer conditional comments, together with everything
 * they enclose, from the markup.
 *
 * @param string $string HTML string to process.
 * @return string HTML with conditional comments removed.
 */
protected static function removeIEConditional($string)
{
    // case-insensitive, and '.' also spans newlines;
    // probably should generalize for all strings
    $conditional_comment = '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si';

    return preg_replace($conditional_comment, '', $string);
}
269\r
/**
 * Callback for escapeCDATA() and escapeCommentedCDATA() that performs the
 * actual escaping of a matched section.
 *
 * @warning This is only meant to be used as a preg_replace_callback()
 *          handler; calling it directly is not recommended.
 * @param array $matches PCRE matches array, with index 0 the entire match
 *                       and 1 the inside of the CDATA section.
 * @return string Escaped internals of the CDATA section.
 */
protected static function CDATACallback($matches)
{
    $section_body = $matches[1];

    // not exactly sure why the character set is needed, but whatever
    return htmlspecialchars($section_body, ENT_COMPAT, 'UTF-8');
}
284\r
/**
 * Prepares raw HTML for lexing. Depending on configuration this
 * normalizes newlines, escapes CDATA sections, strips IE conditional
 * comments, reduces a full document to its body, expands non-special
 * entities, cleans the result into well-formed UTF-8 and removes
 * processing instructions.
 *
 * @param string $html HTML.
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return string
 * @todo Consider making protected
 */
public function normalize($html, $config, $context)
{
    // turn \r\n and bare \r into \n (pairs first, then leftovers)
    if ($config->get('Core.NormalizeNewlines')) {
        $html = str_replace(array("\r\n", "\r"), "\n", $html);
    }

    // the comment-wrapped CDATA form is only escaped for trusted HTML
    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // plain CDATA sections first, then IE conditional comments
    $html = $this->removeIEConditional($this->escapeCDATA($html));

    // reduce a whole document to the contents of its body, if asked to
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $collector = false;
        if ($config->get('Core.CollectErrors')) {
            $collector =& $context->get('ErrorCollector');
        }
        $body = $this->extractBody($html);
        if ($collector && $body != $html) {
            $collector->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $body;
    }

    // expand entities that aren't the big five
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // Clean into a well-formed UTF-8 string for an SGML context. This has
    // to come after entity expansion because the entities sometimes
    // represent non-SGML characters.
    $html = HTMLPurifier_Encoder::cleanUTF8($html);

    // if processing instructions are to be removed, remove them now
    if ($config->get('Core.RemoveProcessingInstructions')) {
        $html = preg_replace('#<\?.+?\?>#s', '', $html);
    }

    return $html;
}
340\r
/**
 * Takes a string of HTML (fragment or document) and returns the content
 * of its body element, or the input unchanged when no usable body
 * element is found.
 *
 * A `<body>` whose opening tag sits inside a still-open HTML comment is
 * not treated as the document body; the input is returned untouched in
 * that case.
 *
 * @param string $html HTML fragment or complete document.
 * @return string Contents of the body element, or $html itself.
 * @todo Consider making protected
 */
public function extractBody($html)
{
    $matches = array();
    $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
    if ($result) {
        // Inspect what precedes the tag: if the last comment opener there
        // has no closer after it, the tag is commented out.
        $comment_start = strrpos($matches[1], '<!--');
        $comment_end = strrpos($matches[1], '-->');
        if ($comment_start === false ||
            ($comment_end !== false && $comment_end > $comment_start)
        ) {
            return $matches[2];
        }
    }
    return $html;
}
355}\r
356\r
357// vim: et sw=4 sts=4\r