diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-02-21 15:57:10 +0100 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-02-21 15:57:10 +0100 |
commit | 99679d06884120c57f43b44e55e03595f1f87bed (patch) | |
tree | a3f2a1aa1afdaeca1386d0c6e8a75344fd2241fb /inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php | |
parent | 655214ab30ee84884dc408488b85586f36263fcb (diff) | |
parent | d3b47e94705e17b3ba3529cbb1dc6efe69c5d2b7 (diff) | |
download | wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.gz wallabag-99679d06884120c57f43b44e55e03595f1f87bed.tar.zst wallabag-99679d06884120c57f43b44e55e03595f1f87bed.zip |
Merge pull request #481 from wallabag/dev1.5.2
1.5.2
Diffstat (limited to 'inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php')
-rw-r--r-- | inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php | 357 |
1 files changed, 357 insertions, 0 deletions
diff --git a/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php new file mode 100644 index 00000000..2a9a9d62 --- /dev/null +++ b/inc/3rdparty/htmlpurifier/HTMLPurifier/Lexer.php | |||
@@ -0,0 +1,357 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts it into
 * corresponding tokens. It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer. Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor.
     *
     * Declared explicitly because creating properties dynamically is
     * deprecated as of PHP 8.2. It stays public because the previously
     * dynamic property was public as well.
     * @type HTMLPurifier_EntityParser
     */
    public $_entity_parser;

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype (object or name) instead is deprecated and
     *        raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, or
     *         if line-number tracking is required but the chosen lexer does
     *         not support it.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated prototype path: there is no configuration object to
            // query, so calling ->get() on the prototype would be a fatal
            // error. Tracking requirements are simply unknown here.
            $lexer = $config;
            $needs_tracking = false;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if ($lexer === null) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // Only strings can name a lexer. Rejecting everything else here
            // keeps the loose switch comparison below from matching values
            // such as true or 0, and keeps htmlspecialchars() from being
            // handed a non-string.
            if (!is_string($lexer)) {
                throw new HTMLPurifier_Exception(
                    "Cannot instantiate unrecognized Lexer type " .
                    gettype($lexer)
                );
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser shared by the convenience methods.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @warning
     * You should be able to treat the output of this function as
     * completely parsed, but that's only because all other entities should
     * have been handled previously in substituteNonSpecialEntities()
     *
     * @param string $string String character data to be parsed.
     * @return string Parsed character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities

        // every &amp; leaves one literal ampersand behind after translation,
        // so that many ampersands are expected to survive the strtr() below
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; concrete lexers
     * are expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() that does the work.
     *
     * @warning This method is protected: it is only meant to be reached
     *          through the preg_replace_callback() calls made inside this
     *          class, not called directly.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // the character set is passed explicitly so the result does not
        // depend on the default charset of the running PHP version
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // strict comparison: loose != can treat two different
            // numeric-looking strings as equal
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element, or the input unchanged when no complete
     * <body>...</body> pair is found.
     * @param string $html HTML fragment or document.
     * @return string Contents of the body element, or $html itself.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }
}
356 | |||
357 | // vim: et sw=4 sts=4 | ||