]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/simplepie/library/SimplePie/Sanitize.php
[change] we now use Full-Text RSS 3.1, thank you so much @fivefilters
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / simplepie / library / SimplePie / Sanitize.php
CommitLineData
ec397236
NL
1<?php
2/**
3 * SimplePie
4 *
5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
7 *
42c80841 8 * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
ec397236
NL
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
13 *
14 * * Redistributions of source code must retain the above copyright notice, this list of
15 * conditions and the following disclaimer.
16 *
17 * * Redistributions in binary form must reproduce the above copyright notice, this list
18 * of conditions and the following disclaimer in the documentation and/or other materials
19 * provided with the distribution.
20 *
21 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * to endorse or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 *
35 * @package SimplePie
42c80841
NL
36 * @version 1.3.1
37 * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
ec397236
NL
38 * @author Ryan Parman
39 * @author Geoffrey Sneddon
40 * @author Ryan McCue
41 * @link http://simplepie.org/ SimplePie
42 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
ec397236
NL
43 */
44
ec397236 45/**
42c80841
NL
46 * Used for data cleanup and post-processing
47 *
48 *
49 * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
50 *
51 * @package SimplePie
ec397236
NL
52 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
53 */
class SimplePie_Sanitize
{
	// Private vars

	/**
	 * Base URL that relative URLs are resolved against.
	 * Overwritten on every HTML/XHTML pass through sanitize().
	 */
	public $base;

	/**
	 * Registry used to reach the Cache, File and Misc helpers.
	 * Must be supplied through set_registry() before sanitize() needs it.
	 */
	public $registry;

	// Options
	public $remove_div = true;
	public $image_handler = '';
	public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
	public $encode_instead_of_strip = false;
	public $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
	public $strip_comments = false;
	public $output_encoding = 'UTF-8';
	public $enable_cache = true;
	public $cache_location = './cache';
	public $cache_name_function = 'md5';
	public $timeout = 10;
	public $useragent = '';
	public $force_fsockopen = false;
	public $replace_url_attributes = null;

	/**
	 * Create a sanitizer with the default URL-replacement map installed.
	 */
	public function __construct()
	{
		// Set defaults
		$this->set_url_replacements(null);
	}

	/**
	 * Choose whether the wrapping <div> is removed from sanitized markup.
	 *
	 * @param bool $enable True to drop the wrapper, false to keep a bare <div>
	 */
	public function remove_div($enable = true)
	{
		$this->remove_div = (bool) $enable;
	}

	/**
	 * Set the prefix that cached image names are appended to.
	 *
	 * @param string|false $page Handler prefix; any falsy value disables image rewriting
	 */
	public function set_image_handler($page = false)
	{
		$this->image_handler = $page ? (string) $page : false;
	}

	/**
	 * Provide the registry used to call the Cache, File and Misc helpers.
	 *
	 * @param SimplePie_Registry $registry
	 */
	public function set_registry(SimplePie_Registry $registry)
	{
		$this->registry = $registry;
	}

	/**
	 * Receive the cache settings used when rewriting image URLs.
	 *
	 * Falsy $cache_location / $cache_name_function values leave the current
	 * setting untouched. $cache_class is accepted but not used here.
	 *
	 * @param bool $enable_cache
	 * @param string $cache_location
	 * @param string $cache_name_function Callable name used to derive cache keys
	 * @param string $cache_class Unused
	 */
	public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
	{
		if (isset($enable_cache))
		{
			$this->enable_cache = (bool) $enable_cache;
		}

		if ($cache_location)
		{
			$this->cache_location = (string) $cache_location;
		}

		if ($cache_name_function)
		{
			$this->cache_name_function = (string) $cache_name_function;
		}
	}

	/**
	 * Receive the file-fetching settings used when caching images.
	 *
	 * Each value is only stored when truthy, so a falsy argument keeps the
	 * previous setting. $file_class is accepted but not used here.
	 *
	 * @param string $file_class Unused
	 * @param int $timeout
	 * @param string $useragent
	 * @param bool $force_fsockopen
	 */
	public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
	{
		if ($timeout)
		{
			$this->timeout = (string) $timeout;
		}

		if ($useragent)
		{
			$this->useragent = (string) $useragent;
		}

		if ($force_fsockopen)
		{
			$this->force_fsockopen = (string) $force_fsockopen;
		}
	}

	/**
	 * Set the list of tag names that sanitize() strips.
	 *
	 * @param array|string|false $tags Array of names, a comma-separated string, or a falsy value to disable stripping
	 */
	public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
	{
		if (!$tags)
		{
			$this->strip_htmltags = false;
		}
		elseif (is_array($tags))
		{
			$this->strip_htmltags = $tags;
		}
		else
		{
			$this->strip_htmltags = explode(',', $tags);
		}
	}

	/**
	 * Choose between removing stripped tags and showing them as escaped text.
	 *
	 * @param bool $encode
	 */
	public function encode_instead_of_strip($encode = false)
	{
		$this->encode_instead_of_strip = (bool) $encode;
	}

	/**
	 * Set the list of attribute names that sanitize() removes.
	 *
	 * @param array|string|false $attribs Array of names, a comma-separated string, or a falsy value to disable stripping
	 */
	public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
	{
		if (!$attribs)
		{
			$this->strip_attributes = false;
		}
		elseif (is_array($attribs))
		{
			$this->strip_attributes = $attribs;
		}
		else
		{
			$this->strip_attributes = explode(',', $attribs);
		}
	}

	/**
	 * Choose whether comment nodes are removed from sanitized markup.
	 *
	 * @param bool $strip
	 */
	public function strip_comments($strip = false)
	{
		$this->strip_comments = (bool) $strip;
	}

	/**
	 * Set the encoding sanitize() converts its result to.
	 *
	 * @param string $encoding
	 */
	public function set_output_encoding($encoding = 'UTF-8')
	{
		$this->output_encoding = (string) $encoding;
	}

	/**
	 * Set element/attribute key/value pairs of HTML attributes
	 * containing URLs that need to be resolved relative to the feed
	 *
	 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
	 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
	 * |q|@cite
	 *
	 * @since 1.0
	 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
	 */
	public function set_url_replacements($element_attribute = null)
	{
		if ($element_attribute === null)
		{
			$element_attribute = array(
				'a' => 'href',
				'area' => 'href',
				'blockquote' => 'cite',
				'del' => 'cite',
				'form' => 'action',
				'img' => array(
					'longdesc',
					'src'
				),
				'input' => 'src',
				'ins' => 'cite',
				'q' => 'cite'
			);
		}
		$this->replace_url_attributes = (array) $element_attribute;
	}

	/**
	 * Clean a piece of feed data according to its construct type.
	 *
	 * HTML/XHTML constructs are parsed into a DOM, stripped of the configured
	 * tags, attributes and (optionally) comments, have their URL attributes
	 * made absolute and their images optionally redirected to the cache.
	 * IRI constructs are made absolute, TEXT/IRI constructs are HTML-escaped,
	 * and the result is converted to the configured output encoding.
	 *
	 * @param string $data Raw data
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 * @param string $base Base URL used to resolve relative URLs
	 * @return string Sanitized data
	 */
	public function sanitize($data, $type, $base = '')
	{
		$data = trim($data);

		// Empty data only needs work when it is an IRI (it may resolve to the base)
		if ($data === '' && !($type & SIMPLEPIE_CONSTRUCT_IRI))
		{
			return $data;
		}

		if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
		{
			// An entity reference or a closing tag is taken as evidence of HTML
			if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
			{
				$type |= SIMPLEPIE_CONSTRUCT_HTML;
			}
			else
			{
				$type |= SIMPLEPIE_CONSTRUCT_TEXT;
			}
		}

		if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
		{
			$data = base64_decode($data);
		}

		if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
		{
			$document = new DOMDocument();
			$document->encoding = 'UTF-8';
			$data = $this->preprocess($data, $type);

			// Feed markup is frequently malformed; silence the parser's complaints
			set_error_handler(array('SimplePie_Misc', 'silence_errors'));
			$document->loadHTML($data);
			restore_error_handler();

			// Strip comments
			if ($this->strip_comments)
			{
				$xpath = new DOMXPath($document);
				$comments = $xpath->query('//comment()');

				foreach ($comments as $comment)
				{
					$comment->parentNode->removeChild($comment);
				}
			}

			// Strip out HTML tags and attributes that might cause various security problems.
			// Based on recommendations by Mark Pilgrim at:
			// http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
			if ($this->strip_htmltags)
			{
				foreach ($this->strip_htmltags as $tag)
				{
					$this->strip_tag($tag, $document, $type);
				}
			}

			if ($this->strip_attributes)
			{
				foreach ($this->strip_attributes as $attrib)
				{
					$this->strip_attr($attrib, $document);
				}
			}

			// Replace relative URLs
			$this->base = $base;
			foreach ($this->replace_url_attributes as $element => $attributes)
			{
				$this->replace_urls($document, $element, $attributes);
			}

			// If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
			if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
			{
				$this->cache_images($document);
			}

			// Remove the DOCTYPE
			// Seems to cause segfaulting if we don't do this
			if ($document->firstChild instanceof DOMDocumentType)
			{
				$document->removeChild($document->firstChild);
			}

			// Move everything from the body to the root
			$body = $document->getElementsByTagName('body')->item(0);
			$real_body = ($body !== null) ? $body->firstChild : null;

			if ($real_body === null)
			{
				// Nothing survived parsing/stripping; avoid calling replaceChild() with null
				$data = '';
			}
			else
			{
				$document->replaceChild($real_body, $document->firstChild);

				// Finally, convert to a HTML string
				$data = trim($document->saveHTML());
			}

			if ($this->remove_div)
			{
				$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
				$data = preg_replace('/<\/div>$/', '', $data);
			}
			else
			{
				$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
			}
		}

		if ($type & SIMPLEPIE_CONSTRUCT_IRI)
		{
			$absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
			if ($absolute !== false)
			{
				$data = $absolute;
			}
		}

		if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
		{
			$data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
		}

		if ($this->output_encoding !== 'UTF-8')
		{
			$data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
		}

		return $data;
	}

	/**
	 * Point every <img src> at the image handler, caching the image first if needed.
	 *
	 * An image whose cache entry loads is rewritten immediately. Otherwise the
	 * image is fetched, saved to the cache and then rewritten; a failed save
	 * raises an E_USER_WARNING and leaves the original src in place.
	 *
	 * @param DOMDocument $document Document whose images are rewritten in place
	 */
	protected function cache_images($document)
	{
		// Forward the requesting client's address when there is one (absent under CLI)
		$request_headers = array();
		if (isset($_SERVER['REMOTE_ADDR']))
		{
			$request_headers['X-FORWARDED-FOR'] = $_SERVER['REMOTE_ADDR'];
		}

		$images = $document->getElementsByTagName('img');
		foreach ($images as $img)
		{
			if (!$img->hasAttribute('src'))
			{
				continue;
			}

			$src = $img->getAttribute('src');
			$image_url = call_user_func($this->cache_name_function, $src);
			$cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));

			if ($cache->load())
			{
				$img->setAttribute('src', $this->image_handler . $image_url);
				continue;
			}

			$file = $this->registry->create('File', array($src, $this->timeout, 5, $request_headers, $this->useragent, $this->force_fsockopen));

			// The bit test needs its own parentheses: === binds tighter than &
			if ($file->success && (($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
			{
				if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
				{
					$img->setAttribute('src', $this->image_handler . $image_url);
				}
				else
				{
					trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
				}
			}
		}
	}

	/**
	 * Wrap a markup fragment in a complete document so DOMDocument can parse it.
	 *
	 * @param string $html Fragment to wrap
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 * @return string Full document with DOCTYPE, UTF-8 meta tag and body
	 */
	protected function preprocess($html, $type)
	{
		$ret = '';
		if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
		{
			// Atom XHTML constructs are wrapped with a div by default
			// Note: No protection if $html contains a stray </div>!
			$html = '<div>' . $html . '</div>';
			$ret .= '<!DOCTYPE html>';
			$content_type = 'text/html';
		}
		else
		{
			$ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
			$content_type = 'application/xhtml+xml';
		}

		$ret .= '<html><head>';
		$ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
		$ret .= '</head><body>' . $html . '</body></html>';
		return $ret;
	}

	/**
	 * Make the given URL attributes of every $tag element absolute.
	 *
	 * Tags that are configured to be stripped are skipped. Resolution is done
	 * against $this->base; values that cannot be resolved are left unchanged.
	 *
	 * @param DOMDocument $document Document modified in place
	 * @param string $tag Element name
	 * @param string|array $attributes Attribute name or list of attribute names
	 */
	public function replace_urls($document, $tag, $attributes)
	{
		if (!is_array($attributes))
		{
			$attributes = array($attributes);
		}

		if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
		{
			return;
		}

		$elements = $document->getElementsByTagName($tag);
		foreach ($elements as $element)
		{
			foreach ($attributes as $attribute)
			{
				if (!$element->hasAttribute($attribute))
				{
					continue;
				}

				$value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
				if ($value !== false)
				{
					$element->setAttribute($attribute, $value);
				}
			}
		}
	}

	/**
	 * Regex-callback form of tag stripping.
	 *
	 * Reads $match[1] as the tag name, $match[2] as its attribute text,
	 * $match[3]/$match[4] as its content and $match[0] as the whole match.
	 *
	 * @param array $match Capture groups
	 * @return string Replacement text
	 */
	public function do_strip_htmltags($match)
	{
		$keeps_content = isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style'));

		if ($this->encode_instead_of_strip)
		{
			if ($keeps_content)
			{
				$match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
				$match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
				return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
			}

			return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
		}

		if ($keeps_content)
		{
			return $match[4];
		}

		return '';
	}

	/**
	 * Remove (or textually encode) every $tag element below <body>.
	 *
	 * - With encode_instead_of_strip, the element is replaced by its children,
	 *   surrounded by text nodes spelling out the start and end tag (the tag
	 *   text is omitted for script/style).
	 * - Otherwise script/style elements are removed together with their
	 *   content, and any other element is replaced by its children.
	 *
	 * @param string $tag Element name to strip
	 * @param DOMDocument $document Document modified in place
	 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
	 */
	protected function strip_tag($tag, $document, $type)
	{
		$xpath = new DOMXPath($document);
		$elements = $xpath->query('body//' . $tag);
		if ($elements === false)
		{
			// $tag did not form a valid XPath expression; nothing to strip
			return;
		}

		$is_script_or_style = in_array($tag, array('script', 'style'));

		if ($this->encode_instead_of_strip)
		{
			foreach ($elements as $element)
			{
				$fragment = $document->createDocumentFragment();

				// For elements which aren't script or style, include the tag itself
				if (!$is_script_or_style)
				{
					$text = '<' . $tag;
					if ($element->hasAttributes())
					{
						$attrs = array();
						foreach ($element->attributes as $name => $attr)
						{
							$value = $attr->value;

							// Compare against '' explicitly: empty() would also swallow "0"
							if ($value === '')
							{
								if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
								{
									// In XHTML, empty values should never exist, so we repeat the value
									$value = $name;
								}
								elseif ($type & SIMPLEPIE_CONSTRUCT_HTML)
								{
									// For HTML, empty is fine
									$attrs[] = $name;
									continue;
								}
							}

							// Standard attribute text
							$attrs[] = $name . '="' . $value . '"';
						}
						$text .= ' ' . implode(' ', $attrs);
					}
					$text .= '>';
					$fragment->appendChild(new DOMText($text));
				}

				// Appending a child moves it, so keep taking the first one until none remain
				while ($element->firstChild !== null)
				{
					$fragment->appendChild($element->firstChild);
				}

				if (!$is_script_or_style)
				{
					$fragment->appendChild(new DOMText('</' . $tag . '>'));
				}

				$element->parentNode->replaceChild($fragment, $element);
			}

			return;
		}

		if ($is_script_or_style)
		{
			foreach ($elements as $element)
			{
				$element->parentNode->removeChild($element);
			}

			return;
		}

		foreach ($elements as $element)
		{
			$fragment = $document->createDocumentFragment();
			while ($element->firstChild !== null)
			{
				$fragment->appendChild($element->firstChild);
			}

			$element->parentNode->replaceChild($fragment, $element);
		}
	}

	/**
	 * Remove the attribute $attrib from every element that carries it.
	 *
	 * @param string $attrib Attribute name
	 * @param DOMDocument $document Document modified in place
	 */
	protected function strip_attr($attrib, $document)
	{
		$xpath = new DOMXPath($document);
		$elements = $xpath->query('//*[@' . $attrib . ']');
		if ($elements === false)
		{
			// $attrib did not form a valid XPath expression; nothing to strip
			return;
		}

		foreach ($elements as $element)
		{
			$element->removeAttribute($attrib);
		}
	}
}