5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
8 * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
14 * * Redistributions of source code must retain the above copyright notice, this list of
15 * conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright notice, this list
18 * of conditions and the following disclaimer in the documentation and/or other materials
19 * provided with the distribution.
21 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * to endorse or promote products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
37 * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
39 * @author Geoffrey Sneddon
41 * @link http://simplepie.org/ SimplePie
42 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
46 * Used for data cleanup and post-processing
49 * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
52 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
54 class SimplePie_Sanitize
// Sanitizer options. Each one is normally changed through the setter method
// of the same (or a similar) name further down in this class.

// When true, sanitize() removes the leading <div ...> and trailing </div>
// from the serialized HTML; when false only the div's attributes are dropped.
60 var $remove_div = true;
// Prefix concatenated with the cache name to build the rewritten <img src>.
// An empty string (or false) leaves image rewriting switched off.
61 var $image_handler = '';
// Element names that sanitize() hands to strip_tag(); false disables tag stripping.
62 var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
// When true, strip_tag() replaces a stripped element with text nodes spelling
// out its tags instead of simply dropping the tags.
63 var $encode_instead_of_strip = false;
// Attribute names that sanitize() hands to strip_attr(); false disables attribute stripping.
64 var $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
// When true, sanitize() removes every comment node from HTML/XHTML content.
65 var $strip_comments = false;
// sanitize() converts its result from UTF-8 to this encoding when it differs from 'UTF-8'.
66 var $output_encoding = 'UTF-8';
// Image rewriting in sanitize() only runs while this is true.
67 var $enable_cache = true;
// Location passed to the cache handler that stores fetched images.
68 var $cache_location = './cache';
// Callable applied (via call_user_func) to an image URL to derive its cache name.
69 var $cache_name_function = 'md5';
// Forwarded as the last constructor argument when sanitize() creates a File object.
72 var $force_fsockopen = false;
// Map of element name => attribute name(s) whose URLs replace_urls() resolves;
// filled in by the constructor through set_url_replacements().
73 var $replace_url_attributes = null;
/**
 * Create a sanitizer whose URL-replacement map starts out as the default one.
 */
public function __construct()
{
	// A null argument tells set_url_replacements() to install its default map.
	$use_default_map = null;
	$this->set_url_replacements($use_default_map);
}
/**
 * Switch the remove_div option on or off.
 *
 * @param mixed $enable Any value; it is coerced to a boolean before being stored
 */
public function remove_div($enable = true)
{
	$flag = (bool) $enable;
	$this->remove_div = $flag;
}
/**
 * Set the prefix used when rewriting <img src> attributes to cached copies.
 *
 * @param string|false $page Handler prefix; any falsy value switches image
 *                           handling off by storing false
 */
public function set_image_handler($page = false)
{
	// The two assignments are alternatives: without the branch the second one
	// would always win and the handler could never be enabled.
	if ($page)
	{
		$this->image_handler = (string) $page;
	}
	else
	{
		$this->image_handler = false;
	}
}
/**
 * Store the registry this sanitizer uses to call and create collaborating
 * classes (Misc, Cache and File are requested through it in this class).
 *
 * @param SimplePie_Registry $registry Registry instance to keep
 */
public function set_registry(SimplePie_Registry $registry)
{
	$this->registry = $registry;
}
/**
 * Receive cache settings.
 *
 * @param bool $enable_cache Whether caching is enabled; ignored when null
 * @param string $cache_location Cache location; ignored when empty
 * @param string $cache_name_function Name of the function used to derive cache names; ignored when empty
 * @param string $cache_class Accepted for interface compatibility; not used by this method
 */
public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
{
	if (isset($enable_cache))
	{
		$this->enable_cache = (bool) $enable_cache;
	}

	// Guarded like the name function below, so an empty value cannot wipe
	// out a previously configured location.
	if ($cache_location)
	{
		$this->cache_location = (string) $cache_location;
	}

	if ($cache_name_function)
	{
		$this->cache_name_function = (string) $cache_name_function;
	}
}
/**
 * Receive the settings used when fetching remote files (images) in sanitize().
 *
 * Each setting is only stored when a truthy value is supplied, so calling
 * with defaults or empty values leaves the current configuration untouched.
 *
 * @param string $file_class Accepted for interface compatibility; not used by this method
 * @param int $timeout Timeout handed to the File object; stored as a string
 * @param string $useragent User-agent string handed to the File object
 * @param bool $force_fsockopen Whether the File object is told to force fsockopen
 */
public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
{
	if ($timeout)
	{
		$this->timeout = (string) $timeout;
	}

	if ($useragent)
	{
		$this->useragent = (string) $useragent;
	}

	if ($force_fsockopen)
	{
		// The property is a boolean flag (its declared default is false), so
		// keep it boolean rather than storing the string "1".
		$this->force_fsockopen = (bool) $force_fsockopen;
	}
}
/**
 * Choose which HTML elements sanitize() strips.
 *
 * @param array|string|false $tags Array of element names, a comma-separated
 *                                 string of names, or a falsy value to turn
 *                                 tag stripping off
 */
public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
{
	// The three assignments are alternatives; run back-to-back the last one
	// would always leave stripping disabled.
	if ($tags)
	{
		if (is_array($tags))
		{
			$this->strip_htmltags = $tags;
		}
		else
		{
			$this->strip_htmltags = explode(',', $tags);
		}
	}
	else
	{
		$this->strip_htmltags = false;
	}
}
/**
 * Choose between encoding stripped tags as text and removing them.
 *
 * @param mixed $encode Any value; it is coerced to a boolean before being stored
 */
public function encode_instead_of_strip($encode = false)
{
	$flag = (bool) $encode;
	$this->encode_instead_of_strip = $flag;
}
/**
 * Choose which attributes sanitize() removes from every element.
 *
 * @param array|string|false $attribs Array of attribute names, a
 *                                    comma-separated string of names, or a
 *                                    falsy value to turn attribute stripping off
 */
public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
{
	// The three assignments are alternatives; run back-to-back the last one
	// would always leave stripping disabled.
	if ($attribs)
	{
		if (is_array($attribs))
		{
			$this->strip_attributes = $attribs;
		}
		else
		{
			$this->strip_attributes = explode(',', $attribs);
		}
	}
	else
	{
		$this->strip_attributes = false;
	}
}
/**
 * Switch removal of comment nodes on or off.
 *
 * @param mixed $strip Any value; it is coerced to a boolean before being stored
 */
public function strip_comments($strip = false)
{
	$flag = (bool) $strip;
	$this->strip_comments = $flag;
}
/**
 * Set the character encoding sanitize() converts its result to.
 *
 * @param mixed $encoding Encoding name; it is cast to a string before being stored
 */
public function set_output_encoding($encoding = 'UTF-8')
{
	$encoding_name = (string) $encoding;
	$this->output_encoding = $encoding_name;
}
/**
 * Set element/attribute key/value pairs of HTML attributes
 * containing URLs that need to be resolved relative to the feed
 *
 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
 * |q|@cite
 *
 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
 */
public function set_url_replacements($element_attribute = null)
{
	if ($element_attribute === null)
	{
		// Keep this map in step with the list in the doc comment above.
		// A value may be one attribute name or an array of names.
		$element_attribute = array(
			'a' => 'href',
			'area' => 'href',
			'blockquote' => 'cite',
			'del' => 'cite',
			'form' => 'action',
			'img' => array(
				'longdesc',
				'src'
			),
			'input' => 'src',
			'ins' => 'cite',
			// HTML defines cite on <q> as a URL as well, same as blockquote/del/ins.
			'q' => 'cite'
		);
	}

	$this->replace_url_attributes = (array) $element_attribute;
}
/**
 * Clean up a piece of feed data according to its construct type.
 *
 * Work is only done when $data is non-empty or the IRI flag is set. Steps, in order:
 *  - a MAYBE_HTML construct is reclassified as HTML or TEXT by a regex probe;
 *  - a BASE64 construct is decoded;
 *  - HTML/XHTML is wrapped by preprocess(), parsed into a DOMDocument, then
 *    optionally has comments, configured tags and configured attributes removed,
 *    has URL attributes resolved, optionally has <img src> rewritten to cached
 *    copies, and is serialized back to a string;
 *  - an IRI construct is passed to Misc::absolutize_url() together with $base;
 *  - TEXT and IRI constructs are passed through htmlspecialchars();
 *  - the result is converted from UTF-8 when another output encoding is configured.
 *
 * @param string $data Raw data to sanitize
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 * @param string $base Base URL handed to Misc::absolutize_url() for IRI constructs
 *
 * NOTE(review): no return statement is visible in this method as shown;
 * presumably the sanitized $data is returned - confirm.
 */
225 public function sanitize($data, $type, $base = '')
228 if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI
)
230 if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML
)
// Probe for an entity/character reference or a closing tag to decide HTML vs. text.
232 if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE
. '>)/', $data))
234 $type |= SIMPLEPIE_CONSTRUCT_HTML
;
238 $type |= SIMPLEPIE_CONSTRUCT_TEXT
;
242 if ($type & SIMPLEPIE_CONSTRUCT_BASE64
)
244 $data = base64_decode($data);
247 if ($type & (SIMPLEPIE_CONSTRUCT_HTML
| SIMPLEPIE_CONSTRUCT_XHTML
))
250 $document = new DOMDocument();
251 $document->encoding
= 'UTF-8';
252 $data = $this->preprocess($data, $type);
// Parser warnings raised by loadHTML() are routed to SimplePie_Misc::silence_errors
// for the duration of the call only.
254 set_error_handler(array('SimplePie_Misc', 'silence_errors'));
255 $document->loadHTML($data);
256 restore_error_handler();
259 if ($this->strip_comments
)
261 $xpath = new DOMXPath($document);
262 $comments = $xpath->query('//comment()');
264 foreach ($comments as $comment)
266 $comment->parentNode
->removeChild($comment);
270 // Strip out HTML tags and attributes that might cause various security problems.
271 // Based on recommendations by Mark Pilgrim at:
272 // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
273 if ($this->strip_htmltags
)
275 foreach ($this->strip_htmltags
as $tag)
277 $this->strip_tag($tag, $document, $type);
281 if ($this->strip_attributes
)
283 foreach ($this->strip_attributes
as $attrib)
285 $this->strip_attr($attrib, $document);
289 // Replace relative URLs
// NOTE(review): replace_urls() resolves against $this->base, but no assignment
// of $base to $this->base is visible here - confirm it is set before this loop.
291 foreach ($this->replace_url_attributes
as $element => $attributes)
293 $this->replace_urls($document, $element, $attributes);
296 // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
297 if (isset($this->image_handler
) && ((string) $this->image_handler
) !== '' && $this->enable_cache
)
299 $images = $document->getElementsByTagName('img');
300 foreach ($images as $img)
302 if ($img->hasAttribute('src'))
// The cache name is the configured name function applied to the original src.
304 $image_url = call_user_func($this->cache_name_function
, $img->getAttribute('src'));
305 $cache = $this->registry
->call('Cache', 'get_handler', array($this->cache_location
, $image_url, 'spi'));
// NOTE(review): the condition choosing between the rewrite below and the
// fetch-then-rewrite path that follows is not visible as shown - confirm.
309 $img->setAttribute('src', $this->image_handler
. $image_url);
// NOTE(review): $img is used as an object everywhere else in this loop
// (hasAttribute/getAttribute/setAttribute), yet it is indexed like an array
// here; $img->getAttribute('src') looks like the intended value - confirm.
// NOTE(review): $_SERVER['REMOTE_ADDR'] is read without an isset() check.
313 $file = $this->registry
->create('File', array($img['attribs']['src']['data'], $this->timeout
, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent
, $this->force_fsockopen
));
// NOTE(review): $headers is not read again in the visible code.
314 $headers = $file->headers
;
// NOTE(review): in PHP `===` binds tighter than `&`, so the first test is
// evaluated as $file->method & (SIMPLEPIE_FILE_SOURCE_REMOTE === 0), not as
// ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0. Likewise `&&` binds
// tighter than `||`, so the status test reads: 200, or (above 206 and below 300).
316 if ($file->success
&& ($file->method
& SIMPLEPIE_FILE_SOURCE_REMOTE
=== 0 || ($file->status_code
=== 200 || $file->status_code
> 206 && $file->status_code
< 300)))
// The src is only rewritten once the fetched image has actually been cached.
318 if ($cache->save(array('headers' => $file->headers
, 'body' => $file->body
)))
320 $img->setAttribute('src', $this->image_handler
. $image_url);
324 trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING
);
332 // Remove the DOCTYPE
333 // Seems to cause segfaulting if we don't do this
334 if ($document->firstChild
instanceof DOMDocumentType
)
336 $document->removeChild($document->firstChild
);
339 // Move everything from the body to the root
// Only the first child node of <body> is promoted to the document root.
// NOTE(review): item(0) yields null when no <body> (or no child) exists - confirm
// that cannot happen after preprocess().
340 $real_body = $document->getElementsByTagName('body')->item(0)->childNodes
->item(0);
341 $document->replaceChild($real_body, $document->firstChild
);
343 // Finally, convert to a HTML string
344 $data = trim($document->saveHTML());
346 if ($this->remove_div
)
// Drop the outer wrapper: the opening <div ...> at the start and </div> at the end.
348 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE
. '>/', '', $data);
349 $data = preg_replace('/<\/div>$/', '', $data);
// Otherwise keep the wrapper but replace its opening tag with a bare <div>.
353 $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE
. '>/', '<div>', $data);
357 if ($type & SIMPLEPIE_CONSTRUCT_IRI
)
359 $absolute = $this->registry
->call('Misc', 'absolutize_url', array($data, $base));
// NOTE(review): the body of this test is not visible as shown; presumably
// $data is replaced by $absolute - confirm.
360 if ($absolute !== false)
366 if ($type & (SIMPLEPIE_CONSTRUCT_TEXT
| SIMPLEPIE_CONSTRUCT_IRI
))
368 $data = htmlspecialchars($data, ENT_COMPAT
, 'UTF-8');
371 if ($this->output_encoding
!== 'UTF-8')
373 $data = $this->registry
->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding
));
/**
 * Wrap a markup fragment in a complete document so DOMDocument::loadHTML()
 * can parse it as UTF-8.
 *
 * @param string $html Markup fragment to wrap
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
 * @return string Document source: a DOCTYPE, a <head> holding a UTF-8
 *                Content-Type <meta>, and a <body> holding the fragment
 */
protected function preprocess($html, $type)
{
	// Start from an empty buffer so the appends below never touch an
	// undefined variable.
	$ret = '';

	// True whenever any flag other than XHTML is set in $type.
	if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
	{
		// Atom XHTML constructs are wrapped with a div by default
		// Note: No protection if $html contains a stray </div>!
		$html = '<div>' . $html . '</div>';
		$ret .= '<!DOCTYPE html>';
		$content_type = 'text/html';
	}
	else
	{
		// Exactly one DOCTYPE / content type pair applies per document.
		$ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
		$content_type = 'application/xhtml+xml';
	}

	$ret .= '<html><head>';
	$ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
	$ret .= '</head><body>' . $html . '</body></html>';

	// sanitize() feeds this return value straight into loadHTML().
	return $ret;
}
/**
 * Resolve the URL held in the given attribute(s) of every $tag element.
 *
 * Each attribute value is passed to Misc::absolutize_url() together with
 * $this->base; the attribute is overwritten only when that call does not
 * return false.
 *
 * @param DOMDocument $document Document to modify in place
 * @param string $tag Element name to look for
 * @param string|array $attributes One attribute name or a list of names
 */
public function replace_urls($document, $tag, $attributes)
{
	// A lone attribute name is treated as a one-element list.
	$attribute_list = is_array($attributes) ? $attributes : array($attributes);

	// Tags that appear in the strip list are skipped entirely.
	if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
	{
		return;
	}

	foreach ($document->getElementsByTagName($tag) as $node)
	{
		foreach ($attribute_list as $attr_name)
		{
			if (!$node->hasAttribute($attr_name))
			{
				continue;
			}

			$resolved = $this->registry->call('Misc', 'absolutize_url', array($node->getAttribute($attr_name), $this->base));
			if ($resolved !== false)
			{
				$node->setAttribute($attr_name, $resolved);
			}
		}
	}
}
/**
 * Produce the replacement text for one matched tag.
 *
 * Reads $match[0] (the whole match), $match[1] and $match[2] (escaped before
 * use) and $match[3]; $match[4] is only tested for existence here.
 * NOTE(review): presumably a preg_replace_callback() handler, with [1] the tag
 * name and [2] its attributes - the calling pattern is not visible; confirm.
 *
 * @param array $match Match groups as described above
 * @return string Replacement text
 */
429 public function do_strip_htmltags($match)
431 if ($this->encode_instead_of_strip
)
// Tags other than script/style with group 4 set keep their content ($match[3]).
433 if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
435 $match[1] = htmlspecialchars($match[1], ENT_COMPAT
, 'UTF-8');
436 $match[2] = htmlspecialchars($match[2], ENT_COMPAT
, 'UTF-8');
// NOTE(review): the angle brackets below are literal, so the tag is emitted
// as live markup rather than as encoded text; &lt; / &gt; look intended - confirm.
437 return "<$match[1]$match[2]>$match[3]</$match[1]>";
// Everything else (including script/style) is escaped as a whole.
441 return htmlspecialchars($match[0], ENT_COMPAT
, 'UTF-8');
// NOTE(review): the body of this branch and any final fallback are not visible
// as shown - confirm what is returned when encoding is switched off.
444 elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
/**
 * Remove every $tag element found under <body> from the document.
 *
 * Three modes, chosen by the encode_instead_of_strip option and the tag name:
 *  - encoding on: the element is replaced by a fragment holding its children,
 *    surrounded (except for script/style) by text nodes spelling out the tags;
 *  - encoding off, script/style: the element is removed together with its content;
 *  - encoding off, any other tag: the element is replaced by its children.
 *
 * @param string $tag Element name to strip
 * @param DOMDocument $document Document to modify in place
 * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags; selects how empty
 *                  attribute values are written when encoding
 */
454 protected function strip_tag($tag, $document, $type)
456 $xpath = new DOMXPath($document);
457 $elements = $xpath->query('body//' . $tag);
458 if ($this->encode_instead_of_strip
)
460 foreach ($elements as $element)
462 $fragment = $document->createDocumentFragment();
464 // For elements which aren't script or style, include the tag itself
// NOTE(review): the statements that start $text and initialise $attrs are not
// visible as shown - confirm both are set before they are appended to below.
465 if (!in_array($tag, array('script', 'style')))
468 if ($element->hasAttributes())
471 foreach ($element->attributes
as $name => $attr)
473 $value = $attr->value
;
475 // In XHTML, empty values should never exist, so we repeat the value
476 if (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_XHTML
))
480 // For HTML, empty is fine
481 elseif (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_HTML
))
487 // Standard attribute text
488 $attrs[] = $name . '="' . $attr->value
. '"';
490 $text .= ' ' . implode(' ', $attrs);
// The opening tag is added as a text node, not as an element.
493 $fragment->appendChild(new DOMText($text));
// Move all children into the fragment. appendChild() detaches a node from its
// old parent, so item(0) is always the next child still left on the element.
496 $number = $element->childNodes
->length
;
497 for ($i = $number; $i > 0; $i--)
499 $child = $element->childNodes
->item(0);
500 $fragment->appendChild($child);
503 if (!in_array($tag, array('script', 'style')))
505 $fragment->appendChild(new DOMText('</' . $tag . '>'));
508 $element->parentNode
->replaceChild($fragment, $element);
// Encoding off: script and style elements are dropped along with their content.
513 elseif (in_array($tag, array('script', 'style')))
515 foreach ($elements as $element)
517 $element->parentNode
->removeChild($element);
// Encoding off, any other tag: unwrap it, keeping its children in place.
524 foreach ($elements as $element)
526 $fragment = $document->createDocumentFragment();
527 $number = $element->childNodes
->length
;
528 for ($i = $number; $i > 0; $i--)
530 $child = $element->childNodes
->item(0);
531 $fragment->appendChild($child);
534 $element->parentNode
->replaceChild($fragment, $element);
/**
 * Remove the named attribute from every element in the document that has it.
 *
 * @param string $attrib Attribute name; inserted as-is into the XPath expression
 * @param DOMDocument $document Document to modify in place
 */
protected function strip_attr($attrib, $document)
{
	$finder = new DOMXPath($document);
	$carriers = $finder->query('//*[@' . $attrib . ']');

	foreach ($carriers as $node)
	{
		$node->removeAttribute($attrib);
	}
}