5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
8 * Copyright (c) 2004-2009, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
14 * * Redistributions of source code must retain the above copyright notice, this list of
15 * conditions and the following disclaimer.
17 * * Redistributions in binary form must reproduce the above copyright notice, this list
18 * of conditions and the following disclaimer in the documentation and/or other materials
19 * provided with the distribution.
21 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * to endorse or promote products derived from this software without specific prior
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
37 * @copyright 2004-2010 Ryan Parman, Geoffrey Sneddon, Ryan McCue
39 * @author Geoffrey Sneddon
41 * @link http://simplepie.org/ SimplePie
42 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
43 * @todo phpDoc comments
48 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
50 class SimplePie_Sanitize
// Base URL that replace_urls() resolves relative attribute values against.
public $base;

// Options
// When true, sanitize() removes the wrapping <div> of XHTML constructs.
public $remove_div = true;
// Prefix prepended to the cache key when image src attributes are rewritten; '' or false disables it.
public $image_handler = '';
// Element names that sanitize() strips (or encodes); false disables tag stripping.
public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
// When true, stripped elements are HTML-encoded instead of removed.
public $encode_instead_of_strip = false;
// Attribute names that sanitize() removes from every element; false disables attribute stripping.
public $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
// When true, sanitize() passes markup through SimplePie_Misc::strip_comments().
public $strip_comments = false;
// Encoding of the data returned by sanitize(); anything other than 'UTF-8' triggers a conversion.
public $output_encoding = 'UTF-8';
// Image caching is only attempted while this is true.
public $enable_cache = true;
// Location handed to the cache class when a cache object is created.
public $cache_location = './cache';
// Callable applied to an image URL to derive its cache key.
public $cache_name_function = 'md5';
// Class whose static create() method builds cache objects.
public $cache_class = 'SimplePie_Cache';
// Class instantiated to download images.
public $file_class = 'SimplePie_File';
// Timeout handed to the file class constructor (default mirrors pass_file_data()).
public $timeout = 10;
// User agent handed to the file class constructor (default mirrors pass_file_data()).
public $useragent = '';
// Handed to the file class constructor as its last argument.
public $force_fsockopen = false;

// Map of element name => attribute name(s) holding URLs that sanitize()
// makes absolute. The defaults mirror those of set_url_replacements().
public $replace_url_attributes = array(
    'a' => 'href',
    'area' => 'href',
    'blockquote' => 'cite',
    'del' => 'cite',
    'form' => 'action',
    'img' => array('longdesc', 'src'),
    'input' => 'src',
    'ins' => 'cite',
    'q' => 'cite',
);
/**
 * Choose whether the wrapping <div> of XHTML constructs is removed while sanitizing.
 *
 * @param mixed $enable Any value; it is coerced to a boolean before being stored.
 */
public function remove_div($enable = true)
{
    $flag = (bool) $enable;
    $this->remove_div = $flag;
}
/**
 * Set the handler page that cached images are served through.
 *
 * sanitize() prepends this value to the cache key when it rewrites the src
 * attribute of <img> elements.
 *
 * @param string|false $page Handler prefix, or any falsy value to disable
 *                           image rewriting (stored as false).
 */
public function set_image_handler($page = false)
{
    // Only a truthy value enables the handler; everything else resets it to
    // false, which sanitize() treats the same as an empty string.
    $this->image_handler = $page ? (string) $page : false;
}
/**
 * Receive the cache settings used when sanitize() caches images.
 *
 * Each argument overrides the stored value only when it is usable, so an
 * empty argument keeps the previously configured setting.
 *
 * @param bool   $enable_cache        Whether caching is enabled (ignored when null).
 * @param string $cache_location      Location handed to the cache class.
 * @param mixed  $cache_name_function Callable that turns a URL into a cache key.
 * @param string $cache_class         Class whose static create() builds cache objects.
 */
public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
{
    // isset() rather than a truthiness test, so an explicit false can disable caching.
    if (isset($enable_cache))
    {
        $this->enable_cache = (bool) $enable_cache;
    }

    if ($cache_location)
    {
        $this->cache_location = (string) $cache_location;
    }

    if ($cache_name_function)
    {
        $this->cache_name_function = (string) $cache_name_function;
    }

    if ($cache_class)
    {
        $this->cache_class = (string) $cache_class;
    }
}
/**
 * Receive the settings used when sanitize() downloads images.
 *
 * Each argument overrides the stored value only when it is truthy, so an
 * empty argument keeps the previously configured setting.
 *
 * @param string $file_class      Class instantiated to fetch a remote file.
 * @param mixed  $timeout         Timeout handed to the file class constructor.
 * @param string $useragent       User agent handed to the file class constructor.
 * @param bool   $force_fsockopen Handed to the file class constructor as its last argument.
 */
public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
{
    if ($file_class)
    {
        $this->file_class = (string) $file_class;
    }

    if ($timeout)
    {
        // Kept as a string cast: an integer cast would truncate fractional timeouts.
        $this->timeout = (string) $timeout;
    }

    if ($useragent)
    {
        $this->useragent = (string) $useragent;
    }

    if ($force_fsockopen)
    {
        // The property is a boolean flag; store it as one rather than as '1'.
        $this->force_fsockopen = (bool) $force_fsockopen;
    }
}
/**
 * Set the list of elements that sanitize() strips from HTML and XHTML content.
 *
 * @param array|string|false $tags Array of element names, a comma-separated
 *                                 string of names, or a falsy value to
 *                                 disable tag stripping.
 */
public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
{
    if (!$tags)
    {
        $this->strip_htmltags = false;
        return;
    }

    if (!is_array($tags))
    {
        // sanitize() interpolates each name into a pattern verbatim, so
        // whitespace around the commas ("script, style") must be removed here.
        $tags = array_map('trim', explode(',', $tags));
    }

    $this->strip_htmltags = $tags;
}
/**
 * Choose whether unwanted elements are HTML-encoded rather than removed.
 *
 * @param mixed $encode Any value; it is coerced to a boolean before being stored.
 */
public function encode_instead_of_strip($encode = false)
{
    $flag = (bool) $encode;
    $this->encode_instead_of_strip = $flag;
}
/**
 * Set the list of attributes that sanitize() removes from every element.
 *
 * Names are not trimmed here; sanitize() trims each one when it builds the
 * matching pattern.
 *
 * @param array|string|false $attribs Array of attribute names, a
 *                                    comma-separated string of names, or a
 *                                    falsy value to disable attribute stripping.
 */
public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
{
    if (!$attribs)
    {
        $this->strip_attributes = false;
        return;
    }

    if (is_array($attribs))
    {
        $this->strip_attributes = $attribs;
    }
    else
    {
        $this->strip_attributes = explode(',', $attribs);
    }
}
/**
 * Choose whether comments are stripped from HTML and XHTML content.
 *
 * @param mixed $strip Any value; it is coerced to a boolean before being stored.
 */
public function strip_comments($strip = false)
{
    $flag = (bool) $strip;
    $this->strip_comments = $flag;
}
/**
 * Set the character encoding of the data returned by sanitize().
 *
 * @param mixed $encoding Encoding name; it is coerced to a string before being stored.
 */
public function set_output_encoding($encoding = 'UTF-8')
{
    $name = (string) $encoding;
    $this->output_encoding = $name;
}
/**
 * Define which HTML attributes hold URLs that must be resolved
 * relative to the feed.
 *
 * The map is keyed by element name; each value is either a single attribute
 * name or an array of attribute names for that element.
 *
 * @param array $element_attribute Element/attribute key/value pairs
 */
public function set_url_replacements($element_attribute = array('a' => 'href', 'area' => 'href', 'blockquote' => 'cite', 'del' => 'cite', 'form' => 'action', 'img' => array('longdesc', 'src'), 'input' => 'src', 'ins' => 'cite', 'q' => 'cite'))
{
    $pairs = (array) $element_attribute;
    $this->replace_url_attributes = $pairs;
}
/**
 * Sanitize a piece of feed data according to its construct type.
 *
 * Depending on the flags in $type the data is base64-decoded, unwrapped from
 * its XHTML <div>, stripped of unwanted comments, elements and attributes,
 * given absolute URLs, optionally rewritten to use cached images,
 * HTML-escaped, and finally converted to the configured output encoding.
 *
 * @param string $data Raw data to sanitize.
 * @param int    $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags.
 * @param string $base Base URL used to resolve relative URLs.
 * @return string The sanitized data.
 */
public function sanitize($data, $type, $base = '')
{
    $data = trim($data);
    if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
    {
        // "Maybe HTML": treat the data as HTML when it contains an entity
        // or a closing tag, otherwise as plain text.
        if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
        {
            if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
            {
                $type |= SIMPLEPIE_CONSTRUCT_HTML;
            }
            else
            {
                $type |= SIMPLEPIE_CONSTRUCT_TEXT;
            }
        }

        if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
        {
            $data = base64_decode($data);
        }

        if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
        {
            if ($this->remove_div)
            {
                // Drop the wrapping <div> entirely.
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                $data = preg_replace('/<\/div>$/', '', $data);
            }
            else
            {
                // Keep the wrapper but discard its attributes.
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
            }
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
        {
            // Strip comments
            if ($this->strip_comments)
            {
                $data = SimplePie_Misc::strip_comments($data);
            }

            // Strip out HTML tags and attributes that might cause various security problems.
            // Based on recommendations by Mark Pilgrim at:
            // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
            if ($this->strip_htmltags)
            {
                foreach ($this->strip_htmltags as $tag)
                {
                    $pcre = "/<($tag)" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\/$tag" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\/)?>)/siU';
                    // Repeat until nothing matches, because handling one
                    // occurrence can expose another (e.g. nested elements).
                    while (preg_match($pcre, $data))
                    {
                        $data = preg_replace_callback($pcre, array($this, 'do_strip_htmltags'), $data);
                    }
                }
            }

            if ($this->strip_attributes)
            {
                foreach ($this->strip_attributes as $attrib)
                {
                    $data = preg_replace('/(<[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\s*=\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\1\2\3>', $data);
                }
            }

            // Replace relative URLs
            // replace_urls() reads the base URL from $this->base, so publish it first.
            $this->base = $base;
            foreach ($this->replace_url_attributes as $element => $attributes)
            {
                $data = $this->replace_urls($data, $element, $attributes);
            }

            // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
            if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
            {
                // REMOTE_ADDR is not set outside a web request (CLI, cron);
                // send the forwarding header only when there is a client address.
                $request_headers = isset($_SERVER['REMOTE_ADDR']) ? array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']) : array();

                $images = SimplePie_Misc::get_element('img', $data);
                foreach ($images as $img)
                {
                    if (isset($img['attribs']['src']['data']))
                    {
                        $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']);
                        $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi');

                        if ($cache->load())
                        {
                            // Already cached: just point the tag at the handler.
                            $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                            $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                        }
                        else
                        {
                            $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, $request_headers, $this->useragent, $this->force_fsockopen);

                            // The bitmask test is parenthesized because === binds
                            // tighter than &; without the parentheses the
                            // "not a remote source" check could never be true.
                            if ($file->success && (($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0 || $file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300)))
                            {
                                if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
                                {
                                    $img['attribs']['src']['data'] = $this->image_handler . $image_url;
                                    $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
                                }
                                else
                                {
                                    trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                                }
                            }
                        }
                    }
                }
            }

            // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data
            $data = trim($data);
        }

        if ($type & SIMPLEPIE_CONSTRUCT_IRI)
        {
            $data = SimplePie_Misc::absolutize_url($data, $base);
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
        {
            $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
        }

        if ($this->output_encoding !== 'UTF-8')
        {
            $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
        }
    }
    return $data;
}
/**
 * Make the URL-bearing attributes of one kind of element absolute.
 *
 * Elements that are scheduled for stripping are left untouched. URLs are
 * resolved against $this->base.
 *
 * @param string       $data       Markup to process.
 * @param string       $tag        Element name to look for.
 * @param string|array $attributes Attribute name, or list of attribute names,
 *                                 whose values are URLs.
 * @return string The markup with the matching attribute values absolutized.
 */
public function replace_urls($data, $tag, $attributes)
{
    // Nothing to rewrite when the element itself is going to be stripped.
    if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
    {
        return $data;
    }

    $elements = SimplePie_Misc::get_element($tag, $data);
    foreach ($elements as $element)
    {
        // A single attribute name is handled as a one-item list.
        foreach ((array) $attributes as $attribute)
        {
            if (isset($element['attribs'][$attribute]['data']))
            {
                $element['attribs'][$attribute]['data'] = SimplePie_Misc::absolutize_url($element['attribs'][$attribute]['data'], $this->base);
                $new_element = SimplePie_Misc::element_implode($element);
                $data = str_replace($element['full'], $new_element, $data);
                // Remember the rewritten text so a later attribute of the
                // same element can still be located in $data.
                $element['full'] = $new_element;
            }
        }
    }

    return $data;
}
376 public function do_strip_htmltags($match)
378 if ($this->encode_instead_of_strip)
380 if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
382 $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
383 $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
384 return "<
;$match[1]$match[2]>
;$match[3]<
;/$match[1]>
;";
388 return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
391 elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))