]> git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/simplepie/SimplePie/Sanitize.php
poche now uses Full Text RSS to fetch content
[github/wallabag/wallabag.git] / inc / 3rdparty / simplepie / SimplePie / Sanitize.php
1 <?php
2 /**
3 * SimplePie
4 *
5 * A PHP-Based RSS and Atom Feed Framework.
6 * Takes the hard work out of managing a complete RSS/Atom solution.
7 *
8 * Copyright (c) 2004-2009, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without modification, are
12 * permitted provided that the following conditions are met:
13 *
14 * * Redistributions of source code must retain the above copyright notice, this list of
15 * conditions and the following disclaimer.
16 *
17 * * Redistributions in binary form must reproduce the above copyright notice, this list
18 * of conditions and the following disclaimer in the documentation and/or other materials
19 * provided with the distribution.
20 *
21 * * Neither the name of the SimplePie Team nor the names of its contributors may be used
22 * to endorse or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
26 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
28 * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 *
35 * @package SimplePie
36 * @version 1.3-dev
37 * @copyright 2004-2010 Ryan Parman, Geoffrey Sneddon, Ryan McCue
38 * @author Ryan Parman
39 * @author Geoffrey Sneddon
40 * @author Ryan McCue
41 * @link http://simplepie.org/ SimplePie
42 * @license http://www.opensource.org/licenses/bsd-license.php BSD License
43 * @todo phpDoc comments
44 */
45
46
/**
 * Cleans feed-supplied text, HTML/XHTML fragments and IRIs before they are
 * returned to callers: strips configured tags and attributes, resolves
 * relative URLs, optionally caches images, escapes plain text and converts
 * the result to the configured output encoding.
 *
 * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
 */
class SimplePie_Sanitize
{
    // Private vars

    /** Base URL used by replace_urls(); set by sanitize() for HTML/XHTML input. */
    public $base;

    // Options
    public $remove_div = true;
    public $image_handler = '';
    public $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
    public $encode_instead_of_strip = false;
    public $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
    public $strip_comments = false;
    public $output_encoding = 'UTF-8';
    public $enable_cache = true;
    public $cache_location = './cache';
    public $cache_name_function = 'md5';
    public $cache_class = 'SimplePie_Cache';
    public $file_class = 'SimplePie_File';
    public $timeout = 10;
    public $useragent = '';
    public $force_fsockopen = false;

    /** Map of element name => attribute name (or list of names) holding URLs to absolutize. */
    public $replace_url_attributes = array(
        'a' => 'href',
        'area' => 'href',
        'blockquote' => 'cite',
        'del' => 'cite',
        'form' => 'action',
        'img' => array('longdesc', 'src'),
        'input' => 'src',
        'ins' => 'cite',
        'q' => 'cite'
    );

    /**
     * Choose whether the wrapping <div> of XHTML content is removed.
     *
     * @param bool $enable True to drop the wrapper, false to keep a bare <div>
     */
    public function remove_div($enable = true)
    {
        $this->remove_div = (bool) $enable;
    }

    /**
     * Set the prefix that cached image URLs are rewritten to.
     *
     * @param string|false $page Handler prefix; any falsy value disables image handling
     */
    public function set_image_handler($page = false)
    {
        $this->image_handler = $page ? (string) $page : false;
    }

    /**
     * Receive the cache settings used when caching images.
     *
     * Falsy location, name function or class arguments leave the current
     * value untouched.
     *
     * @param bool $enable_cache
     * @param string $cache_location
     * @param string $cache_name_function Callable name applied to the image URL
     * @param string $cache_class Class providing a static create() method
     */
    public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
    {
        if (isset($enable_cache))
        {
            $this->enable_cache = (bool) $enable_cache;
        }

        if ($cache_location)
        {
            $this->cache_location = (string) $cache_location;
        }

        if ($cache_name_function)
        {
            $this->cache_name_function = (string) $cache_name_function;
        }

        if ($cache_class)
        {
            $this->cache_class = (string) $cache_class;
        }
    }

    /**
     * Receive the HTTP settings used when fetching images for the cache.
     *
     * Falsy arguments leave the current value untouched.
     *
     * @param string $file_class Class instantiated to fetch a remote image
     * @param int|string $timeout
     * @param string $useragent
     * @param bool $force_fsockopen
     */
    public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
    {
        if ($file_class)
        {
            $this->file_class = (string) $file_class;
        }

        if ($timeout)
        {
            $this->timeout = (string) $timeout;
        }

        if ($useragent)
        {
            $this->useragent = (string) $useragent;
        }

        if ($force_fsockopen)
        {
            // This is a boolean flag (default false); keep it boolean rather than '1'.
            $this->force_fsockopen = (bool) $force_fsockopen;
        }
    }

    /**
     * Set the HTML tags that sanitize() removes (or encodes).
     *
     * @param array|string|false $tags List of tag names, a comma-separated
     *                                 string of them, or a falsy value to disable
     */
    public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
    {
        if (!$tags)
        {
            $this->strip_htmltags = false;
        }
        elseif (is_array($tags))
        {
            $this->strip_htmltags = $tags;
        }
        else
        {
            $this->strip_htmltags = $this->split_list($tags);
        }
    }

    /**
     * Choose between encoding stripped tags as text and removing them.
     *
     * @param bool $encode
     */
    public function encode_instead_of_strip($encode = false)
    {
        $this->encode_instead_of_strip = (bool) $encode;
    }

    /**
     * Set the attributes that sanitize() removes from every tag.
     *
     * @param array|string|false $attribs List of attribute names, a
     *                                    comma-separated string of them, or a
     *                                    falsy value to disable
     */
    public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
    {
        if (!$attribs)
        {
            $this->strip_attributes = false;
        }
        elseif (is_array($attribs))
        {
            $this->strip_attributes = $attribs;
        }
        else
        {
            $this->strip_attributes = $this->split_list($attribs);
        }
    }

    /**
     * Choose whether HTML comments are stripped from HTML/XHTML content.
     *
     * @param bool $strip
     */
    public function strip_comments($strip = false)
    {
        $this->strip_comments = (bool) $strip;
    }

    /**
     * Set the encoding that sanitize() converts its result to.
     *
     * @param string $encoding
     */
    public function set_output_encoding($encoding = 'UTF-8')
    {
        $this->output_encoding = (string) $encoding;
    }

    /**
     * Set element/attribute key/value pairs of HTML attributes
     * containing URLs that need to be resolved relative to the feed
     *
     * @access public
     * @since 1.0
     * @param array $element_attribute Element/attribute key/value pairs
     */
    public function set_url_replacements($element_attribute = array('a' => 'href', 'area' => 'href', 'blockquote' => 'cite', 'del' => 'cite', 'form' => 'action', 'img' => array('longdesc', 'src'), 'input' => 'src', 'ins' => 'cite', 'q' => 'cite'))
    {
        $this->replace_url_attributes = (array) $element_attribute;
    }

    /**
     * Sanitize a piece of feed data according to its construct type.
     *
     * @param string $data Raw value taken from the feed
     * @param int $type Bitmask of SIMPLEPIE_CONSTRUCT_* flags
     * @param string $base Base URL for resolving relative URLs
     * @return string The trimmed input when it is empty and not an IRI,
     *                otherwise the sanitized value
     */
    public function sanitize($data, $type, $base = '')
    {
        $data = trim($data);

        // Empty values need no work, except IRIs, which still get resolved against $base.
        if ($data === '' && !($type & SIMPLEPIE_CONSTRUCT_IRI))
        {
            return $data;
        }

        if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
        {
            // Treat the value as HTML when it contains an entity reference or a closing tag.
            $looks_like_html = preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data);
            $type |= $looks_like_html ? SIMPLEPIE_CONSTRUCT_HTML : SIMPLEPIE_CONSTRUCT_TEXT;
        }

        if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
        {
            $data = base64_decode($data);
        }

        if ($type & SIMPLEPIE_CONSTRUCT_XHTML)
        {
            if ($this->remove_div)
            {
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
                $data = preg_replace('/<\/div>$/', '', $data);
            }
            else
            {
                // Keep the wrapper but drop whatever attributes it carried.
                $data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
            }
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
        {
            // Strip comments
            if ($this->strip_comments)
            {
                $data = SimplePie_Misc::strip_comments($data);
            }

            // Strip out HTML tags and attributes that might cause various security problems.
            // Based on recommendations by Mark Pilgrim at:
            // http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
            if ($this->strip_htmltags)
            {
                foreach ($this->strip_htmltags as $tag)
                {
                    $pcre = "/<($tag)" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\/$tag" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\/)?>)/siU';
                    // Repeat until no occurrence is left, so nested occurrences are handled too.
                    while (preg_match($pcre, $data))
                    {
                        $data = preg_replace_callback($pcre, array($this, 'do_strip_htmltags'), $data);
                    }
                }
            }

            if ($this->strip_attributes)
            {
                foreach ($this->strip_attributes as $attrib)
                {
                    // Case-insensitive, like the tag pattern above: HTML attribute names are
                    // not case-sensitive, so "ONCLICK" must be removed just like "onclick".
                    $data = preg_replace('/(<[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\s*=\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/i', '\1\2\3>', $data);
                }
            }

            // Replace relative URLs
            $this->base = $base;
            foreach ($this->replace_url_attributes as $element => $attributes)
            {
                $data = $this->replace_urls($data, $element, $attributes);
            }

            // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
            if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
            {
                $data = $this->cache_images($data);
            }

            // Having (possibly) taken stuff out, there may now be whitespace at the beginning/end of the data
            $data = trim($data);
        }

        if ($type & SIMPLEPIE_CONSTRUCT_IRI)
        {
            $data = SimplePie_Misc::absolutize_url($data, $base);
        }

        if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
        {
            $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
        }

        if ($this->output_encoding !== 'UTF-8')
        {
            $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
        }

        return $data;
    }

    /**
     * Resolve the URL-bearing attributes of every $tag element against $this->base.
     *
     * Elements whose tag is configured to be stripped are left alone.
     *
     * @param string $data Markup to rewrite
     * @param string $tag Element name
     * @param string|array $attributes Attribute name, or list of names, holding URLs
     * @return string Markup with the matching attributes made absolute
     */
    public function replace_urls($data, $tag, $attributes)
    {
        if (is_array($this->strip_htmltags) && in_array($tag, $this->strip_htmltags))
        {
            return $data;
        }

        $elements = SimplePie_Misc::get_element($tag, $data);
        foreach ($elements as $element)
        {
            foreach ((array) $attributes as $attribute)
            {
                if (!isset($element['attribs'][$attribute]['data']))
                {
                    continue;
                }

                $element['attribs'][$attribute]['data'] = SimplePie_Misc::absolutize_url($element['attribs'][$attribute]['data'], $this->base);
                $new_element = SimplePie_Misc::element_implode($element);
                $data = str_replace($element['full'], $new_element, $data);
                // Track the rewritten markup so a second attribute of the same element still matches.
                $element['full'] = $new_element;
            }
        }

        return $data;
    }

    /**
     * preg_replace_callback() handler for the tag-stripping pattern built in sanitize().
     *
     * Capture groups: 1 = tag name, 2 = opening-tag attributes,
     * 3 = everything after the attributes, 4 = inner content (present for
     * paired tags).
     *
     * @param array $match Match array from preg_replace_callback()
     * @return string Replacement text for the matched element
     */
    public function do_strip_htmltags($match)
    {
        // The content of a paired element is kept, except for script/style bodies.
        $keep_content = isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style'));

        if (!$this->encode_instead_of_strip)
        {
            return $keep_content ? $match[4] : '';
        }

        if (!$keep_content)
        {
            return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
        }

        $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
        $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
        // Use group 4 (inner content). Group 3 also contains the leading ">" and the raw
        // closing tag, which would leave stray markup in the output.
        return "&lt;$match[1]$match[2]&gt;$match[4]&lt;/$match[1]&gt;";
    }

    /**
     * Fetch, cache and rewrite every <img> in $data to point at the image handler.
     *
     * Images that can be neither loaded from the cache nor fetched are left unchanged.
     *
     * @param string $data Markup to rewrite
     * @return string Markup with cached images pointing at $this->image_handler
     */
    protected function cache_images($data)
    {
        $images = SimplePie_Misc::get_element('img', $data);
        foreach ($images as $img)
        {
            if (!isset($img['attribs']['src']['data']))
            {
                continue;
            }

            $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']);
            $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi');

            if (!$cache->load())
            {
                // REMOTE_ADDR is absent outside a web request (CLI, cron); only forward it when known.
                $headers = isset($_SERVER['REMOTE_ADDR']) ? array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']) : array();
                $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen);

                // The bitmask test must be parenthesised: "===" binds tighter than "&".
                $is_local = ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE) === 0;
                $status_ok = $file->status_code === 200 || ($file->status_code > 206 && $file->status_code < 300);
                if (!$file->success || !($is_local || $status_ok))
                {
                    continue;
                }

                if (!$cache->save(array('headers' => $file->headers, 'body' => $file->body)))
                {
                    trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
                    continue;
                }
            }

            $img['attribs']['src']['data'] = $this->image_handler . $image_url;
            $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
        }

        return $data;
    }

    /**
     * Split a comma-separated list into trimmed, non-empty entries.
     *
     * @param string $list e.g. "script, style"
     * @return array e.g. array('script', 'style')
     */
    protected function split_list($list)
    {
        return array_values(array_filter(array_map('trim', explode(',', $list)), 'strlen'));
    }
}