]>
Commit | Line | Data |
---|---|---|
1 | <?php\r | |
2 | // Full-Text RSS: Create Full-Text Feeds\r | |
3 | // Author: Keyvan Minoukadeh\r | |
4 | // Copyright (c) 2013 Keyvan Minoukadeh\r | |
5 | // License: AGPLv3\r | |
6 | // Version: 3.1\r | |
7 | // Date: 2013-03-05\r | |
8 | // More info: http://fivefilters.org/content-only/\r | |
9 | // Help: http://help.fivefilters.org\r | |
10 | \r | |
11 | /*\r | |
12 | This program is free software: you can redistribute it and/or modify\r | |
13 | it under the terms of the GNU Affero General Public License as published by\r | |
14 | the Free Software Foundation, either version 3 of the License, or\r | |
15 | (at your option) any later version.\r | |
16 | \r | |
17 | This program is distributed in the hope that it will be useful,\r | |
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of\r | |
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r | |
20 | GNU Affero General Public License for more details.\r | |
21 | \r | |
22 | You should have received a copy of the GNU Affero General Public License\r | |
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.\r | |
24 | */\r | |
25 | \r | |
26 | // Usage\r | |
27 | // -----\r | |
28 | // Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org\r | |
29 | // The following options can be passed in the querystring:\r | |
30 | // * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))\r | |
31 | // * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)\r | |
32 | // * API key: key=[api key] (optional, refer to config.php)\r | |
33 | // * Max entries to process: max=[max number of items] (optional)\r | |
34 | \r | |
// Report everything except notices and show errors in the response.
error_reporting(E_ALL ^ E_NOTICE);
ini_set('display_errors', '1');
// Give the script up to 120 seconds; the @ silences the call if it complains.
@set_time_limit(120);

// Deal with magic quotes.
// get_magic_quotes_gpc() and each() no longer exist in PHP 8, so the function
// is only called where it is still defined and the work queue is walked by
// index instead of with each(). Nested arrays are appended to the queue by
// reference while walking; the loop bound is re-evaluated on every pass so
// those appended entries are processed too.
if (function_exists('get_magic_quotes_gpc') && @get_magic_quotes_gpc()) {
	$process = array(&$_GET, &$_POST, &$_REQUEST);
	for ($key = 0; $key < count($process); $key++) {
		// iterate over a copy so that rewriting $process[$key] below is safe
		$val = $process[$key];
		foreach ($val as $k => $v) {
			unset($process[$key][$k]);
			if (is_array($v)) {
				$process[$key][stripslashes($k)] = $v;
				$process[] = &$process[$key][stripslashes($k)];
			} else {
				$process[$key][stripslashes($k)] = stripslashes($v);
			}
		}
	}
	unset($process, $val);
}
55 | \r | |
// Prepend the bundled libraries directory to the include path.
$_app_dir = dirname(__FILE__);
set_include_path(realpath($_app_dir.'/libraries').PATH_SEPARATOR.get_include_path());

// Shared helper code lives in a separate file next to this script.
require_once $_app_dir.'/makefulltextfeedHelpers.php';

////////////////////////////////
// Load config file
////////////////////////////////
require $_app_dir.'/config.php';
unset($_app_dir);
65 | \r | |
////////////////////////////////
// Ask search engines not to index or follow what we output, because:
// 1. The content is already public and presumably indexed (why create duplicates?)
// 2. Otherwise search engine requests could add to the server load
// Note: feed readers and services such as Yahoo Pipes are not affected by this header.
// Note: a Disallow rule in robots.txt is more effective still (search engines check
// that before even requesting makefulltextfeed.php).
////////////////////////////////
header('X-Robots-Tag: noindex, nofollow');

////////////////////////////////
// Stop here if the service has been switched off in the config
////////////////////////////////
if (!$options->enabled) {
	exit('The full-text RSS service is currently disabled');
}
82 | \r | |
////////////////////////////////
// Debug mode?
// Only considered when the request carries a "debug" parameter.
// See the config file for debug options.
////////////////////////////////
$debug_mode = false;
if (isset($_GET['debug'])) {
	$_admin_only = false;
	if ($options->debug === true || $options->debug == 'user') {
		// debugging is available to any requester
		$debug_mode = true;
	} else {
		$_admin_only = ($options->debug == 'admin');
		if ($_admin_only) {
			// debugging requires an authenticated session
			session_start();
			$debug_mode = (@$_SESSION['auth'] == 1);
		}
	}
	if (!$debug_mode) {
		if ($_admin_only) {
			die('You must be logged in to the <a href="admin/">admin area</a> to see debug output.');
		}
		die('Debugging is disabled.');
	}
	// debug output is sent as plain text
	header('Content-Type: text/plain; charset=utf-8');
	unset($_admin_only);
}
105 | \r | |
////////////////////////////////
// Check for APC
// The config flag only stays on if the APC functions actually exist.
////////////////////////////////
$options->apc = $options->apc && function_exists('apc_add');
debug($options->apc
	? 'APC is enabled and available on server'
	: 'APC is disabled or not available on server');

////////////////////////////////
// Check for smart cache
// Needs apc_inc(), so it is switched off when that function is missing.
////////////////////////////////
$options->smart_cache = $options->smart_cache && function_exists('apc_inc');
120 | \r | |
////////////////////////////////
// Check for feed URL
// Reads the "url" parameter, normalises the scheme and validates the result.
// Ends the request with a short message when the URL is missing or invalid.
////////////////////////////////
// An array-valued parameter (url[]=...) is treated the same as a missing one,
// otherwise trim() below would be handed an array.
if (!isset($_GET['url']) || !is_string($_GET['url'])) {
	die('No URL supplied');
}
$url = trim($_GET['url']);
// feed:// is rewritten to plain http://
if (strtolower(substr($url, 0, 7)) == 'feed://') {
	$url = 'http://'.substr($url, 7);
}
// no http(s) scheme given: assume http
if (!preg_match('!^https?://.+!i', $url)) {
	$url = 'http://'.$url;
}

$url = filter_var($url, FILTER_SANITIZE_URL);
// FILTER_VALIDATE_URL already insists on a scheme; the old
// FILTER_FLAG_SCHEME_REQUIRED flag is not passed because the constant
// no longer exists in PHP 8.
$test = filter_var($url, FILTER_VALIDATE_URL);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2):
// retry with dashes swapped for underscores
if ($test === false) {
	$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
	// all okay
	unset($test);
} else {
	die('Invalid URL supplied');
}
debug("Supplied URL: $url");
148 | \r | |
/////////////////////////////////
// Redirect to hide API key
// When the request carries an actual API key, redirect to an equivalent URL
// that carries the key's index plus a SHA-1 of key+URL instead of the key.
/////////////////////////////////
// Strict string comparison, so numeric-looking strings such as "1e1" and "10"
// are no longer considered equal. Configured keys are cast to strings so keys
// written as integers in the config still match. Non-string input is ignored.
$key_index = false;
if (isset($_GET['key']) && is_string($_GET['key']) && is_array($options->api_keys)) {
	$key_index = array_search($_GET['key'], array_map('strval', $options->api_keys), true);
}
if ($key_index !== false) {
	$host = $_SERVER['HTTP_HOST'];
	$path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
	// drop a leading http:// from the url parameter to keep the redirect shorter
	$_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url;
	$redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url);
	$redirect .= '&key='.$key_index;
	$redirect .= '&hash='.urlencode(sha1($_GET['key'].$url));
	// carry over the remaining recognised parameters
	if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
	if (isset($_GET['max'])) $redirect .= '&max='.(int)$_GET['max'];
	if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
	if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
	if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
	if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
	if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
	if (isset($_GET['xss'])) $redirect .= '&xss';
	if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
	if (isset($_GET['debug'])) $redirect .= '&debug';
	if ($debug_mode) {
		// in debug mode the target is printed rather than followed
		debug('Redirecting to hide access key, follow URL below to continue');
		debug("Location: $redirect");
	} else {
		header("Location: $redirect");
	}
	exit;
}
177 | \r | |
///////////////////////////////////////////////
// Set timezone.
// Use the timezone from php.ini when there is one and it is accepted,
// otherwise fall back to UTC.
// Prevents warnings, but needs more testing -
// perhaps if timezone is set in php.ini we
// don't need to set it at all...
///////////////////////////////////////////////
$_ini_timezone = ini_get('date.timezone');
$_timezone_applied = $_ini_timezone ? @date_default_timezone_set($_ini_timezone) : false;
if (!$_timezone_applied) {
	date_default_timezone_set('UTC');
}
unset($_ini_timezone, $_timezone_applied);
187 | \r | |
///////////////////////////////////////////////
// Did the caller explicitly say the URL is an HTML page?
// True only for html=1 or html=true.
///////////////////////////////////////////////
$html_only = false;
if (isset($_GET['html'])) {
	$html_only = ($_GET['html'] == '1' || $_GET['html'] == 'true');
}
192 | \r | |
///////////////////////////////////////////////
// Check if valid key supplied
// At this point "key" is expected to be an index into $options->api_keys and
// "hash" the SHA-1 of that key concatenated with the URL.
///////////////////////////////////////////////
$valid_key = false;
if (isset($_GET['key'], $_GET['hash'])
	&& is_string($_GET['key'])
	&& is_string($_GET['hash'])
	&& isset($options->api_keys[(int)$_GET['key']])) {
	$_expected_hash = sha1($options->api_keys[(int)$_GET['key']].$url);
	// Compare as exact strings: a loose == treats two "0e<digits>" strings as
	// equal numbers. hash_equals() is used where available because it also
	// takes the same time wherever the first difference is.
	if (function_exists('hash_equals')) {
		$valid_key = hash_equals($_expected_hash, $_GET['hash']);
	} else {
		$valid_key = ($_GET['hash'] === $_expected_hash);
	}
	unset($_expected_hash);
}
$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
if (!$valid_key && $options->key_required) {
	die('A valid key must be supplied');
}
if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
	die('The entered key is invalid');
}

// optional site-specific initialisation
if (file_exists('custom_init.php')) require 'custom_init.php';

///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
///////////////////////////////////////////////
if (!url_allowed($url)) die('URL blocked');
214 | \r | |
///////////////////////////////////////////////
// Max entries
// see config.php to find these values
///////////////////////////////////////////////

/**
 * Work out how many feed items to process.
 *
 * A requested value below 1 is treated as "not supplied". Previously 0 or a
 * negative number passed straight through min() and on to get_items(), where
 * it no longer acted as an upper bound, so the configured limit could be
 * side-stepped.
 *
 * @param mixed $requested raw "max" request value, or null when absent
 * @param int   $ceiling   largest number of items the caller may ask for
 * @param int   $default   number of items to use when nothing usable was requested
 * @return int
 */
function ftr_resolve_max_entries($requested, $ceiling, $default)
{
	if ($requested === null) {
		return $default;
	}
	$requested = (int)$requested;
	if ($requested < 1) {
		return $default;
	}
	return min($requested, $ceiling);
}

// key holders get their own ceiling and default
$max = ftr_resolve_max_entries(
	isset($_GET['max']) ? $_GET['max'] : null,
	$valid_key ? $options->max_entries_with_key : $options->max_entries,
	$valid_key ? $options->default_entries_with_key : $options->default_entries
);
233 | \r | |
///////////////////////////////////////////////
// Link handling
// One of 'preserve', 'footnotes' or 'remove'; anything else means 'preserve'.
///////////////////////////////////////////////
$links = (isset($_GET['links']) && in_array($_GET['links'], array('preserve', 'footnotes', 'remove')))
	? $_GET['links']
	: 'preserve';
242 | \r | |
///////////////////////////////////////////////
// Favour item titles in feed?
// When the config value is the string 'user' the request decides: feed titles
// are kept unless use_extracted_title is present. Any other config value is
// used as it is.
///////////////////////////////////////////////
// Strict comparison, matching the other 'user' options in this script: with a
// loose == a config value of boolean true also compares equal to 'user',
// which handed the decision to the request.
if ($options->favour_feed_titles === 'user') {
	$favour_feed_titles = !isset($_GET['use_extracted_title']);
} else {
	$favour_feed_titles = $options->favour_feed_titles;
}
252 | \r | |
///////////////////////////////////////////////
// Exclude items if extraction fails
// Config value 'user' defers to the request (exc=1), otherwise the config
// value itself is used.
///////////////////////////////////////////////
$exclude_on_fail = $options->exclude_items_on_fail;
if ($exclude_on_fail === 'user') {
	$exclude_on_fail = (isset($_GET['exc']) && ($_GET['exc'] == '1'));
}
261 | \r | |
///////////////////////////////////////////////
// Detect language
// Config value 'user' defers to the request ("l" parameter, default 1),
// otherwise the config value itself is used.
///////////////////////////////////////////////
$detect_language = $options->detect_language;
if ($detect_language === 'user') {
	$detect_language = isset($_GET['l']) ? (int)$_GET['l'] : 1;
}

// language name => language code, only built for detection level 2 and above
if ($detect_language >= 2) {
	$language_codes = array(
		'albanian' => 'sq',
		'arabic' => 'ar',
		'azeri' => 'az',
		'bengali' => 'bn',
		'bulgarian' => 'bg',
		'cebuano' => 'ceb', // ISO 639-2
		'croatian' => 'hr',
		'czech' => 'cs',
		'danish' => 'da',
		'dutch' => 'nl',
		'english' => 'en',
		'estonian' => 'et',
		'farsi' => 'fa',
		'finnish' => 'fi',
		'french' => 'fr',
		'german' => 'de',
		'hausa' => 'ha',
		'hawaiian' => 'haw', // ISO 639-2
		'hindi' => 'hi',
		'hungarian' => 'hu',
		'icelandic' => 'is',
		'indonesian' => 'id',
		'italian' => 'it',
		'kazakh' => 'kk',
		'kyrgyz' => 'ky',
		'latin' => 'la',
		'latvian' => 'lv',
		'lithuanian' => 'lt',
		'macedonian' => 'mk',
		'mongolian' => 'mn',
		'nepali' => 'ne',
		'norwegian' => 'no',
		'pashto' => 'ps',
		'pidgin' => 'cpe', // ISO 639-2
		'polish' => 'pl',
		'portuguese' => 'pt',
		'romanian' => 'ro',
		'russian' => 'ru',
		'serbian' => 'sr',
		'slovak' => 'sk',
		'slovene' => 'sl',
		'somali' => 'so',
		'spanish' => 'es',
		'swahili' => 'sw',
		'swedish' => 'sv',
		'tagalog' => 'tl',
		'turkish' => 'tr',
		'ukrainian' => 'uk',
		'urdu' => 'ur',
		'uzbek' => 'uz',
		'vietnamese' => 'vi',
		'welsh' => 'cy'
	);
}
// the cld extension is only used on PHP 5.3.0 or newer
$use_cld = extension_loaded('cld') && version_compare(PHP_VERSION, '5.3.0', '>=');
285 | \r | |
/////////////////////////////////////
// Check for valid format
// (stick to RSS (or RSS as JSON) for the time being)
// Anything other than format=json yields RSS.
/////////////////////////////////////
$format = (isset($_GET['format']) && $_GET['format'] == 'json') ? 'json' : 'rss';
295 | \r | |
/////////////////////////////////////
// Should we do XSS filtering?
// Config value 'user' defers to the request (presence of "xss"), otherwise
// the config value itself is used.
/////////////////////////////////////
$xss_filter = $options->xss_filter;
if ($xss_filter === 'user') {
	$xss_filter = isset($_GET['xss']);
}
// the request asked for filtering but it is not on: refuse rather than
// return unfiltered output
if (isset($_GET['xss']) && !$xss_filter) {
	die('XSS filtering is disabled in config');
}
307 | \r | |
/////////////////////////////////////
// Check for JSONP
// Regex adapted from https://gist.github.com/1217080
/////////////////////////////////////

/**
 * Check that a JSONP callback is a dotted chain of plain identifiers, each
 * optionally followed by subscripts ([123], ["text"] or ['text']).
 *
 * Quoted subscripts may not contain their own quote character or a backslash.
 * The earlier pattern allowed any characters between the quotes, so a value
 * such as foo["a"];alert(1);//"] was accepted and echoed into the response.
 *
 * @param mixed $callback value to check
 * @return bool true when the value is an acceptable callback expression
 */
function ftr_is_valid_jsonp_callback($callback)
{
	if (!is_string($callback) || $callback === '') {
		return false;
	}
	foreach (explode('.', $callback) as $_identifier) {
		if (!preg_match('/^[a-zA-Z_$][0-9a-zA-Z_$]*(?:\[(?:"[^"\\\\]+"|\'[^\'\\\\]+\'|\d+)\])*$/D', $_identifier)) {
			return false;
		}
	}
	return true;
}

$callback = null;
if ($format == 'json' && isset($_GET['callback'])) {
	// an array-valued parameter is rejected by the validity check below
	$callback = is_string($_GET['callback']) ? trim($_GET['callback']) : null;
	if (!ftr_is_valid_jsonp_callback($callback)) {
		die('Invalid JSONP callback');
	}
	debug("JSONP callback: $callback");
}
322 | \r | |
//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
if ($options->cors) header('Access-Control-Allow-Origin: *');

//////////////////////////////////
// Check for cached copy
//////////////////////////////////

/**
 * Build the cache id for a combination of request options.
 *
 * Booleans are written as 0/1 and the parts are joined with a separator.
 * Plain concatenation turned false into an empty string, so different
 * combinations of flags (for example "feed titles on, XSS filter off" and
 * "feed titles off, XSS filter on") produced the same id.
 *
 * @param array $parts option values that influence the generated output
 * @return string 32 character hexadecimal id
 */
function ftr_build_cache_id($parts)
{
	$_normalised = array();
	foreach ($parts as $_part) {
		$_normalised[] = is_bool($_part) ? (string)(int)$_part : (string)$_part;
	}
	return md5(implode('|', $_normalised));
}

if ($options->caching) {
	debug('Caching is enabled...');
	// $html_only is part of the id because the same URL gives different output
	// depending on whether it is treated as a feed or as a single page.
	// NOTE(review): this changes the ids of existing cache files, and assumes the
	// code that stores the cache later in this script reuses $cache_id - confirm.
	$cache_id = ftr_build_cache_id(array(
		$max, $url, $html_only, $valid_key, $links, $favour_feed_titles,
		$xss_filter, $exclude_on_fail, $format, $detect_language, isset($_GET['pubsub'])
	));
	$check_cache = true;
	if ($options->apc && $options->smart_cache) {
		// APC holds a per-id request counter (created with a 10 minute lifetime);
		// the cache is only consulted once the counter has reached 2
		apc_add("cache.$cache_id", 0, 10*60);
		$apc_cache_hits = (int)apc_fetch("cache.$cache_id");
		$check_cache = ($apc_cache_hits >= 2);
		apc_inc("cache.$cache_id");
		if ($check_cache) {
			debug('Cache key found in APC, we\'ll try to load cache file from disk');
		} else {
			debug('Cache key not found in APC');
		}
	}
	if ($check_cache) {
		$cache = get_cache();
		if ($data = $cache->load($cache_id)) {
			if ($debug_mode) {
				debug('Loaded cached copy');
				exit;
			}
			// checked before header() is called, so no header is attempted
			// once output has already started
			if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
			if ($format == 'json') {
				if ($callback === null) {
					header('Content-type: application/json; charset=UTF-8');
				} else {
					header('Content-type: application/javascript; charset=UTF-8');
				}
			} else {
				header('Content-type: text/xml; charset=UTF-8');
				header('X-content-type-options: nosniff');
			}
			if ($callback) {
				echo "$callback($data);";
			} else {
				echo $data;
			}
			exit;
		}
	}
}
373 | \r | |
//////////////////////////////////
// Set Expires header
// Ten minutes from now; not sent for debug output.
//////////////////////////////////
if (!$debug_mode) {
	$_expires_at = time() + (60 * 10);
	header('Expires: '.gmdate('D, d M Y H:i:s', $_expires_at).' GMT');
	unset($_expires_at);
}

//////////////////////////////////
// Set up HTTP agent
// Configured from the options loaded earlier.
//////////////////////////////////
global $http;
$http = new HumbleHttpAgent();
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;

//////////////////////////////////
// Set up Content Extractor
// Given the custom and standard site config directories, in that order.
//////////////////////////////////
global $extractor;
$_site_config_dir = dirname(__FILE__).'/site_config';
$extractor = new ContentExtractor($_site_config_dir.'/custom', $_site_config_dir.'/standard');
unset($_site_config_dir);
$extractor->debug = $debug_mode;
SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
401 | \r | |
////////////////////////////////
// Get RSS/Atom feed
// Skipped when the caller said the URL is an HTML page.
////////////////////////////////
if (!$html_only) {
	debug('--------');
	debug("Attempting to process URL as feed");
	// Identify as PHP while fetching the feed (prevents a HTML response from feedburner)
	$http->userAgentDefault = HumbleHttpAgent::UA_PHP;
	// have SimplePie fetch through our HumbleHttpAgent instance
	SimplePie_HumbleHttpAgent::set_agent($http);
	$feed = new SimplePie();
	// some feeds are served as text/html - force_feed tells SimplePie to process anyway
	$feed->force_feed(true);
	$feed->set_file_class('SimplePie_HumbleHttpAgent');
	// assigned directly instead of via set_feed_url($url), which encodes colons
	// appearing in the URL's path
	$feed->feed_url = $url;
	$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
	$feed->set_timeout(20);
	$feed->enable_cache(false);
	$feed->set_stupidly_fast(true);
	// we don't want to do anything to the feed
	$feed->enable_order_by_date(false);
	$feed->set_url_replacements(array());
	// initialise the feed
	// the @ suppresses notices which on some servers causes a 500 internal server error
	$result = @$feed->init();
	// initialised, but there is no data to work with
	if ($result && !(is_array($feed->data) && count($feed->data) > 0)) {
		die('Sorry, no feed items found');
	}
	// from now on, we'll identify ourselves as a browser
	$http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
}
435 | \r | |
////////////////////////////////////////////////////////////////////////////////
// The given URL was not processed as a feed (either the caller asked for HTML
// or feed initialisation failed), so build our own feed with a single item:
// the given URL. All non-feed URLs are thus treated as single-item feeds.
////////////////////////////////////////////////////////////////////////////////
// $result only exists when a feed fetch was attempted; the || short-circuits
// before reading it otherwise
$isDummyFeed = ($html_only || !$result);
if ($isDummyFeed) {
	debug('--------');
	debug("Constructing a single-item feed from URL");
	unset($feed, $result);
	// create single item dummy feed object
	$feed = new DummySingleItemFeed($url);
}
450 | \r | |
////////////////////////////////////////////
// Create full-text feed
// Channel-level details are copied over from the source feed.
////////////////////////////////////////////
$output = new FeedWriter();
$output->setTitle(strip_tags($feed->get_title()));
$output->setDescription(strip_tags($feed->get_description()));
// Chrome uses this, most browsers ignore it
$output->setXsl('css/feed.xsl');
// used only on fivefilters.org at the moment
if ($valid_key && isset($_GET['pubsub'])) {
	foreach (array('http://fivefilters.superfeedr.com/', 'http://pubsubhubbub.appspot.com/') as $_hub) {
		$output->addHub($_hub);
	}
	unset($_hub);
	$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
}
// Google Reader uses this for pulling in favicons
$output->setLink($feed->get_link());
$img_url = $feed->get_image_url();
if ($img_url) {
	$output->setImage($feed->get_title(), $feed->get_link(), $img_url);
}
467 | \r | |
////////////////////////////////////////////
// Loop through feed items
////////////////////////////////////////////
$items = $feed->get_items(0, $max);

// Gather every item's URL so they can all be requested in parallel (if supported).
// $urls keeps one entry per item key; $urls_sanitized holds only the non-empty ones.
$urls = array();
$urls_sanitized = array();
foreach ($items as $key => $item) {
	// Colons in URL path segments get encoded by SimplePie, yet some sites expect
	// them unencoded. The URL is not passed through $http->validateUrl() here:
	// that strips non-ascii characters and SimplePie already sanitizes URLs.
	$permalink = str_replace('%3A', ':', htmlspecialchars_decode($item->get_permalink()));
	$urls[$key] = $permalink;
	if ($permalink) {
		$urls_sanitized[] = $permalink;
	}
}
debug('--------');
debug('Fetching feed items');
$http->fetchAll($urls_sanitized);

// count number of items added to full feed
$item_count = 0;
494 | \r | |
495 | foreach ($items as $key => $item) {\r | |
496 | debug('--------');\r | |
497 | debug('Processing feed item '.($item_count+1));\r | |
498 | $do_content_extraction = true;\r | |
499 | $extract_result = false;\r | |
500 | $text_sample = null;\r | |
501 | $permalink = $urls[$key];\r | |
502 | debug("Item URL: $permalink");\r | |
503 | $extracted_title = '';\r | |
504 | $feed_item_title = $item->get_title();\r | |
505 | if ($feed_item_title !== null) {\r | |
506 | $feed_item_title = strip_tags(htmlspecialchars_decode($feed_item_title));\r | |
507 | }\r | |
508 | $newitem = $output->createNewItem();\r | |
509 | $newitem->setTitle($feed_item_title);\r | |
510 | if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment\r | |
511 | if ($permalink !== false) {\r | |
512 | $newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($permalink));\r | |
513 | } else {\r | |
514 | $newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()));\r | |
515 | }\r | |
516 | } else {\r | |
517 | if ($permalink !== false) {\r | |
518 | $newitem->setLink($permalink);\r | |
519 | } else {\r | |
520 | $newitem->setLink($item->get_permalink());\r | |
521 | }\r | |
522 | }\r | |
523 | //if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {\r | |
524 | // Allowing error codes - some sites return correct content with error status\r | |
525 | // e.g. prospectmagazine.co.uk returns 403\r | |
526 | if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {\r | |
527 | $effective_url = $response['effective_url'];\r | |
528 | if (!url_allowed($effective_url)) continue;\r | |
529 | // check if action defined for returned Content-Type\r | |
530 | $mime_info = get_mime_action_info($response['headers']);\r | |
531 | if (isset($mime_info['action'])) {\r | |
532 | if ($mime_info['action'] == 'exclude') {\r | |
533 | continue; // skip this feed item entry\r | |
534 | } elseif ($mime_info['action'] == 'link') {\r | |
535 | if ($mime_info['type'] == 'image') {\r | |
536 | $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";\r | |
537 | } else {\r | |
538 | $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";\r | |
539 | }\r | |
540 | $extracted_title = $mime_info['name'];\r | |
541 | $do_content_extraction = false;\r | |
542 | }\r | |
543 | }\r | |
544 | if ($do_content_extraction) {\r | |
545 | $html = $response['body'];\r | |
546 | // remove strange things\r | |
547 | $html = str_replace('</[>', '', $html);\r | |
548 | $html = convert_to_utf8($html, $response['headers']);\r | |
549 | // check site config for single page URL - fetch it if found\r | |
550 | $is_single_page = false;\r | |
551 | if ($single_page_response = getSinglePage($item, $html, $effective_url)) {\r | |
552 | $is_single_page = true;\r | |
553 | $html = $single_page_response['body'];\r | |
554 | // remove strange things\r | |
555 | $html = str_replace('</[>', '', $html); \r | |
556 | $html = convert_to_utf8($html, $single_page_response['headers']);\r | |
557 | $effective_url = $single_page_response['effective_url'];\r | |
558 | debug("Retrieved single-page view from $effective_url");\r | |
559 | unset($single_page_response);\r | |
560 | }\r | |
561 | debug('--------');\r | |
562 | debug('Attempting to extract content');\r | |
563 | $extract_result = $extractor->process($html, $effective_url);\r | |
564 | $readability = $extractor->readability;\r | |
565 | $content_block = ($extract_result) ? $extractor->getContent() : null; \r | |
566 | $extracted_title = ($extract_result) ? $extractor->getTitle() : '';\r | |
567 | // Deal with multi-page articles\r | |
568 | //die('Next: '.$extractor->getNextPageUrl());\r | |
569 | $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());\r | |
570 | if ($options->multipage && $is_multi_page) {\r | |
571 | debug('--------');\r | |
572 | debug('Attempting to process multi-page article');\r | |
573 | $multi_page_urls = array();\r | |
574 | $multi_page_content = array();\r | |
575 | while ($next_page_url = $extractor->getNextPageUrl()) {\r | |
576 | debug('--------');\r | |
577 | debug('Processing next page: '.$next_page_url);\r | |
578 | // If we've got URL, resolve against $url\r | |
579 | if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) {\r | |
580 | // check it's not what we have already!\r | |
581 | if (!in_array($next_page_url, $multi_page_urls)) {\r | |
582 | // it's not, so let's attempt to fetch it\r | |
583 | $multi_page_urls[] = $next_page_url; \r | |
584 | $_prev_ref = $http->referer;\r | |
585 | if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {\r | |
586 | // make sure mime type is not something with a different action associated\r | |
587 | $page_mime_info = get_mime_action_info($response['headers']);\r | |
588 | if (!isset($page_mime_info['action'])) {\r | |
589 | $html = $response['body'];\r | |
590 | // remove strange things\r | |
591 | $html = str_replace('</[>', '', $html);\r | |
592 | $html = convert_to_utf8($html, $response['headers']);\r | |
593 | if ($extractor->process($html, $next_page_url)) {\r | |
594 | $multi_page_content[] = $extractor->getContent();\r | |
595 | continue;\r | |
596 | } else { debug('Failed to extract content'); }\r | |
597 | } else { debug('MIME type requires different action'); }\r | |
598 | } else { debug('Failed to fetch URL'); }\r | |
599 | } else { debug('URL already processed'); }\r | |
600 | } else { debug('Failed to resolve against '.$effective_url); }\r | |
601 | // failed to process next_page_url, so cancel further requests\r | |
602 | $multi_page_content = array();\r | |
603 | break;\r | |
604 | }\r | |
605 | // did we successfully deal with this multi-page article?\r | |
606 | if (empty($multi_page_content)) {\r | |
607 | debug('Failed to extract all parts of multi-page article, so not going to include them');\r | |
608 | $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';\r | |
609 | }\r | |
610 | foreach ($multi_page_content as $_page) {\r | |
611 | $_page = $content_block->ownerDocument->importNode($_page, true);\r | |
612 | $content_block->appendChild($_page);\r | |
613 | }\r | |
614 | unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url);\r | |
615 | }\r | |
616 | }\r | |
617 | // use extracted title for both feed and item title if we're using single-item dummy feed\r | |
618 | if ($isDummyFeed) {\r | |
619 | $output->setTitle($extracted_title);\r | |
620 | $newitem->setTitle($extracted_title);\r | |
621 | } else {\r | |
622 | // use extracted title instead of feed item title?\r | |
623 | if (!$favour_feed_titles && $extracted_title != '') {\r | |
624 | debug('Using extracted title in generated feed');\r | |
625 | $newitem->setTitle($extracted_title);\r | |
626 | }\r | |
627 | }\r | |
628 | }\r | |
629 | if ($do_content_extraction) {\r | |
630 | // extraction failed: either drop the item (when $exclude_on_fail is set) or fall back to the configured error message followed by the feed's own description\r | |
631 | if (!$extract_result) {\r | |
632 | if ($exclude_on_fail) {\r | |
633 | debug('Failed to extract, so skipping (due to exclude on fail parameter)');\r | |
634 | continue; // skip this and move to next item\r | |
635 | }\r | |
636 | //TODO: get text sample for language detection ($text_sample is not assigned anywhere on this failure path)\r | |
637 | $html = $options->error_message;\r | |
638 | // append the original item description after the error message\r | |
639 | $html .= $item->get_description();\r | |
640 | } else {\r | |
641 | $readability->clean($content_block, 'select');\r | |
642 | if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);\r | |
643 | // footnotes: added only when $links is 'footnotes' and the effective URL does not contain 'wikipedia.org'\r | |
644 | if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {\r | |
645 | $readability->addFootnotes($content_block);\r | |
646 | }\r | |
647 | // remove nesting: descend while the block has exactly one child and that child is an element, e.g. <div><div><div><p>test</p></div></div></div> = <p>test</p>\r | |
648 | while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {\r | |
649 | // only unwrap when the current block is one of these tag names; stop descending at anything else\r | |
650 | if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) break;\r | |
651 | //$html = $content_block->firstChild->innerHTML; // FTR 2.9.5\r | |
652 | $content_block = $content_block->firstChild;\r | |
653 | }\r | |
654 | // convert content block to HTML string: inner HTML for the container tags listed below, otherwise the element itself is serialized\r | |
655 | // (serializing the element itself is needed to preserve things like body: //img[@id='feature'])\r | |
656 | if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {\r | |
657 | $html = $content_block->innerHTML;\r | |
658 | } else {\r | |
659 | $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML\r | |
660 | }\r | |
661 | unset($content_block);\r | |
662 | // post-processing cleanup: drop <p> elements that contain only whitespace, and strip <a> tags (keeping their inner text) when $links is 'remove'\r | |
663 | $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);\r | |
664 | if ($links == 'remove') {\r | |
665 | $html = preg_replace('!</?a[^>]*>!', '', $html);\r | |
666 | }\r | |
667 | // get text sample for language detection: the first 500 units of $html as counted by substr(), with tags stripped; taken before the prepend/append messages are added\r | |
668 | $text_sample = strip_tags(substr($html, 0, 500));\r | |
669 | $html = make_substitutions($options->message_to_prepend).$html;\r | |
670 | $html .= make_substitutions($options->message_to_append);\r | |
671 | }\r | |
672 | }\r | |
673 | \r | |
674 | if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment: guid becomes a redirect URL wrapping the URL-encoded permalink (isPermaLink=false); otherwise the item permalink itself is the guid\r | |
675 | $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));\r | |
676 | } else {\r | |
677 | $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));\r | |
678 | }\r | |
679 | // filter xss: when $xss_filter is set, pass the HTML through htmLawed with the 'safe' option and style attributes denied; the result (filtered or not) becomes the item description\r | |
680 | if ($xss_filter) {\r | |
681 | debug('Filtering HTML to remove XSS');\r | |
682 | $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));\r | |
683 | }\r | |
684 | $newitem->setDescription($html);\r | |
685 | \r | |
686 | // set date: prefer the feed item's Unix timestamp when it is positive; otherwise use $extractor->getDate() if it returns a truthy value (no date is set when neither is available)\r | |
687 | if ((int)$item->get_date('U') > 0) {\r | |
688 | $newitem->setDate((int)$item->get_date('U'));\r | |
689 | } elseif ($extractor->getDate()) {\r | |
690 | $newitem->setDate($extractor->getDate());\r | |
691 | }\r | |
692 | \r | |
693 | // add authors: one dc:creator per feed-supplied author (name, else email); if the feed supplies none, use only the first entry from $extractor->getAuthors()\r | |
694 | if ($authors = $item->get_authors()) {\r | |
695 | foreach ($authors as $author) {\r | |
696 | // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel\r | |
697 | if ($author->get_name() !== null) {\r | |
698 | $newitem->addElement('dc:creator', $author->get_name());\r | |
699 | } elseif ($author->get_email() !== null) {\r | |
700 | $newitem->addElement('dc:creator', $author->get_email());\r | |
701 | }\r | |
702 | }\r | |
703 | } elseif ($authors = $extractor->getAuthors()) {\r | |
704 | //TODO: make sure the list size is reasonable\r | |
705 | foreach ($authors as $author) {\r | |
706 | // TODO: xpath often selects authors from other articles linked from the page.\r | |
707 | // for now choose the first item only (the loop breaks after one iteration)\r | |
708 | $newitem->addElement('dc:creator', $author);\r | |
709 | break;\r | |
710 | }\r | |
711 | }\r | |
712 | \r | |
713 | // add language: start from $extractor->getLanguage(), then the feed language; run text-based detection when $detect_language is 3, or when it is 2 and no language was found, provided $text_sample is non-empty; only values shorter than 7 characters are emitted as dc:language\r | |
714 | if ($detect_language) {\r | |
715 | $language = $extractor->getLanguage();\r | |
716 | if (!$language) $language = $feed->get_language();\r | |
717 | if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {\r | |
718 | try {\r | |
719 | if ($use_cld) {\r | |
720 | // Use PHP-CLD extension; the 'code' of the first result replaces $language\r | |
721 | $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error\r | |
722 | $res = $php_cld($text_sample);\r | |
723 | if (is_array($res) && count($res) > 0) {\r | |
724 | $language = $res[0]['code'];\r | |
725 | } \r | |
726 | } else {\r | |
727 | //die('what');\r | |
728 | // Use PEAR's Text_LanguageDetect; the detector is created once and reused via $l, and the top result's key is mapped through $language_codes\r | |
729 | if (!isset($l)) {\r | |
730 | $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');\r | |
731 | }\r | |
732 | $l_result = $l->detect($text_sample, 1);\r | |
733 | if (count($l_result) > 0) {\r | |
734 | $language = $language_codes[key($l_result)];\r | |
735 | }\r | |
736 | }\r | |
737 | } catch (Exception $e) {\r | |
738 | //die('error: '.$e); \r | |
739 | // do nothing: detection exceptions are deliberately ignored, leaving $language as it was at the point of failure\r | |
740 | }\r | |
741 | }\r | |
742 | if ($language && (strlen($language) < 7)) { \r | |
743 | $newitem->addElement('dc:language', $language);\r | |
744 | }\r | |
745 | }\r | |
746 | \r | |
747 | // add MIME type as dc:format, only when $mime_info['mime'] is set (i.e. if it appeared in our exclusion lists)\r | |
748 | if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);\r | |
749 | // add effective URL (URL after redirects)\r | |
750 | if (isset($effective_url)) {\r | |
751 | //TODO: ensure $effective_url is valid without - sometimes it causes problems, e.g.\r | |
752 | Content-type: text/html ]>