From 3ec62cf95ab4436923d4c665fad7aef226cbb822 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 22 May 2014 17:16:38 +0300 Subject: update to 3.2 version of full-text-rss, issue #694 --- inc/3rdparty/makefulltextfeed.php | 349 ++++++++++++++++++++++---------------- 1 file changed, 203 insertions(+), 146 deletions(-) (limited to 'inc/3rdparty/makefulltextfeed.php') diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php index 4faad6d9..7a56be8c 100755 --- a/inc/3rdparty/makefulltextfeed.php +++ b/inc/3rdparty/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh // License: AGPLv3 -// Version: 3.1 -// Date: 2013-03-05 +// Version: 3.2 +// Date: 2013-05-13 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -25,12 +25,8 @@ along with this program. If not, see . // Usage // ----- -// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org -// The following options can be passed in the querystring: -// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url)) -// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected) -// * API key: key=[api key] (optional, refer to config.php) -// * Max entries to process: max=[max number of items] (optional) +// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article +// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage error_reporting(E_ALL ^ E_NOTICE); ini_set("display_errors", 1); @@ -76,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow'); //////////////////////////////// // Check if service is enabled //////////////////////////////// -if (!$options->enabled) { - die('The full-text RSS service is currently disabled'); +if (!$options->enabled) { + die('The full-text RSS service is currently disabled'); } //////////////////////////////// @@ -121,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc'); //////////////////////////////// // Check for feed URL //////////////////////////////// -if (!isset($_GET['url'])) { - die('No URL supplied'); +if (!isset($_GET['url'])) { + die('No URL supplied'); } $url = trim($_GET['url']); if (strtolower(substr($url, 0, 7)) == 'feed://') { @@ -161,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); - if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); + if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']); if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); if (isset($_GET['xss'])) $redirect .= '&xss'; if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; + if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']); + if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']); if (isset($_GET['debug'])) $redirect .= '&debug'; if ($debug_mode) { debug('Redirecting to hide access key, follow URL below to continue'); @@ -177,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap /////////////////////////////////////////////// // Set timezone. -// Prevents warnings, but needs more testing - +// Prevents warnings, but needs more testing - // perhaps if timezone is set in php.ini we // don't need to set it at all... /////////////////////////////////////////////// @@ -199,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int } $key_index = ($valid_key) ? (int)$_GET['key'] : 0; if (!$valid_key && $options->key_required) { - die('A valid key must be supplied'); + die('A valid key must be supplied'); } if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') { die('The entered key is invalid'); @@ -250,6 +248,28 @@ if ($options->favour_feed_titles == 'user') { $favour_feed_titles = $options->favour_feed_titles; } +/////////////////////////////////////////////// +// Include full content in output? +/////////////////////////////////////////////// +if ($options->content === 'user') { + if (isset($_GET['content']) && $_GET['content'] === '0') { + $options->content = false; + } else { + $options->content = true; + } +} + +/////////////////////////////////////////////// +// Include summaries in output? +/////////////////////////////////////////////// +if ($options->summary === 'user') { + if (isset($_GET['summary']) && $_GET['summary'] === '1') { + $options->summary = true; + } else { + $options->summary = false; + } +} + /////////////////////////////////////////////// // Exclude items if extraction fails /////////////////////////////////////////////// @@ -272,15 +292,6 @@ if ($options->detect_language === 'user') { $detect_language = $options->detect_language; } -if ($detect_language >= 2) { - $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg', - 'cebuano' => 'ceb', // ISO 639-2 - 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha', - 'hawaiian' => 'haw', // ISO 639-2 - 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps', - 'pidgin' => 'cpe', // ISO 639-2 - 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy'); -} $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); ///////////////////////////////////// @@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *'); ////////////////////////////////// if ($options->caching) { debug('Caching is enabled...'); - $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); + $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); $check_cache = true; if ($options->apc && $options->smart_cache) { apc_add("cache.$cache_id", 0, 10*60); @@ -468,7 +479,7 @@ if ($img_url = $feed->get_image_url()) { //////////////////////////////////////////// // Loop through feed items //////////////////////////////////////////// -$items = $feed->get_items(0, $max); +$items = $feed->get_items(0, $max); // Request all feed items in parallel (if supported) $urls_sanitized = array(); $urls = array(); @@ -550,24 +561,43 @@ foreach ($items as $key => $item) { $is_single_page = false; if ($single_page_response = getSinglePage($item, $html, $effective_url)) { $is_single_page = true; - $html = $single_page_response['body']; - // remove strange things - $html = str_replace('', '', $html); - $html = convert_to_utf8($html, $single_page_response['headers']); $effective_url = $single_page_response['effective_url']; - debug("Retrieved single-page view from $effective_url"); + // check if action defined for returned Content-Type + $mime_info = get_mime_action_info($single_page_response['headers']); + if (isset($mime_info['action'])) { + if ($mime_info['action'] == 'exclude') { + continue; // skip this feed item entry + } elseif ($mime_info['action'] == 'link') { + if ($mime_info['type'] == 'image') { + $html = "\"{$mime_info['name']}\""; + } else { + $html = "Download {$mime_info['name']}"; + } + $extracted_title = $mime_info['name']; + $do_content_extraction = false; + } + } + if ($do_content_extraction) { + $html = $single_page_response['body']; + // remove strange things + $html = str_replace('', '', $html); + $html = convert_to_utf8($html, $single_page_response['headers']); + debug("Retrieved single-page view from $effective_url"); + } unset($single_page_response); } + } + if ($do_content_extraction) { debug('--------'); debug('Attempting to extract content'); $extract_result = $extractor->process($html, $effective_url); $readability = $extractor->readability; - $content_block = ($extract_result) ? $extractor->getContent() : null; + $content_block = ($extract_result) ? $extractor->getContent() : null; $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; // Deal with multi-page articles //die('Next: '.$extractor->getNextPageUrl()); $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); - if ($options->multipage && $is_multi_page) { + if ($options->multipage && $is_multi_page && $options->content) { debug('--------'); debug('Attempting to process multi-page article'); $multi_page_urls = array(); @@ -580,7 +610,7 @@ foreach ($items as $key => $item) { // check it's not what we have already! if (!in_array($next_page_url, $multi_page_urls)) { // it's not, so let's attempt to fetch it - $multi_page_urls[] = $next_page_url; + $multi_page_urls[] = $next_page_url; $_prev_ref = $http->referer; if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) { // make sure mime type is not something with a different action associated @@ -605,13 +635,15 @@ foreach ($items as $key => $item) { // did we successfully deal with this multi-page article? if (empty($multi_page_content)) { debug('Failed to extract all parts of multi-page article, so not going to include them'); - $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = 'This article appears to continue on subsequent pages which we could not extract'; + $_page = $readability->dom->createElement('p'); + $_page->innerHTML = 'This article appears to continue on subsequent pages which we could not extract'; + $multi_page_content[] = $_page; } foreach ($multi_page_content as $_page) { $_page = $content_block->ownerDocument->importNode($_page, true); $content_block->appendChild($_page); } - unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); + unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page); } } // use extracted title for both feed and item title if we're using single-item dummy feed @@ -658,7 +690,7 @@ foreach ($items as $key => $item) { } else { $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML } - unset($content_block); + //unset($content_block); // post-processing cleanup $html = preg_replace('!

[\s\h\v]*

!u', '', $html); if ($links == 'remove') { @@ -671,130 +703,155 @@ foreach ($items as $key => $item) { } } - if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment - $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); + if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment + $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); + } else { + $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); + } + // filter xss? + if ($xss_filter) { + debug('Filtering HTML to remove XSS'); + $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); + } + + // add content + if ($options->summary === true) { + // get summary + $summary = ''; + if (!$do_content_extraction) { + $summary = $html; } else { - $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); - } - // filter xss? - if ($xss_filter) { - debug('Filtering HTML to remove XSS'); - $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); - } - $newitem->setDescription($html); - - // set date - if ((int)$item->get_date('U') > 0) { - $newitem->setDate((int)$item->get_date('U')); - } elseif ($extractor->getDate()) { - $newitem->setDate($extractor->getDate()); - } - - // add authors - if ($authors = $item->get_authors()) { - foreach ($authors as $author) { - // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel - if ($author->get_name() !== null) { - $newitem->addElement('dc:creator', $author->get_name()); - } elseif ($author->get_email() !== null) { - $newitem->addElement('dc:creator', $author->get_email()); + // Try to get first few paragraphs + if (isset($content_block) && ($content_block instanceof DOMElement)) { + $_paras = $content_block->getElementsByTagName('p'); + foreach ($_paras as $_para) { + $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' '; + if (strlen($summary) > 200) break; } + } else { + $summary = $html; } - } elseif ($authors = $extractor->getAuthors()) { - //TODO: make sure the list size is reasonable - foreach ($authors as $author) { - // TODO: xpath often selects authors from other articles linked from the page. - // for now choose first item - $newitem->addElement('dc:creator', $author); - break; + } + unset($_paras, $_para); + $summary = get_excerpt($summary); + $newitem->setDescription($summary); + if ($options->content) $newitem->setElement('content:encoded', $html); + } else { + if ($options->content) $newitem->setDescription($html); + } + + // set date + if ((int)$item->get_date('U') > 0) { + $newitem->setDate((int)$item->get_date('U')); + } elseif ($extractor->getDate()) { + $newitem->setDate($extractor->getDate()); + } + + // add authors + if ($authors = $item->get_authors()) { + foreach ($authors as $author) { + // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel + if ($author->get_name() !== null) { + $newitem->addElement('dc:creator', $author->get_name()); + } elseif ($author->get_email() !== null) { + $newitem->addElement('dc:creator', $author->get_email()); } } - - // add language - if ($detect_language) { - $language = $extractor->getLanguage(); - if (!$language) $language = $feed->get_language(); - if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { - try { - if ($use_cld) { - // Use PHP-CLD extension - $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error - $res = $php_cld($text_sample); - if (is_array($res) && count($res) > 0) { - $language = $res[0]['code']; - } - } else { - //die('what'); - // Use PEAR's Text_LanguageDetect - if (!isset($l)) { - $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); - } - $l_result = $l->detect($text_sample, 1); - if (count($l_result) > 0) { - $language = $language_codes[key($l_result)]; - } + } elseif ($authors = $extractor->getAuthors()) { + //TODO: make sure the list size is reasonable + foreach ($authors as $author) { + // TODO: xpath often selects authors from other articles linked from the page. + // for now choose first item + $newitem->addElement('dc:creator', $author); + break; + } + } + + // add language + if ($detect_language) { + $language = $extractor->getLanguage(); + if (!$language) $language = $feed->get_language(); + if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { + try { + if ($use_cld) { + // Use PHP-CLD extension + $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error + $res = $php_cld($text_sample); + if (is_array($res) && count($res) > 0) { + $language = $res[0]['code']; + } + } else { + //die('what'); + // Use PEAR's Text_LanguageDetect + if (!isset($l)) { + $l = new Text_LanguageDetect(); + $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en") + } + $l_result = $l->detect($text_sample, 1); + if (count($l_result) > 0) { + $language = key($l_result); } - } catch (Exception $e) { - //die('error: '.$e); - // do nothing } - } - if ($language && (strlen($language) < 7)) { - $newitem->addElement('dc:language', $language); + } catch (Exception $e) { + //die('error: '.$e); + // do nothing } } - - // add MIME type (if it appeared in our exclusions lists) - if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); - // add effective URL (URL after redirects) - if (isset($effective_url)) { - //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. - //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi) - //temporary measure: use utf8_encode() - $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); - } else { - $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); + if ($language && (strlen($language) < 7)) { + $newitem->addElement('dc:language', $language); } - - // add categories - if ($categories = $item->get_categories()) { - foreach ($categories as $category) { - if ($category->get_label() !== null) { - $newitem->addElement('category', $category->get_label()); - } + } + + // add MIME type (if it appeared in our exclusions lists) + if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); + // add effective URL (URL after redirects) + if (isset($effective_url)) { + //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. + //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi) + //temporary measure: use utf8_encode() + $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); + } else { + $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); + } + + // add categories + if ($categories = $item->get_categories()) { + foreach ($categories as $category) { + if ($category->get_label() !== null) { + $newitem->addElement('category', $category->get_label()); } } - - // check for enclosures - if ($options->keep_enclosures) { - if ($enclosures = $item->get_enclosures()) { - foreach ($enclosures as $enclosure) { - // thumbnails - foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { - $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); - } - if (!$enclosure->get_link()) continue; - $enc = array(); - // Media RSS spec ($enc): http://search.yahoo.com/mrss - // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 - $enc['url'] = $enclosure->get_link(); - if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); - if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type(); - if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); - if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); - if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); - if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); - if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); - if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); - if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); - if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height(); - if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); - if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); - $newitem->addElement('media:content', '', $enc); + } + + // check for enclosures + if ($options->keep_enclosures) { + if ($enclosures = $item->get_enclosures()) { + foreach ($enclosures as $enclosure) { + // thumbnails + foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { + $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); } + if (!$enclosure->get_link()) continue; + $enc = array(); + // Media RSS spec ($enc): http://search.yahoo.com/mrss + // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 + $enc['url'] = $enclosure->get_link(); + if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); + if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type(); + if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); + if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); + if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); + if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); + if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); + if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); + if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); + if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height(); + if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); + if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); + $newitem->addElement('media:content', '', $enc); } } - /* } */ + } $output->addItem($newitem); unset($html); $item_count++; -- cgit v1.2.3