+ // Configure the HTTP agent from this object's settings.
+ $http = new HumbleHttpAgent();
+ $http->userAgentMap = $this->user_agents;
+ $http->headerOnlyTypes = array_keys($this->content_type_exc);
+ $http->rewriteUrls = $this->rewrite_url;
+ // identify as PHP while the URL is loaded as a feed
+ $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
+
+ // Make the SimplePie HTTP extension class use the agent configured above.
+ SimplePie_HumbleHttpAgent::set_agent($http);
+
+ $feed = new SimplePie();
+ // force_feed: process the response even if it is served as text/html
+ $feed->force_feed(true);
+ $feed->set_file_class('SimplePie_HumbleHttpAgent');
+ $feed->feed_url = $this->url;
+ $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
+ $feed->set_timeout(20);
+ $feed->enable_cache(false);
+ $feed->set_stupidly_fast(true);
+ // leave the feed untouched: no reordering by date
+ $feed->enable_order_by_date(false);
+ $feed->set_url_replacements(array());
+
+ // Initialise the feed. The @ silences notices, which cause a
+ // 500 internal server error on some servers.
+ $result = @$feed->init();
+ if ($result) {
+     // initialisation reported success but produced no data: give up
+     if (!is_array($feed->data) || count($feed->data) == 0) {
+         die('Sorry, no feed items found');
+     }
+ }
+
+ // Subsequent requests identify themselves as a browser.
+ $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
+ unset($feed);
+ unset($result);
+
+ // Wrap the URL in a single-item feed and take its first item.
+ $feed = new DummySingleItemFeed($this->url);
+ $items = $feed->get_items(0, 1);
+
+ // $urls keeps one entry per item (possibly empty), keyed like $items;
+ // $urls_sanitized holds only the non-empty permalinks to fetch.
+ $urls = array();
+ $urls_sanitized = array();
+ foreach ($items as $idx => $entry) {
+     // SimplePie encodes colons in URL path segments, yet some sites expect them unencoded
+     $link = str_replace('%3A', ':', htmlspecialchars_decode($entry->get_permalink()));
+     $urls[$idx] = $link;
+     if (!$link) {
+         continue;
+     }
+     $urls_sanitized[] = $link;
+ }
+
+ // Request all item URLs in parallel (if supported)
+ $http->fetchAll($urls_sanitized);
+
+ // For each item: fetch its permalink, apply any Content-Type rule configured in
+ // $this->content_type_exc ('exclude' skips the item, 'link' replaces the body with
+ // a link), otherwise run the content extractor. The resulting title and body are
+ // returned as array('title' => ..., 'body' => ...).
+
+ // Defaults for the values reported after the loop, so that an item skipped by an
+ // 'exclude' rule does not leave them undefined.
+ $html = null;
+ $title = null;
+
+ foreach ($items as $key => $item) {
+     $do_content_extraction = true;
+     $extract_result = false;
+     $permalink = $urls[$key];
+
+     // TODO: Allow error codes - some sites return correct content with error status
+     // e.g. prospectmagazine.co.uk returns 403
+
+     // Only responses whose status code lies outside 300..400 (inclusive) are processed.
+     if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
+         $effective_url = $response['effective_url'];
+         // check if action defined for returned Content-Type
+         $type = null;
+         if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $response['headers'], $match)) {
+             // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
+             $match[1] = strtolower(trim($match[1]));
+             $match[2] = strtolower(trim($match[2]));
+             foreach (array($match[1], $match[2]) as $_mime) {
+                 if (isset($this->content_type_exc[$_mime])) {
+                     $type = $match[1];
+                     $_act = $this->content_type_exc[$_mime]['action'];
+                     $_name = $this->content_type_exc[$_mime]['name'];
+                     if ($_act == 'exclude') {
+                         continue 2; // skip this feed item entry
+                     } elseif ($_act == 'link') {
+                         // The URL comes from the remote response, so escape it (and the
+                         // label) before embedding them in markup.
+                         $_safe_url = htmlspecialchars($effective_url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+                         $_safe_name = htmlspecialchars($_name, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
+                         if ($match[2] == 'image') {
+                             $html = "<a href=\"$_safe_url\"><img src=\"$_safe_url\" alt=\"$_safe_name\" /></a>";
+                         } else {
+                             $html = "<a href=\"$_safe_url\">Download $_safe_name</a>";
+                         }
+                         // remembered until the result is assembled after the loop
+                         $title = $_name;
+                         $do_content_extraction = false;
+                         break;
+                     }
+                 }
+             }
+             unset($_mime, $_act, $_name, $_safe_url, $_safe_name, $match);
+         }
+         if ($do_content_extraction) {
+             $html = $response['body'];
+             // remove strange things
+             $html = str_replace('</[>', '', $html);
+             $html = $this->convert_to_utf8($html, $response['headers']);
+
+             // check site config for single page URL - fetch it if found
+             if ($single_page_response = $this->getSinglePage($item, $html, $effective_url)) {
+                 $html = $single_page_response['body'];
+                 // remove strange things
+                 $html = str_replace('</[>', '', $html);
+                 $html = $this->convert_to_utf8($html, $single_page_response['headers']);
+                 $effective_url = $single_page_response['effective_url'];
+                 unset($single_page_response);
+             }
+             $extract_result = $extractor->process($html, $effective_url);
+             $readability = $extractor->readability;
+             $content_block = ($extract_result) ? $extractor->getContent() : null;
+         }
+     }
+     if ($do_content_extraction) {
+         // the body now comes from extraction (or the error message), so drop any
+         // title left over from a 'link' rule
+         $title = null;
+         // if we failed to extract content...
+         if (!$extract_result) {
+             $html = $this->error_message;
+             // keep the original item description
+             $html .= $item->get_description();
+         } else {
+             $readability->clean($content_block, 'select');
+             if ($this->rewrite_relative_urls) {
+                 $this->makeAbsolute($effective_url, $content_block);
+             }
+             // unwrap a lone element child so its wrapper tag is not part of the body
+             if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
+                 $html = $content_block->firstChild->innerHTML;
+             } else {
+                 $html = $content_block->innerHTML;
+             }
+             // post-processing cleanup: drop paragraphs containing only whitespace
+             $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
+         }
+     }
+ }
+
+ // A title chosen by a 'link' rule wins; otherwise use the extractor's title,
+ // falling back to a translated 'Untitled'.
+ if ($title === null) {
+     $extracted_title = $extractor->getTitle();
+     $title = ($extracted_title != '' ? $extracted_title : _('Untitled'));
+ }
+ $content = array('title' => $title, 'body' => $html);
+
+ return $content;