inc/poche/Url.class.php

   1 <?php
   2 /**
   3  * poche, a read it later open source system
   4  *
   5  * @category   poche
   6  * @author     Nicolas Lœuillet <support@inthepoche.com>
   7  * @copyright  2013
   8  * @license    http://www.wtfpl.net/ see COPYING file
   9  */
  10
  11 class Url
  12 {
  13     public $url;
  14
  15     private $fingerprints = array(
  16         // Posterous
  17         '<meta name="generator" content="Posterous"' => array('hostname'=>'fingerprint.posterous.com', 'head'=>true),
  18         // Blogger
  19         '<meta content=\'blogger\' name=\'generator\'' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
  20         '<meta name="generator" content="Blogger"' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
  21         // WordPress (self-hosted and hosted)
  22         '<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
  23     );
  24
  25     private $user_agents = array( 'lifehacker.com' => 'PHP/5.2',
  26                                    'gawker.com' => 'PHP/5.2',
  27                                    'deadspin.com' => 'PHP/5.2',
  28                                    'kotaku.com' => 'PHP/5.2',
  29                                    'jezebel.com' => 'PHP/5.2',
  30                                    'io9.com' => 'PHP/5.2',
  31                                    'jalopnik.com' => 'PHP/5.2',
  32                                    'gizmodo.com' => 'PHP/5.2',
  33                                    '.wikipedia.org' => 'Mozilla/5.2'
  34                                   );
  35
  36     private $content_type_exc = array(
  37                                    'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
  38                                    'image' => array('action'=>'link', 'name'=>'Image'),
  39                                    'audio' => array('action'=>'link', 'name'=>'Audio'),
  40                                    'video' => array('action'=>'link', 'name'=>'Video')
  41                                   );
  42
  43     private $rewrite_url = array(
  44         // Rewrite public Google Docs URLs to point to HTML view:
  45         // if a URL contains docs.google.com, replace /Doc? with /View?
  46         'docs.google.com' => array('/Doc?' => '/View?'),
  47         'tnr.com' => array('tnr.com/article/' => 'tnr.com/print/article/'),
  48         '.m.wikipedia.org' => array('.m.wikipedia.org' => '.wikipedia.org')
  49     );
  50
  51     private $rewrite_relative_urls = true;
  52     private $error_message = '[unable to retrieve full-text content]';
  53
  54     function __construct($url)
  55     {
  56         $this->url = base64_decode($url);
  57     }
  58
  59     public function getUrl() {
  60         return $this->url;
  61     }
  62
  63     public function setUrl($url) {
  64         $this->url = $url;
  65     }
  66
  67     public function isCorrect() {
  68         return filter_var($this->url, FILTER_VALIDATE_URL) !== FALSE;
  69     }
  70
  71     public function extract() {
  72         global $http, $extractor;
  73         $extractor = new ContentExtractor(dirname(__FILE__).'/../3rdparty/site_config/custom', dirname(__FILE__).'/../3rdparty/site_config/standard');
  74         $extractor->fingerprints = $this->fingerprints;
  75
  76         $http = new HumbleHttpAgent();
  77         $http->userAgentMap = $this->user_agents;
  78         $http->headerOnlyTypes = array_keys($this->content_type_exc);
  79         $http->rewriteUrls = $this->rewrite_url;
  80         $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
  81         // configure SimplePie HTTP extension class to use our HumbleHttpAgent instance
  82         SimplePie_HumbleHttpAgent::set_agent($http);
  83         $feed = new SimplePie();
  84         // some feeds use the text/html content type - force_feed tells SimplePie to process anyway
  85         $feed->force_feed(true);
  86         $feed->set_file_class('SimplePie_HumbleHttpAgent');
  87         $feed->feed_url = $this->url;
  88         $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
  89         $feed->set_timeout(20);
  90         $feed->enable_cache(false);
  91         $feed->set_stupidly_fast(true);
  92         $feed->enable_order_by_date(false); // we don't want to do anything to the feed
  93         $feed->set_url_replacements(array());
  94         // initialise the feed
  95         // the @ suppresses notices which on some servers causes a 500 internal server error
  96         $result = @$feed->init();
  97         if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
  98             die('Sorry, no feed items found');
  99         }
 100         // from now on, we'll identify ourselves as a browser
 101         $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
 102         unset($feed, $result);
 103
 104         $feed = new DummySingleItemFeed($this->url);
 105
 106         $items = $feed->get_items(0, 1);
 107         // Request all feed items in parallel (if supported)
 108         $urls_sanitized = array();
 109         $urls = array();
 110         foreach ($items as $key => $item) {
 111             $permalink = htmlspecialchars_decode($item->get_permalink());
 112             // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
 113             $permalink = str_replace('%3A', ':', $permalink);
 114             if ($permalink) {
 115                 $urls_sanitized[] = $permalink;
 116             }
 117             $urls[$key] = $permalink;
 118         }
 119         $http->fetchAll($urls_sanitized);
 120
 121         foreach ($items as $key => $item) {
 122             $do_content_extraction = true;
 123             $extract_result = false;
 124             $permalink = $urls[$key];
 125
 126             // TODO: Allow error codes - some sites return correct content with error status
 127             // e.g. prospectmagazine.co.uk returns 403
 128
 129             if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
 130                 $effective_url = $response['effective_url'];
 131                 // check if action defined for returned Content-Type
 132                 $type = null;
 133                 if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $response['headers'], $match)) {
 134                     // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
 135                     $match[1] = strtolower(trim($match[1]));
 136                     $match[2] = strtolower(trim($match[2]));
 137                     foreach (array($match[1], $match[2]) as $_mime) {
 138                         if (isset($this->content_type_exc[$_mime])) {
 139                             $type = $match[1];
 140                             $_act = $this->content_type_exc[$_mime]['action'];
 141                             $_name = $this->content_type_exc[$_mime]['name'];
 142                             if ($_act == 'exclude') {
 143                                 continue 2; // skip this feed item entry
 144                             } elseif ($_act == 'link') {
 145                                 if ($match[2] == 'image') {
 146                                     $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"$_name\" /></a>";
 147                                 } else {
 148                                     $html = "<a href=\"$effective_url\">Download $_name</a>";
 149                                 }
 150                                 $title = $_name;
 151                                 $do_content_extraction = false;
 152                                 break;
 153                             }
 154                         }
 155                     }
 156                     unset($_mime, $_act, $_name, $match);
 157                 }
 158                 if ($do_content_extraction) {
 159                     $html = $response['body'];
 160                     // remove strange things
 161                     $html = str_replace('</[>', '', $html);
 162                     $html = $this->convert_to_utf8($html, $response['headers']);
 163
 164                     // check site config for single page URL - fetch it if found
 165                     if ($single_page_response = $this->getSinglePage($item, $html, $effective_url)) {
 166                         $html = $single_page_response['body'];
 167                         // remove strange things
 168                         $html = str_replace('</[>', '', $html);
 169                         $html = $this->convert_to_utf8($html, $single_page_response['headers']);
 170                         $effective_url = $single_page_response['effective_url'];
 171                         unset($single_page_response);
 172                     }
 173                     $extract_result = $extractor->process($html, $effective_url);
 174                     $readability = $extractor->readability;
 175                     $content_block = ($extract_result) ? $extractor->getContent() : null;
 176                 }
 177             }
 178             if ($do_content_extraction) {
 179                 // if we failed to extract content...
 180                 if (!$extract_result) {
 181                     $html = $this->error_message;
 182                     // keep the original item description
 183                     $html .= $item->get_description();
 184                 } else {
 185                     $readability->clean($content_block, 'select');
 186                     if ($this->rewrite_relative_urls) $this->makeAbsolute($effective_url, $content_block);
 187                     if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
 188                         $html = $content_block->firstChild->innerHTML;
 189                     } else {
 190                         $html = $content_block->innerHTML;
 191                     }
 192                     // post-processing cleanup
 193                     $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
 194                 }
 195             }
 196         }
 197
 198         $title = ($extractor->getTitle() != '' ? $extractor->getTitle() : _('Untitled'));
 199         $content = array ('title' => $title, 'body' => $html);
 200
 201         return $content;
 202     }
 203
 204     private function convert_to_utf8($html, $header=null)
 205     {
 206         $encoding = null;
 207         if ($html || $header) {
 208             if (is_array($header)) $header = implode("\n", $header);
 209             if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
 210                 // error parsing the response
 211             } else {
 212                 $match = end($match); // get last matched element (in case of redirects)
 213                 if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
 214             }
 215             // TODO: check to see if encoding is supported (can we convert it?)
 216             // If it's not, result will be empty string.
 217             // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
 218             // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
 219             if (!$encoding || $encoding == 'none') {
 220                 // search for encoding in HTML - only look at the first 35000 characters
 221                 $html_head = substr($html, 0, 40000);
 222                 if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
 223                     $encoding = trim($match[1], '"\'');
 224                 } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
 225                     $encoding = trim($match[1]);
 226                 } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
 227                     foreach ($match[1] as $_test) {
 228                         if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
 229                             $encoding = trim($_m[1]);
 230                             break;
 231                         }
 232                     }
 233                 }
 234             }
 235             if (isset($encoding)) $encoding = trim($encoding);
 236             // trim is important here!
 237             if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) {
 238                 // replace MS Word smart qutoes
 239                 $trans = array();
 240                 $trans[chr(130)] = '&sbquo;';    // Single Low-9 Quotation Mark
 241                 $trans[chr(131)] = '&fnof;';    // Latin Small Letter F With Hook
 242                 $trans[chr(132)] = '&bdquo;';    // Double Low-9 Quotation Mark
 243                 $trans[chr(133)] = '&hellip;';    // Horizontal Ellipsis
 244                 $trans[chr(134)] = '&dagger;';    // Dagger
 245                 $trans[chr(135)] = '&Dagger;';    // Double Dagger
 246                 $trans[chr(136)] = '&circ;';    // Modifier Letter Circumflex Accent
 247                 $trans[chr(137)] = '&permil;';    // Per Mille Sign
 248                 $trans[chr(138)] = '&Scaron;';    // Latin Capital Letter S With Caron
 249                 $trans[chr(139)] = '&lsaquo;';    // Single Left-Pointing Angle Quotation Mark
 250                 $trans[chr(140)] = '&OElig;';    // Latin Capital Ligature OE
 251                 $trans[chr(145)] = '&lsquo;';    // Left Single Quotation Mark
 252                 $trans[chr(146)] = '&rsquo;';    // Right Single Quotation Mark
 253                 $trans[chr(147)] = '&ldquo;';    // Left Double Quotation Mark
 254                 $trans[chr(148)] = '&rdquo;';    // Right Double Quotation Mark
 255                 $trans[chr(149)] = '&bull;';    // Bullet
 256                 $trans[chr(150)] = '&ndash;';    // En Dash
 257                 $trans[chr(151)] = '&mdash;';    // Em Dash
 258                 $trans[chr(152)] = '&tilde;';    // Small Tilde
 259                 $trans[chr(153)] = '&trade;';    // Trade Mark Sign
 260                 $trans[chr(154)] = '&scaron;';    // Latin Small Letter S With Caron
 261                 $trans[chr(155)] = '&rsaquo;';    // Single Right-Pointing Angle Quotation Mark
 262                 $trans[chr(156)] = '&oelig;';    // Latin Small Ligature OE
 263                 $trans[chr(159)] = '&Yuml;';    // Latin Capital Letter Y With Diaeresis
 264                 $html = strtr($html, $trans);
 265             }
 266             if (!$encoding) {
 267                 $encoding = 'utf-8';
 268             } else {
 269                 if (strtolower($encoding) != 'utf-8') {
 270                     $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
 271                     /*
 272                     if (function_exists('iconv')) {
 273                         // iconv appears to handle certain character encodings better than mb_convert_encoding
 274                         $html = iconv($encoding, 'utf-8', $html);
 275                     } else {
 276                         $html = mb_convert_encoding($html, 'utf-8', $encoding);
 277                     }
 278                     */
 279                 }
 280             }
 281         }
 282         return $html;
 283     }
 284
 285     private function makeAbsolute($base, $elem) {
 286         $base = new SimplePie_IRI($base);
 287         // remove '//' in URL path (used to prevent URLs from resolving properly)
 288         // TODO: check if this is still the case
 289         if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
 290         foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
 291             $elems = $elem->getElementsByTagName($tag);
 292             for ($i = $elems->length-1; $i >= 0; $i--) {
 293                 $e = $elems->item($i);
 294                 //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
 295                 $this->makeAbsoluteAttr($base, $e, $attr);
 296             }
 297             if (strtolower($elem->tagName) == $tag) $this->makeAbsoluteAttr($base, $elem, $attr);
 298         }
 299     }
 300
 301     private function makeAbsoluteAttr($base, $e, $attr) {
 302         if ($e->hasAttribute($attr)) {
 303             // Trim leading and trailing white space. I don't really like this but
 304             // unfortunately it does appear on some sites. e.g.  <img src=" /path/to/image.jpg" />
 305             $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
 306             $url = str_replace(' ', '%20', $url);
 307             if (!preg_match('!https?://!i', $url)) {
 308                 if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
 309                     $e->setAttribute($attr, $absolute);
 310                 }
 311             }
 312         }
 313     }
 314
 315     private function makeAbsoluteStr($base, $url) {
 316         $base = new SimplePie_IRI($base);
 317         // remove '//' in URL path (causes URLs not to resolve properly)
 318         if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
 319         if (preg_match('!^https?://!i', $url)) {
 320             // already absolute
 321             return $url;
 322         } else {
 323             if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
 324                 return $absolute;
 325             }
 326             return false;
 327         }
 328     }
 329
 330     // returns single page response, or false if not found
 331     private function getSinglePage($item, $html, $url) {
 332         global $http, $extractor;
 333         $host = @parse_url($url, PHP_URL_HOST);
 334         $site_config = SiteConfig::build($host);
 335         if ($site_config === false) {
 336             // check for fingerprints
 337             if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
 338                 $site_config = SiteConfig::build($_fphost);
 339             }
 340             if ($site_config === false) $site_config = new SiteConfig();
 341             SiteConfig::add_to_cache($host, $site_config);
 342             return false;
 343         } else {
 344             SiteConfig::add_to_cache($host, $site_config);
 345         }
 346         $splink = null;
 347         if (!empty($site_config->single_page_link)) {
 348             $splink = $site_config->single_page_link;
 349         } elseif (!empty($site_config->single_page_link_in_feed)) {
 350             // single page link xpath is targeted at feed
 351             $splink = $site_config->single_page_link_in_feed;
 352             // so let's replace HTML with feed item description
 353             $html = $item->get_description();
 354         }
 355         if (isset($splink)) {
 356             // Build DOM tree from HTML
 357             $readability = new Readability($html, $url);
 358             $xpath = new DOMXPath($readability->dom);
 359             // Loop through single_page_link xpath expressions
 360             $single_page_url = null;
 361             foreach ($splink as $pattern) {
 362                 $elems = @$xpath->evaluate($pattern, $readability->dom);
 363                 if (is_string($elems)) {
 364                     $single_page_url = trim($elems);
 365                     break;
 366                 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
 367                     foreach ($elems as $item) {
 368                         if ($item instanceof DOMElement && $item->hasAttribute('href')) {
 369                             $single_page_url = $item->getAttribute('href');
 370                             break;
 371                         } elseif ($item instanceof DOMAttr && $item->value) {
 372                             $single_page_url = $item->value;
 373                             break;
 374                         }
 375                     }
 376                 }
 377             }
 378             // If we've got URL, resolve against $url
 379             if (isset($single_page_url) && ($single_page_url = $this->makeAbsoluteStr($url, $single_page_url))) {
 380                 // check it's not what we have already!
 381                 if ($single_page_url != $url) {
 382                     // it's not, so let's try to fetch it...
 383                     $_prev_ref = $http->referer;
 384                     $http->referer = $single_page_url;
 385                     if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
 386                         $http->referer = $_prev_ref;
 387                         return $response;
 388                     }
 389                     $http->referer = $_prev_ref;
 390                 }
 391             }
 392         }
 393         return false;
 394     }
 395 }