]> git.immae.eu Git - github/wallabag/wallabag.git/commitdiff
update to 3.2 version of full-text-rss, issue #694
author Maryana Rozhankivska <mariroz@mr.lviv.ua>
Thu, 22 May 2014 14:16:38 +0000 (17:16 +0300)
committer Maryana Rozhankivska <mariroz@mr.lviv.ua>
Thu, 22 May 2014 14:16:38 +0000 (17:16 +0300)
15 files changed:
inc/3rdparty/config.php
inc/3rdparty/libraries/content-extractor/ContentExtractor.php
inc/3rdparty/libraries/content-extractor/SiteConfig.php
inc/3rdparty/libraries/feedwriter/FeedItem.php [changed mode: 0644->0755]
inc/3rdparty/libraries/feedwriter/FeedWriter.php
inc/3rdparty/libraries/html5/TreeBuilder.php
inc/3rdparty/libraries/humble-http-agent/CookieJar.php
inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
inc/3rdparty/libraries/language-detect/LanguageDetect.php
inc/3rdparty/libraries/readability/Readability.php
inc/3rdparty/makefulltextfeed.php
inc/3rdparty/makefulltextfeedHelpers.php
inc/3rdparty/site_config/index.php
inc/3rdparty/site_config/standard/version.txt

index e618117b7190da886aa40731fba02896f4521664..ec680d8659cbc3f3b84dad4d678bf58ff7bae760 100755 (executable)
@@ -19,7 +19,7 @@ if (!isset($options)) $options = new stdClass();
 // Enable service\r
 // ----------------------\r
 // Set this to false if you want to disable the service.\r
-// If set to false, no feed is produced and users will \r
+// If set to false, no feed is produced and users will\r
 // be told that the service is disabled.\r
 $options->enabled = true;\r
 \r
@@ -43,10 +43,64 @@ $options->default_entries = 5;
 // ----------------------\r
 // The maximum number of feed items to process when no access key is supplied.\r
 // This limits the user-supplied &max=x value. For example, if the user\r
-// asks for 20 items to be processed (&max=20) and max_entries is set to \r
+// asks for 20 items to be processed (&max=20) and max_entries is set to\r
 // 10, only 10 will be processed.\r
 $options->max_entries = 10;\r
 \r
+// Full content\r
+// ----------------------\r
+// By default Full-Text RSS includes the extracted content in the output.\r
+// You can exclude this from the output by passing '&content=0' in the querystring.\r
+//\r
+// Possible values...\r
+// Always include: true\r
+// Never include: false\r
+// Include unless user overrides (&content=0): 'user' (default)\r
+//\r
+// Note: currently this does not disable full content extraction. It simply omits it\r
+// from the output.\r
+$options->content = 'user';\r
+\r
+// Excerpts\r
+// ----------------------\r
+// By default Full-Text RSS does not include excerpts in the output.\r
+// You can enable this by passing '&summary=1' in the querystring.\r
+// This will include a plain text excerpt from the extracted content.\r
+//\r
+// Possible values...\r
+// Always include: true (recommended for new users)\r
+// Never include: false\r
+// Don't include unless user overrides (&summary=1): 'user' (default)\r
+//\r
+// Important: if both content and excerpts are requested, the excerpt will be\r
+// placed in the description element and the full content inside content:encoded.\r
+// If excerpts are not requested, the full content will go inside the description element.\r
+//\r
+// Why are we not returning both excerpts and content by default?\r
+// Mainly for backward compatibility.\r
+// Excerpts should appear in the feed item's description element. Previous versions\r
+// of Full-Text RSS did not return excerpts, so the description element was always\r
+// used for the full content (as recommended by the RSS advisory). When returning both,\r
+// we need somewhere else to place the content (content:encoded).\r
+// Having both enabled should not create any problems for news readers, but it may create\r
+// problems for developers upgrading from one of our earlier versions who may now find\r
+// their applications are returning excerpts instead of the full content they were\r
+// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,\r
+// excerpts must be explicitly requested in the querystring by default.\r
+//\r
+// Why not use a different element name for excerpts?\r
+// According to the RSS advisory:\r
+// "Publishers who employ summaries should store the summary in description and\r
+// the full content in content:encoded, ordering description first within the item.\r
+// On items with no summary, the full content should be stored in description."\r
+// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded\r
+//\r
+// For more consistent element naming, we recommend new users set this option to true.\r
+// The full content can still be excluded via the querystring, but the element names\r
+// will not change: when $options->summary = true, the description element will always\r
+// be reserved for the excerpt and content:encoded always for full content.\r
+$options->summary = 'user';\r
+\r
 // Rewrite relative URLs\r
 // ----------------------\r
 // With this enabled relative URLs found in the extracted content\r
@@ -67,7 +121,7 @@ $options->exclude_items_on_fail = 'user';
 // Enable multi-page support\r
 // -------------------------\r
 // If enabled, we will try to follow next page links on multi-page articles.\r
-// Currently this only happens for sites where next_page_link has been defined \r
+// Currently this only happens for sites where next_page_link has been defined\r
 // in a site config file.\r
 $options->multipage = true;\r
 \r
@@ -125,10 +179,10 @@ $options->detect_language = 1;
 \r
 // Registration key\r
 // ---------------\r
-// The registration key is optional. It is not required to use Full-Text RSS, \r
-// and does not affect the normal operation of Full-Text RSS. It is currently \r
-// only used on admin pages which help you update site patterns with the \r
-// latest version offered by FiveFilters.org. For these admin-related \r
+// The registration key is optional. It is not required to use Full-Text RSS,\r
+// and does not affect the normal operation of Full-Text RSS. It is currently\r
+// only used on admin pages which help you update site patterns with the\r
+// latest version offered by FiveFilters.org. For these admin-related\r
 // tasks to complete, we will require a valid registration key.\r
 // If you would like one, you can purchase the latest version of Full-Text RSS\r
 // at http://fivefilters.org/content-only/\r
@@ -144,12 +198,12 @@ $options->registration_key = '';
 // ----------------------\r
 // Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.\r
 // To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.\r
-// If no password or username is set, pages requiring admin privileges will be inaccessible. \r
+// If no password or username is set, pages requiring admin privileges will be inaccessible.\r
 // The default username is 'admin'.\r
 // If overriding with an environment variable, separate username and password with a colon, e.g.:\r
 // ftr_admin_credentials: admin:my-secret-password\r
 // Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');\r
-$options->admin_credentials = array('username'=>'admin', 'password'=>'admin');\r
+$options->admin_credentials = array('username'=>'admin', 'password'=>'');\r
 \r
 // URLs to allow\r
 // ----------------------\r
@@ -178,12 +232,12 @@ $options->key_required = false;
 // ----------------------\r
 // By default, when processing feeds, we assume item titles in the feed\r
 // have not been truncated. So after processing web pages, the extracted titles\r
-// are not used in the generated feed. If you prefer to have extracted titles in \r
-// the feed you can either set this to false, in which case we will always favour \r
-// extracted titles. Alternatively, if set to 'user' (default) we'll use the \r
+// are not used in the generated feed. If you prefer to have extracted titles in\r
+// the feed you can either set this to false, in which case we will always favour\r
+// extracted titles. Alternatively, if set to 'user' (default) we'll use the\r
 // extracted title if you pass '&use_extracted_title' in the querystring.\r
 // Possible values:\r
-// * Favour feed titles: true \r
+// * Favour feed titles: true\r
 // * Favour extracted titles: false\r
 // * Favour feed titles with user override: 'user' (default)\r
 // Note: this has no effect when the input URL is to a web page - in these cases\r
@@ -192,17 +246,17 @@ $options->favour_feed_titles = 'user';
 \r
 // Access keys (password protected access)\r
 // ------------------------------------\r
-// NOTE: You do not need an API key from fivefilters.org to run your own \r
+// NOTE: You do not need an API key from fivefilters.org to run your own\r
 // copy of the code. This is here if you'd like to restrict access to\r
 // _your_ copy.\r
 // Keys let you group users - those with a key and those without - and\r
 // restrict access to the service for those without a key.\r
 // If you want everyone to access the service in the same way, you can\r
 // leave the array below empty and ignore the access key options further down.\r
-// The options further down let you control how the service should behave \r
+// The options further down let you control how the service should behave\r
 // in each mode.\r
-// Note: Explicitly including the index number (1 and 2 in the examples below) \r
-// is highly recommended (when generating feeds, we encode the key and \r
+// Note: Explicitly including the index number (1 and 2 in the examples below)\r
+// is highly recommended (when generating feeds, we encode the key and\r
 // refer to it by index number and hash).\r
 $options->api_keys = array();\r
 // Example:\r
@@ -232,13 +286,13 @@ $options->max_entries_with_key = 10;
 // filter the resulting HTML for XSS attacks, making it redundant for\r
 // Full-Text RSS to do the same. Similarly with frameworks/CMS which display\r
 // feed content - the content should be treated like any other user-submitted content.\r
-// \r
+//\r
 // If you are writing an application yourself which is processing feeds generated by\r
 // Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks\r
 // or enable this option. This might be useful if you are processing our generated\r
 // feeds with JavaScript on the client side - although there's client side xss\r
 // filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer\r
-// \r
+//\r
 // If enabled, we'll pass retrieved HTML content through htmLawed with\r
 // safe flag on and style attributes denied, see\r
 // http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6\r
@@ -253,8 +307,8 @@ $options->xss_filter = 'user';
 // Allowed parsers\r
 // ----------------------\r
 // Full-Text RSS attempts to use PHP's libxml extension to process HTML.\r
-// While fast, on some sites it may not always produce good results. \r
-// For these sites, you can specify an alternative HTML parser: \r
+// While fast, on some sites it may not always produce good results.\r
+// For these sites, you can specify an alternative HTML parser:\r
 // parser: html5lib\r
 // The html5lib parser is bundled with Full-Text RSS.\r
 // see http://code.google.com/p/html5lib/\r
@@ -273,7 +327,7 @@ $options->cors = false;
 \r
 // Use APC user cache?\r
 // ----------------------\r
-// If enabled we will store site config files (when requested \r
+// If enabled we will store site config files (when requested\r
 // for the first time) in APC's user cache. Keys are prefixed with 'sc.'\r
 // This improves performance by reducing disk access.\r
 // Note: this has no effect if APC is unavailable on your server.\r
@@ -346,7 +400,7 @@ $options->rewrite_url = array(
 // Valid actions:\r
 // * 'exclude' - exclude this item from the result\r
 // * 'link' - create HTML link to the item\r
-$options->content_type_exc = array( \r
+$options->content_type_exc = array(\r
                                                           'application/pdf' => array('action'=>'link', 'name'=>'PDF'),\r
                                                           'image' => array('action'=>'link', 'name'=>'Image'),\r
                                                           'audio' => array('action'=>'link', 'name'=>'Audio'),\r
@@ -375,13 +429,13 @@ $options->cache_cleanup = 100;
 /// DO NOT CHANGE ANYTHING BELOW THIS ///////////\r
 /////////////////////////////////////////////////\r
 \r
-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1');\r
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');\r
 \r
 if (basename(__FILE__) == 'config.php') {\r
        if (file_exists(dirname(__FILE__).'/custom_config.php')) {\r
                require_once dirname(__FILE__).'/custom_config.php';\r
        }\r
-       \r
+\r
        // check for environment variables - often used on cloud platforms\r
        // environment variables should be prefixed with 'ftr_', e.g.\r
        // ftr_max_entries: 1\r
index ddd33bb5a8859908509d4281520aa56e9f978b2c..21e693e7db6ed0c7bbb30f6a0ba885c79ebe1f9d 100644 (file)
-<?php\r
-/**\r
- * Content Extractor\r
- * \r
- * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) \r
- * to extract content from HTML files.\r
- * \r
- * @version 1.0\r
- * @date 2013-02-05\r
- * @author Keyvan Minoukadeh\r
- * @copyright 2013 Keyvan Minoukadeh\r
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
- */\r
-\r
-class ContentExtractor\r
-{\r
-       protected static $tidy_config = array(\r
-                                'clean' => true,\r
-                                'output-xhtml' => true,\r
-                                'logical-emphasis' => true,\r
-                                'show-body-only' => false,\r
-                                'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',\r
-                                'new-inline-tags' => 'mark, time, meter, progress, data',\r
-                                'wrap' => 0,\r
-                                'drop-empty-paras' => true,\r
-                                'drop-proprietary-attributes' => false,\r
-                                'enclose-text' => true,\r
-                                'enclose-block-text' => true,\r
-                                'merge-divs' => true,\r
-                                'merge-spans' => true,\r
-                                'char-encoding' => 'utf8',\r
-                                'hide-comments' => true\r
-                                );\r
-       protected $html;\r
-       protected $config;\r
-       protected $title;\r
-       protected $author = array();\r
-       protected $language;\r
-       protected $date;\r
-       protected $body;\r
-       protected $success = false;\r
-       protected $nextPageUrl;\r
-       public $allowedParsers = array('libxml', 'html5lib');\r
-       public $fingerprints = array();\r
-       public $readability;\r
-       public $debug = false;\r
-       public $debugVerbose = false;\r
-\r
-       function __construct($path, $fallback=null) {\r
-               SiteConfig::set_config_path($path, $fallback);  \r
-       }\r
-       \r
-       protected function debug($msg) {\r
-               if ($this->debug) {\r
-                       $mem = round(memory_get_usage()/1024, 2);\r
-                       $memPeak = round(memory_get_peak_usage()/1024, 2);\r
-                       echo '* ',$msg;\r
-                       if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";\r
-                       echo "\n";\r
-                       ob_flush();\r
-                       flush();\r
-               }\r
-       }\r
-       \r
-       public function reset() {\r
-               $this->html = null;\r
-               $this->readability = null;\r
-               $this->config = null;\r
-               $this->title = null;\r
-               $this->body = null;\r
-               $this->author = array();\r
-               $this->language = null;\r
-               $this->date = null;\r
-               $this->nextPageUrl = null;\r
-               $this->success = false;\r
-       }\r
-\r
-       public function findHostUsingFingerprints($html) {\r
-               $this->debug('Checking fingerprints...');\r
-               $head = substr($html, 0, 8000);\r
-               foreach ($this->fingerprints as $_fp => $_fphost) {\r
-                       $lookin = 'html';\r
-                       if (is_array($_fphost)) {\r
-                               if (isset($_fphost['head']) && $_fphost['head']) {\r
-                                       $lookin = 'head';\r
-                               }\r
-                               $_fphost = $_fphost['hostname'];\r
-                       }\r
-                       if (strpos($$lookin, $_fp) !== false) {\r
-                               $this->debug("Found match: $_fphost");\r
-                               return $_fphost;\r
-                       }\r
-               }\r
-               $this->debug('No fingerprint matches');\r
-               return false;\r
-       }\r
-       \r
-       // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)\r
-       public function buildSiteConfig($url, $html='', $add_to_cache=true) {\r
-               // extract host name\r
-               $host = @parse_url($url, PHP_URL_HOST);\r
-               $host = strtolower($host);\r
-               if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);\r
-               // is merged version already cached?\r
-               if (SiteConfig::is_cached("$host.merged")) {\r
-                       $this->debug("Returning cached and merged site config for $host");\r
-                       return SiteConfig::build("$host.merged");\r
-               }\r
-               // let's build from site_config/custom/ and standard/\r
-               $config = SiteConfig::build($host);\r
-               if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {\r
-                       SiteConfig::add_to_cache($host, $config);\r
-               }\r
-               // if no match, use defaults\r
-               if (!$config) $config = new SiteConfig();\r
-               // load fingerprint config?\r
-               if ($config->autodetect_on_failure()) {\r
-                       // check HTML for fingerprints\r
-                       if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {\r
-                               if ($config_fingerprint = SiteConfig::build($_fphost)) {\r
-                                       $this->debug("Appending site config settings from $_fphost (fingerprint match)");\r
-                                       $config->append($config_fingerprint);\r
-                                       if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {\r
-                                               //$config_fingerprint->cache_in_apc = true;\r
-                                               SiteConfig::add_to_cache($_fphost, $config_fingerprint);\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-               // load global config?\r
-               if ($config->autodetect_on_failure()) {\r
-                       if ($config_global = SiteConfig::build('global', true)) {\r
-                               $this->debug('Appending site config settings from global.txt');\r
-                               $config->append($config_global);\r
-                               if ($add_to_cache && !SiteConfig::is_cached('global')) {\r
-                                       //$config_global->cache_in_apc = true;\r
-                                       SiteConfig::add_to_cache('global', $config_global);\r
-                               }\r
-                       }\r
-               }\r
-               // store copy of merged config\r
-               if ($add_to_cache) {\r
-                       // do not store in APC if wildcard match\r
-                       $use_apc = ($host == $config->cache_key);\r
-                       $config->cache_key = null;\r
-                       SiteConfig::add_to_cache("$host.merged", $config, $use_apc);\r
-               }\r
-               return $config;\r
-       }\r
-       \r
-       // returns true on success, false on failure\r
-       // $smart_tidy indicates that if tidy is used and no results are produced, we will\r
-       // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time\r
-       // but it has problems of its own which we try to avoid with this option.\r
-       public function process($html, $url, $smart_tidy=true) {\r
-               $this->reset();\r
-               $this->config = $this->buildSiteConfig($url, $html);\r
-               \r
-               // do string replacements\r
-               if (!empty($this->config->find_string)) {\r
-                       if (count($this->config->find_string) == count($this->config->replace_string)) {\r
-                               $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);\r
-                               $this->debug("Strings replaced: $_count (find_string and/or replace_string)");\r
-                       } else {\r
-                               $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');\r
-                       }\r
-                       unset($_count);\r
-               }\r
-               \r
-               // use tidy (if it exists)?\r
-               // This fixes problems with some sites which would otherwise\r
-               // trouble DOMDocument's HTML parsing. (Although sometimes it\r
-               // makes matters worse, which is why you can override it in site config files.)\r
-               $tidied = false;\r
-               if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {\r
-                       $this->debug('Using Tidy');\r
-                       $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');\r
-                       if (tidy_clean_repair($tidy)) {\r
-                               $original_html = $html;\r
-                               $tidied = true;\r
-                               $html = $tidy->value;\r
-                       }\r
-                       unset($tidy);\r
-               }\r
-               \r
-               // load and parse html\r
-               $_parser = $this->config->parser();\r
-               if (!in_array($_parser, $this->allowedParsers)) {\r
-                       $this->debug("HTML parser $_parser not listed, using libxml instead");\r
-                       $_parser = 'libxml';\r
-               }\r
-               $this->debug("Attempting to parse HTML with $_parser");\r
-               $this->readability = new Readability($html, $url, $_parser);\r
-               \r
-               // we use xpath to find elements in the given HTML document\r
-               // see http://en.wikipedia.org/wiki/XPath_1.0\r
-               $xpath = new DOMXPath($this->readability->dom);\r
-\r
-               // try to get next page link\r
-               foreach ($this->config->next_page_link as $pattern) {\r
-                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);\r
-                       if (is_string($elems)) {\r
-                               $this->nextPageUrl = trim($elems);\r
-                               break;\r
-                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {\r
-                               foreach ($elems as $item) {\r
-                                       if ($item instanceof DOMElement && $item->hasAttribute('href')) {\r
-                                               $this->nextPageUrl = $item->getAttribute('href');\r
-                                               break 2;\r
-                                       } elseif ($item instanceof DOMAttr && $item->value) {\r
-                                               $this->nextPageUrl = $item->value;\r
-                                               break 2;\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-               \r
-               // try to get title\r
-               foreach ($this->config->title as $pattern) {\r
-                       // $this->debug("Trying $pattern");\r
-                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);\r
-                       if (is_string($elems)) {\r
-                               $this->title = trim($elems);\r
-                               $this->debug('Title expression evaluated as string: '.$this->title);\r
-                               $this->debug("...XPath match: $pattern");\r
-                               break;\r
-                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {\r
-                               $this->title = $elems->item(0)->textContent;\r
-                               $this->debug('Title matched: '.$this->title);\r
-                               $this->debug("...XPath match: $pattern");\r
-                               // remove title from document\r
-                               try {\r
-                                       $elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                               } catch (DOMException $e) {\r
-                                       // do nothing\r
-                               }\r
-                               break;\r
-                       }\r
-               }\r
-               \r
-               // try to get author (if it hasn't already been set)\r
-               if (empty($this->author)) {\r
-                       foreach ($this->config->author as $pattern) {\r
-                               $elems = @$xpath->evaluate($pattern, $this->readability->dom);\r
-                               if (is_string($elems)) {\r
-                                       if (trim($elems) != '') {\r
-                                               $this->author[] = trim($elems);\r
-                                               $this->debug('Author expression evaluated as string: '.trim($elems));\r
-                                               $this->debug("...XPath match: $pattern");\r
-                                               break;\r
-                                       }\r
-                               } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {\r
-                                       foreach ($elems as $elem) {\r
-                                               if (!isset($elem->parentNode)) continue;\r
-                                               $this->author[] = trim($elem->textContent);\r
-                                               $this->debug('Author matched: '.trim($elem->textContent));\r
-                                       }\r
-                                       if (!empty($this->author)) {\r
-                                               $this->debug("...XPath match: $pattern");\r
-                                               break;\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-               \r
-               // try to get language\r
-               $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');\r
-               foreach ($_lang_xpath as $pattern) {\r
-                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);\r
-                       if (is_string($elems)) {\r
-                               if (trim($elems) != '') {\r
-                                       $this->language = trim($elems);\r
-                                       $this->debug('Language matched: '.$this->language);\r
-                                       break;\r
-                               }\r
-                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {\r
-                               foreach ($elems as $elem) {\r
-                                       if (!isset($elem->parentNode)) continue;\r
-                                       $this->language = trim($elem->textContent);\r
-                                       $this->debug('Language matched: '.$this->language);                                     \r
-                               }\r
-                               if ($this->language) break;\r
-                       }\r
-               }\r
-               \r
-               // try to get date\r
-               foreach ($this->config->date as $pattern) {\r
-                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);\r
-                       if (is_string($elems)) {\r
-                               $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));                                \r
-                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {\r
-                               $this->date = $elems->item(0)->textContent;\r
-                               $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));\r
-                               // remove date from document\r
-                               // $elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                       }\r
-                       if (!$this->date) {\r
-                               $this->date = null;\r
-                       } else {\r
-                               $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));\r
-                               $this->debug("...XPath match: $pattern");\r
-                               break;\r
-                       }\r
-               }\r
-\r
-               // strip elements (using xpath expressions)\r
-               foreach ($this->config->strip as $pattern) {\r
-                       $elems = @$xpath->query($pattern, $this->readability->dom);\r
-                       // check for matches\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('Stripping '.$elems->length.' elements (strip)');\r
-                               for ($i=$elems->length-1; $i >= 0; $i--) {\r
-                                       $elems->item($i)->parentNode->removeChild($elems->item($i));\r
-                               }\r
-                       }\r
-               }\r
-               \r
-               // strip elements (using id and class attribute values)\r
-               foreach ($this->config->strip_id_or_class as $string) {\r
-                       $string = strtr($string, array("'"=>'', '"'=>''));\r
-                       $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);\r
-                       // check for matches\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');\r
-                               for ($i=$elems->length-1; $i >= 0; $i--) {\r
-                                       $elems->item($i)->parentNode->removeChild($elems->item($i));\r
-                               }\r
-                       }\r
-               }\r
-               \r
-               // strip images (using src attribute values)\r
-               foreach ($this->config->strip_image_src as $string) {\r
-                       $string = strtr($string, array("'"=>'', '"'=>''));\r
-                       $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);\r
-                       // check for matches\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('Stripping '.$elems->length.' image elements');\r
-                               for ($i=$elems->length-1; $i >= 0; $i--) {\r
-                                       $elems->item($i)->parentNode->removeChild($elems->item($i));\r
-                               }\r
-                       }\r
-               }\r
-               // strip elements using Readability.com and Instapaper.com ignore class names\r
-               // .entry-unrelated and .instapaper_ignore\r
-               // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines\r
-               // and http://blog.instapaper.com/post/730281947\r
-               $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);\r
-               // check for matches\r
-               if ($elems && $elems->length > 0) {\r
-                       $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');\r
-                       for ($i=$elems->length-1; $i >= 0; $i--) {\r
-                               $elems->item($i)->parentNode->removeChild($elems->item($i));\r
-                       }\r
-               }\r
-               \r
-               // strip elements that contain style="display: none;"\r
-               $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);\r
-               // check for matches\r
-               if ($elems && $elems->length > 0) {\r
-                       $this->debug('Stripping '.$elems->length.' elements with inline display:none style');\r
-                       for ($i=$elems->length-1; $i >= 0; $i--) {\r
-                               $elems->item($i)->parentNode->removeChild($elems->item($i));\r
-                       }\r
-               }\r
-               \r
-               // try to get body\r
-               foreach ($this->config->body as $pattern) {\r
-                       $elems = @$xpath->query($pattern, $this->readability->dom);\r
-                       // check for matches\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('Body matched');\r
-                               $this->debug("...XPath match: $pattern");\r
-                               if ($elems->length == 1) {                              \r
-                                       $this->body = $elems->item(0);\r
-                                       // prune (clean up elements that may not be content)\r
-                                       if ($this->config->prune()) {\r
-                                               $this->debug('...pruning content');\r
-                                               $this->readability->prepArticle($this->body);\r
-                                       }\r
-                                       break;\r
-                               } else {\r
-                                       $this->body = $this->readability->dom->createElement('div');\r
-                                       $this->debug($elems->length.' body elems found');\r
-                                       foreach ($elems as $elem) {\r
-                                               if (!isset($elem->parentNode)) continue;\r
-                                               $isDescendant = false;\r
-                                               foreach ($this->body->childNodes as $parent) {\r
-                                                       if ($this->isDescendant($parent, $elem)) {\r
-                                                               $isDescendant = true;\r
-                                                               break;\r
-                                                       }\r
-                                               }\r
-                                               if ($isDescendant) {\r
-                                                       $this->debug('...element is child of another body element, skipping.');\r
-                                               } else {\r
-                                                       // prune (clean up elements that may not be content)\r
-                                                       if ($this->config->prune()) {\r
-                                                               $this->debug('Pruning content');\r
-                                                               $this->readability->prepArticle($elem);\r
-                                                       }\r
-                                                       $this->debug('...element added to body');\r
-                                                       $this->body->appendChild($elem);\r
-                                               }\r
-                                       }\r
-                                       if ($this->body->hasChildNodes()) break;\r
-                               }\r
-                       }\r
-               }               \r
-               \r
-               // auto detect?\r
-               $detect_title = $detect_body = $detect_author = $detect_date = false;\r
-               // detect title?\r
-               if (!isset($this->title)) {\r
-                       if (empty($this->config->title) || $this->config->autodetect_on_failure()) {\r
-                               $detect_title = true;\r
-                       }\r
-               }\r
-               // detect body?\r
-               if (!isset($this->body)) {\r
-                       if (empty($this->config->body) || $this->config->autodetect_on_failure()) {\r
-                               $detect_body = true;\r
-                       }\r
-               }\r
-               // detect author?\r
-               if (empty($this->author)) {\r
-                       if (empty($this->config->author) || $this->config->autodetect_on_failure()) {\r
-                               $detect_author = true;\r
-                       }\r
-               }\r
-               // detect date?\r
-               if (!isset($this->date)) {\r
-                       if (empty($this->config->date) || $this->config->autodetect_on_failure()) {\r
-                               $detect_date = true;\r
-                       }\r
-               }\r
-\r
-               // check for hNews\r
-               if ($detect_title || $detect_body) {\r
-                       // check for hentry\r
-                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('hNews: found hentry');\r
-                               $hentry = $elems->item(0);\r
-                               \r
-                               if ($detect_title) {\r
-                                       // check for entry-title\r
-                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);\r
-                                       if ($elems && $elems->length > 0) {\r
-                                               $this->title = $elems->item(0)->textContent;\r
-                                               $this->debug('hNews: found entry-title: '.$this->title);\r
-                                               // remove title from document\r
-                                               $elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                                               $detect_title = false;\r
-                                       }\r
-                               }\r
-                               \r
-                               if ($detect_date) {\r
-                                       // check for time element with pubdate attribute\r
-                                       $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);\r
-                                       if ($elems && $elems->length > 0) {\r
-                                               $this->date = strtotime(trim($elems->item(0)->textContent));\r
-                                               // remove date from document\r
-                                               //$elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                                               if ($this->date) {\r
-                                                       $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));\r
-                                                       $detect_date = false;\r
-                                               } else {\r
-                                                       $this->date = null;\r
-                                               }\r
-                                       }\r
-                               }\r
-\r
-                               if ($detect_author) {\r
-                                       // check for time element with pubdate attribute\r
-                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);\r
-                                       if ($elems && $elems->length > 0) {\r
-                                               $author = $elems->item(0);\r
-                                               $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);\r
-                                               if ($fn && $fn->length > 0) {\r
-                                                       foreach ($fn as $_fn) {\r
-                                                               if (trim($_fn->textContent) != '') {\r
-                                                                       $this->author[] = trim($_fn->textContent);\r
-                                                                       $this->debug('hNews: found author: '.trim($_fn->textContent));\r
-                                                               }\r
-                                                       }\r
-                                               } else {\r
-                                                       if (trim($author->textContent) != '') {\r
-                                                               $this->author[] = trim($author->textContent);\r
-                                                               $this->debug('hNews: found author: '.trim($author->textContent));\r
-                                                       }\r
-                                               }\r
-                                               $detect_author = empty($this->author);\r
-                                       }\r
-                               }\r
-                               \r
-                               // check for entry-content.\r
-                               // according to hAtom spec, if there are multiple elements marked entry-content,\r
-                               // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content\r
-                               if ($detect_body) {\r
-                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);\r
-                                       if ($elems && $elems->length > 0) {\r
-                                               $this->debug('hNews: found entry-content');\r
-                                               if ($elems->length == 1) {\r
-                                                       // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)\r
-                                                       $e = $elems->item(0);\r
-                                                       if (($e->tagName == 'img') || (trim($e->textContent) != '')) {\r
-                                                               $this->body = $elems->item(0);\r
-                                                               // prune (clean up elements that may not be content)\r
-                                                               if ($this->config->prune()) {\r
-                                                                       $this->debug('Pruning content');\r
-                                                                       $this->readability->prepArticle($this->body);\r
-                                                               }\r
-                                                               $detect_body = false;\r
-                                                       } else {\r
-                                                               $this->debug('hNews: skipping entry-content - appears not to contain content');\r
-                                                       }\r
-                                                       unset($e);\r
-                                               } else {\r
-                                                       $this->body = $this->readability->dom->createElement('div');\r
-                                                       $this->debug($elems->length.' entry-content elems found');\r
-                                                       foreach ($elems as $elem) {\r
-                                                               if (!isset($elem->parentNode)) continue;\r
-                                                               $isDescendant = false;\r
-                                                               foreach ($this->body->childNodes as $parent) {\r
-                                                                       if ($this->isDescendant($parent, $elem)) {\r
-                                                                               $isDescendant = true;\r
-                                                                               break;\r
-                                                                       }\r
-                                                               }\r
-                                                               if ($isDescendant) {\r
-                                                                       $this->debug('Element is child of another body element, skipping.');\r
-                                                               } else {\r
-                                                                       // prune (clean up elements that may not be content)\r
-                                                                       if ($this->config->prune()) {\r
-                                                                               $this->debug('Pruning content');\r
-                                                                               $this->readability->prepArticle($elem);\r
-                                                                       }                                                               \r
-                                                                       $this->debug('Element added to body');                                                                  \r
-                                                                       $this->body->appendChild($elem);\r
-                                                               }\r
-                                                       }\r
-                                                       $detect_body = false;\r
-                                               }\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-\r
-               // check for elements marked with instapaper_title\r
-               if ($detect_title) {\r
-                       // check for instapaper_title\r
-                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->title = $elems->item(0)->textContent;\r
-                               $this->debug('Title found (.instapaper_title): '.$this->title);\r
-                               // remove title from document\r
-                               $elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                               $detect_title = false;\r
-                       }\r
-               }\r
-               // check for elements marked with instapaper_body\r
-               if ($detect_body) {\r
-                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);\r
-                       if ($elems && $elems->length > 0) {\r
-                               $this->debug('body found (.instapaper_body)');\r
-                               $this->body = $elems->item(0);\r
-                               // prune (clean up elements that may not be content)\r
-                               if ($this->config->prune()) {\r
-                                       $this->debug('Pruning content');\r
-                                       $this->readability->prepArticle($this->body);\r
-                               }\r
-                               $detect_body = false;\r
-                       }\r
-               }\r
-               \r
-               // Find author in rel="author" marked element\r
-               // We only use this if there's exactly one.\r
-               // If there's more than one, it could indicate more than\r
-               // one author, but it could also indicate that we're processing\r
-               // a page listing different articles with different authors.\r
-               if ($detect_author) {\r
-                       $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);\r
-                       if ($elems && $elems->length == 1) {\r
-                               $author = trim($elems->item(0)->textContent);\r
-                               if ($author != '') {\r
-                                       $this->debug("Author found (rel=\"author\"): $author");\r
-                                       $this->author[] = $author;\r
-                                       $detect_author = false;\r
-                               }\r
-                       }\r
-               }\r
-\r
-               // Find date in pubdate marked time element\r
-               // For the same reason given above, we only use this\r
-               // if there's exactly one element.\r
-               if ($detect_date) {\r
-                       $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);\r
-                       if ($elems && $elems->length == 1) {\r
-                               $this->date = strtotime(trim($elems->item(0)->textContent));\r
-                               // remove date from document\r
-                               //$elems->item(0)->parentNode->removeChild($elems->item(0));\r
-                               if ($this->date) {\r
-                                       $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));\r
-                                       $detect_date = false;\r
-                               } else {\r
-                                       $this->date = null;\r
-                               }\r
-                       }\r
-               }\r
-\r
-               // still missing title or body, so we detect using Readability\r
-               if ($detect_title || $detect_body) {\r
-                       $this->debug('Using Readability');\r
-                       // clone body if we're only using Readability for title (otherwise it may interfere with body element)\r
-                       if (isset($this->body)) $this->body = $this->body->cloneNode(true);\r
-                       $success = $this->readability->init();\r
-               }\r
-               if ($detect_title) {\r
-                       $this->debug('Detecting title');\r
-                       $this->title = $this->readability->getTitle()->textContent;\r
-               }\r
-               if ($detect_body && $success) {\r
-                       $this->debug('Detecting body');\r
-                       $this->body = $this->readability->getContent();\r
-                       if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {\r
-                               $this->body = $this->body->firstChild;\r
-                       }\r
-                       // prune (clean up elements that may not be content)\r
-                       if ($this->config->prune()) {\r
-                               $this->debug('Pruning content');\r
-                               $this->readability->prepArticle($this->body);\r
-                       }\r
-               }\r
-               if (isset($this->body)) {\r
-                       // remove scripts\r
-                       $this->readability->removeScripts($this->body);\r
-                       // remove any h1-h6 elements that appear as first thing in the body\r
-                       // and which match our title\r
-                       if (isset($this->title) && ($this->title != '')) {\r
-                               $firstChild = $this->body->firstChild;\r
-                               while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {\r
-                                       $firstChild = $firstChild->nextSibling;\r
-                               }\r
-                               if (($firstChild->nodeType === XML_ELEMENT_NODE)\r
-                                       && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))\r
-                                       && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {\r
-                                               $this->body->removeChild($firstChild);\r
-                               }\r
-                       }\r
-                       // prevent self-closing iframes\r
-                       $elems = $this->body->getElementsByTagName('iframe');\r
-                       for ($i = $elems->length-1; $i >= 0; $i--) {\r
-                               $e = $elems->item($i);\r
-                               if (!$e->hasChildNodes()) {\r
-                                       $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));\r
-                               }\r
-                       }\r
-                       // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/\r
-                       // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src\r
-                       // inside the data-lazy-src attribute. It also places the original image inside a noscript element \r
-                       // next to the amended one.\r
-                       $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);\r
-                       for ($i = $elems->length-1; $i >= 0; $i--) {\r
-                               $e = $elems->item($i);\r
-                               // let's see if we can grab image from noscript\r
-                               if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {\r
-                                       $_new_elem = $e->ownerDocument->createDocumentFragment();\r
-                                       @$_new_elem->appendXML($e->nextSibling->innerHTML);\r
-                                       $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);\r
-                                       $e->parentNode->removeChild($e);\r
-                               } else {\r
-                                       // Use data-lazy-src as src value\r
-                                       $e->setAttribute('src', $e->getAttribute('data-lazy-src'));\r
-                                       $e->removeAttribute('data-lazy-src');\r
-                               }\r
-                       }\r
-               \r
-                       $this->success = true;\r
-               }\r
-               \r
-               // if we've had no success and we've used tidy, there's a chance\r
-               // that tidy has messed up. So let's try again without tidy...\r
-               if (!$this->success && $tidied && $smart_tidy) {\r
-                       $this->debug('Trying again without tidy');\r
-                       $this->process($original_html, $url, false);\r
-               }\r
-\r
-               return $this->success;\r
-       }\r
-       \r
-       private function isDescendant(DOMElement $parent, DOMElement $child) {\r
-               $node = $child->parentNode;\r
-               while ($node != null) {\r
-                       if ($node->isSameNode($parent)) return true;\r
-                       $node = $node->parentNode;\r
-               }\r
-               return false;\r
-       }\r
-\r
-       public function getContent() {\r
-               return $this->body;\r
-       }\r
-       \r
-       public function getTitle() {\r
-               return $this->title;\r
-       }\r
-       \r
-       public function getAuthors() {\r
-               return $this->author;\r
-       }\r
-       \r
-       public function getLanguage() {\r
-               return $this->language;\r
-       }\r
-       \r
-       public function getDate() {\r
-               return $this->date;\r
-       }\r
-       \r
-       public function getSiteConfig() {\r
-               return $this->config;\r
-       }\r
-       \r
-       public function getNextPageUrl() {\r
-               return $this->nextPageUrl;\r
-       }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Content Extractor
+ * 
+ * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 
+ * to extract content from HTML files.
+ * 
+ * @version 1.0
+ * @date 2013-02-05
+ * @author Keyvan Minoukadeh
+ * @copyright 2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class ContentExtractor
+{
+       protected static $tidy_config = array(
+                                'clean' => true,
+                                'output-xhtml' => true,
+                                'logical-emphasis' => true,
+                                'show-body-only' => false,
+                                'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
+                                'new-inline-tags' => 'mark, time, meter, progress, data',
+                                'wrap' => 0,
+                                'drop-empty-paras' => true,
+                                'drop-proprietary-attributes' => false,
+                                'enclose-text' => true,
+                                'enclose-block-text' => true,
+                                'merge-divs' => true,
+                                'merge-spans' => true,
+                                'char-encoding' => 'utf8',
+                                'hide-comments' => true
+                                );
+       protected $html;
+       protected $config;
+       protected $title;
+       protected $author = array();
+       protected $language;
+       protected $date;
+       protected $body;
+       protected $success = false;
+       protected $nextPageUrl;
+       public $allowedParsers = array('libxml', 'html5lib');
+       public $fingerprints = array();
+       public $readability;
+       public $debug = false;
+       public $debugVerbose = false;
+
+       /**
+        * Create an extractor and tell SiteConfig where to look for site config files.
+        *
+        * Both arguments are passed straight through to SiteConfig::set_config_path().
+        *
+        * @param mixed $path primary site config path
+        * @param mixed $fallback optional fallback path (null when there is none)
+        */
+       public function __construct($path, $fallback = null)
+       {
+               SiteConfig::set_config_path($path, $fallback);
+       }
+       
+       /**
+        * Echo a debug message (prefixed with '* ') when $this->debug is enabled.
+        *
+        * When $this->debugVerbose is also set, current and peak memory usage (in KB)
+        * are appended to the line. Output is flushed immediately so that messages
+        * show up while processing is still running.
+        *
+        * @param string $msg message to print
+        * @return void
+        */
+       protected function debug($msg) {
+               if (!$this->debug) return;
+               echo '* ',$msg;
+               if ($this->debugVerbose) {
+                       // only gather memory statistics when they are actually printed
+                       $mem = round(memory_get_usage()/1024, 2);
+                       $memPeak = round(memory_get_peak_usage()/1024, 2);
+                       echo ' - mem used: ',$mem," (peak: $memPeak)";
+               }
+               echo "\n";
+               // ob_flush() raises an E_NOTICE when no output buffer is active,
+               // so only call it when there is a buffer to flush
+               if (ob_get_level() > 0) ob_flush();
+               flush();
+       }
+       
+       /**
+        * Clear everything left over from a previous extraction so the
+        * instance can be used again: extracted values are set back to null,
+        * the author list is emptied and the success flag is cleared.
+        *
+        * @return void
+        */
+       public function reset() {
+               $nullable = array(
+                       'html', 'readability', 'config', 'title',
+                       'body', 'language', 'date', 'nextPageUrl'
+               );
+               foreach ($nullable as $property) {
+                       $this->$property = null;
+               }
+               $this->author = array();
+               $this->success = false;
+       }
+
+       /**
+        * Search the HTML for a known fingerprint and return the host name it maps to.
+        *
+        * $this->fingerprints is keyed by the fingerprint string. Each value is either
+        * the host name itself, or an array with a 'hostname' entry and an optional
+        * truthy 'head' entry; when 'head' is set, only the first 8000 bytes of the
+        * HTML are searched. The first matching fingerprint wins.
+        *
+        * @param string $html HTML to inspect
+        * @return string|false matching host name, or false when nothing matches
+        */
+       public function findHostUsingFingerprints($html) {
+               $this->debug('Checking fingerprints...');
+               $head = (string)substr($html, 0, 8000);
+               foreach ($this->fingerprints as $_fp => $_fphost) {
+                       $haystack = $html;
+                       if (is_array($_fphost)) {
+                               // an entry without a host name can never produce a usable result
+                               if (!isset($_fphost['hostname'])) continue;
+                               if (!empty($_fphost['head'])) $haystack = $head;
+                               $_fphost = $_fphost['hostname'];
+                       }
+                       // PHP turns numeric-looking array keys into integers, and an integer
+                       // needle is treated as a character ordinal by strpos() before PHP 8
+                       $_fp = (string)$_fp;
+                       // an empty needle would match every document (PHP 8) or raise a warning (older versions)
+                       if ($_fp === '') continue;
+                       if (strpos($haystack, $_fp) !== false) {
+                               $this->debug("Found match: $_fphost");
+                               return $_fphost;
+                       }
+               }
+               $this->debug('No fingerprint matches');
+               return false;
+       }
+       
+       /**
+        * Build the site config to use for a URL.
+        *
+        * Returns a SiteConfig instance (joined in order: exact match, wildcard,
+        * fingerprint, global, default). A previously merged config cached under
+        * "<host>.merged" is returned as is. The fingerprint and global configs are
+        * only appended while the config built so far reports autodetect_on_failure().
+        *
+        * @param string $url URL whose host name selects the site config (a leading "www." is ignored)
+        * @param string $html HTML used for fingerprint matching
+        * @param bool $add_to_cache whether the configs built here are added to the SiteConfig cache
+        * @return SiteConfig
+        */
+       public function buildSiteConfig($url, $html='', $add_to_cache=true) {
+               // extract host name
+               // parse_url() gives null/false when the URL has no usable host, so
+               // normalise to a string before using string functions on it
+               $host = strtolower((string)@parse_url($url, PHP_URL_HOST));
+               if (substr($host, 0, 4) == 'www.') $host = (string)substr($host, 4);
+               // is merged version already cached?
+               if (SiteConfig::is_cached("$host.merged")) {
+                       $this->debug("Returning cached and merged site config for $host");
+                       return SiteConfig::build("$host.merged");
+               }
+               // let's build from site_config/custom/ and standard/
+               $config = SiteConfig::build($host);
+               if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
+                       SiteConfig::add_to_cache($host, $config);
+               }
+               // if no match, use defaults
+               if (!$config) $config = new SiteConfig();
+               // load fingerprint config?
+               if ($config->autodetect_on_failure()) {
+                       // check HTML for fingerprints
+                       if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
+                               if ($config_fingerprint = SiteConfig::build($_fphost)) {
+                                       $this->debug("Appending site config settings from $_fphost (fingerprint match)");
+                                       $config->append($config_fingerprint);
+                                       if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
+                                               //$config_fingerprint->cache_in_apc = true;
+                                               SiteConfig::add_to_cache($_fphost, $config_fingerprint);
+                                       }
+                               }
+                       }
+               }
+               // load global config?
+               // (checked again on purpose: this runs after the fingerprint config was appended)
+               if ($config->autodetect_on_failure()) {
+                       if ($config_global = SiteConfig::build('global', true)) {
+                               $this->debug('Appending site config settings from global.txt');
+                               $config->append($config_global);
+                               if ($add_to_cache && !SiteConfig::is_cached('global')) {
+                                       //$config_global->cache_in_apc = true;
+                                       SiteConfig::add_to_cache('global', $config_global);
+                               }
+                       }
+               }
+               // store copy of merged config
+               if ($add_to_cache) {
+                       // do not store in APC if wildcard match
+                       // (must be read before cache_key is cleared below)
+                       $use_apc = ($host == $config->cache_key);
+                       $config->cache_key = null;
+                       SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
+               }
+               return $config;
+       }
+       
+       // returns true on success, false on failure
+       // $smart_tidy indicates that if tidy is used and no results are produced, we will
+       // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
+       // but it has problems of its own which we try to avoid with this option.
+       public function process($html, $url, $smart_tidy=true) {
+               $this->reset();
+               $this->config = $this->buildSiteConfig($url, $html);
+               
+               // do string replacements
+               if (!empty($this->config->find_string)) {
+                       if (count($this->config->find_string) == count($this->config->replace_string)) {
+                               $html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
+                               $this->debug("Strings replaced: $_count (find_string and/or replace_string)");
+                       } else {
+                               $this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
+                       }
+                       unset($_count);
+               }
+               
+               // use tidy (if it exists)?
+               // This fixes problems with some sites which would otherwise
+               // trouble DOMDocument's HTML parsing. (Although sometimes it
+               // makes matters worse, which is why you can override it in site config files.)
+               $tidied = false;
+               if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
+                       $this->debug('Using Tidy');
+                       $tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
+                       if (tidy_clean_repair($tidy)) {
+                               $original_html = $html;
+                               $tidied = true;
+                               $html = $tidy->value;
+                       }
+                       unset($tidy);
+               }
+               
+               // load and parse html
+               $_parser = $this->config->parser();
+               if (!in_array($_parser, $this->allowedParsers)) {
+                       $this->debug("HTML parser $_parser not listed, using libxml instead");
+                       $_parser = 'libxml';
+               }
+               $this->debug("Attempting to parse HTML with $_parser");
+               $this->readability = new Readability($html, $url, $_parser);
+               
+               // we use xpath to find elements in the given HTML document
+               // see http://en.wikipedia.org/wiki/XPath_1.0
+               $xpath = new DOMXPath($this->readability->dom);
+
+               // try to get next page link
+               foreach ($this->config->next_page_link as $pattern) {
+                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);
+                       if (is_string($elems)) {
+                               $this->nextPageUrl = trim($elems);
+                               break;
+                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+                               foreach ($elems as $item) {
+                                       if ($item instanceof DOMElement && $item->hasAttribute('href')) {
+                                               $this->nextPageUrl = $item->getAttribute('href');
+                                               break 2;
+                                       } elseif ($item instanceof DOMAttr && $item->value) {
+                                               $this->nextPageUrl = $item->value;
+                                               break 2;
+                                       }
+                               }
+                       }
+               }
+               
+               // try to get title
+               foreach ($this->config->title as $pattern) {
+                       // $this->debug("Trying $pattern");
+                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);
+                       if (is_string($elems)) {
+                               $this->title = trim($elems);
+                               $this->debug('Title expression evaluated as string: '.$this->title);
+                               $this->debug("...XPath match: $pattern");
+                               break;
+                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+                               $this->title = $elems->item(0)->textContent;
+                               $this->debug('Title matched: '.$this->title);
+                               $this->debug("...XPath match: $pattern");
+                               // remove title from document
+                               try {
+                                       @$elems->item(0)->parentNode->removeChild($elems->item(0));
+                               } catch (DOMException $e) {
+                                       // do nothing
+                               }
+                               break;
+                       }
+               }
+               
+               // try to get author (if it hasn't already been set)
+               if (empty($this->author)) {
+                       foreach ($this->config->author as $pattern) {
+                               $elems = @$xpath->evaluate($pattern, $this->readability->dom);
+                               if (is_string($elems)) {
+                                       if (trim($elems) != '') {
+                                               $this->author[] = trim($elems);
+                                               $this->debug('Author expression evaluated as string: '.trim($elems));
+                                               $this->debug("...XPath match: $pattern");
+                                               break;
+                                       }
+                               } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+                                       foreach ($elems as $elem) {
+                                               if (!isset($elem->parentNode)) continue;
+                                               $this->author[] = trim($elem->textContent);
+                                               $this->debug('Author matched: '.trim($elem->textContent));
+                                       }
+                                       if (!empty($this->author)) {
+                                               $this->debug("...XPath match: $pattern");
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               
+               // try to get language
+               $_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
+               foreach ($_lang_xpath as $pattern) {
+                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);
+                       if (is_string($elems)) {
+                               if (trim($elems) != '') {
+                                       $this->language = trim($elems);
+                                       $this->debug('Language matched: '.$this->language);
+                                       break;
+                               }
+                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+                               foreach ($elems as $elem) {
+                                       if (!isset($elem->parentNode)) continue;
+                                       $this->language = trim($elem->textContent);
+                                       $this->debug('Language matched: '.$this->language);                                     
+                               }
+                               if ($this->language) break;
+                       }
+               }
+               
+               // try to get date
+               foreach ($this->config->date as $pattern) {
+                       $elems = @$xpath->evaluate($pattern, $this->readability->dom);
+                       if (is_string($elems)) {
+                               $this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));                                
+                       } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+                               $this->date = $elems->item(0)->textContent;
+                               $this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
+                               // remove date from document
+                               // $elems->item(0)->parentNode->removeChild($elems->item(0));
+                       }
+                       if (!$this->date) {
+                               $this->date = null;
+                       } else {
+                               $this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
+                               $this->debug("...XPath match: $pattern");
+                               break;
+                       }
+               }
+
+               // strip elements (using xpath expressions)
+               foreach ($this->config->strip as $pattern) {
+                       $elems = @$xpath->query($pattern, $this->readability->dom);
+                       // check for matches
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('Stripping '.$elems->length.' elements (strip)');
+                               for ($i=$elems->length-1; $i >= 0; $i--) {
+                                       $elems->item($i)->parentNode->removeChild($elems->item($i));
+                               }
+                       }
+               }
+               
+               // strip elements (using id and class attribute values)
+               foreach ($this->config->strip_id_or_class as $string) {
+                       $string = strtr($string, array("'"=>'', '"'=>''));
+                       $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
+                       // check for matches
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
+                               for ($i=$elems->length-1; $i >= 0; $i--) {
+                                       $elems->item($i)->parentNode->removeChild($elems->item($i));
+                               }
+                       }
+               }
+               
+               // strip images (using src attribute values)
+               foreach ($this->config->strip_image_src as $string) {
+                       $string = strtr($string, array("'"=>'', '"'=>''));
+                       $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
+                       // check for matches
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('Stripping '.$elems->length.' image elements');
+                               for ($i=$elems->length-1; $i >= 0; $i--) {
+                                       $elems->item($i)->parentNode->removeChild($elems->item($i));
+                               }
+                       }
+               }
+               // strip elements using Readability.com and Instapaper.com ignore class names
+               // .entry-unrelated and .instapaper_ignore
+               // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
+               // and http://blog.instapaper.com/post/730281947
+               $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
+               // check for matches
+               if ($elems && $elems->length > 0) {
+                       $this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
+                       for ($i=$elems->length-1; $i >= 0; $i--) {
+                               $elems->item($i)->parentNode->removeChild($elems->item($i));
+                       }
+               }
+               
+               // strip elements that contain style="display: none;"
+               $elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
+               // check for matches
+               if ($elems && $elems->length > 0) {
+                       $this->debug('Stripping '.$elems->length.' elements with inline display:none style');
+                       for ($i=$elems->length-1; $i >= 0; $i--) {
+                               $elems->item($i)->parentNode->removeChild($elems->item($i));
+                       }
+               }
+               
+               // try to get body
+               foreach ($this->config->body as $pattern) {
+                       $elems = @$xpath->query($pattern, $this->readability->dom);
+                       // check for matches
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('Body matched');
+                               $this->debug("...XPath match: $pattern");
+                               if ($elems->length == 1) {                              
+                                       $this->body = $elems->item(0);
+                                       // prune (clean up elements that may not be content)
+                                       if ($this->config->prune()) {
+                                               $this->debug('...pruning content');
+                                               $this->readability->prepArticle($this->body);
+                                       }
+                                       break;
+                               } else {
+                                       $this->body = $this->readability->dom->createElement('div');
+                                       $this->debug($elems->length.' body elems found');
+                                       foreach ($elems as $elem) {
+                                               if (!isset($elem->parentNode)) continue;
+                                               $isDescendant = false;
+                                               foreach ($this->body->childNodes as $parent) {
+                                                       if ($this->isDescendant($parent, $elem)) {
+                                                               $isDescendant = true;
+                                                               break;
+                                                       }
+                                               }
+                                               if ($isDescendant) {
+                                                       $this->debug('...element is child of another body element, skipping.');
+                                               } else {
+                                                       // prune (clean up elements that may not be content)
+                                                       if ($this->config->prune()) {
+                                                               $this->debug('Pruning content');
+                                                               $this->readability->prepArticle($elem);
+                                                       }
+                                                       $this->debug('...element added to body');
+                                                       $this->body->appendChild($elem);
+                                               }
+                                       }
+                                       if ($this->body->hasChildNodes()) break;
+                               }
+                       }
+               }               
+               
+               // auto detect?
+               $detect_title = $detect_body = $detect_author = $detect_date = false;
+               // detect title?
+               if (!isset($this->title)) {
+                       if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
+                               $detect_title = true;
+                       }
+               }
+               // detect body?
+               if (!isset($this->body)) {
+                       if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
+                               $detect_body = true;
+                       }
+               }
+               // detect author?
+               if (empty($this->author)) {
+                       if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
+                               $detect_author = true;
+                       }
+               }
+               // detect date?
+               if (!isset($this->date)) {
+                       if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
+                               $detect_date = true;
+                       }
+               }
+
+               // check for hNews
+               if ($detect_title || $detect_body) {
+                       // check for hentry
+                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('hNews: found hentry');
+                               $hentry = $elems->item(0);
+                               
+                               if ($detect_title) {
+                                       // check for entry-title
+                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
+                                       if ($elems && $elems->length > 0) {
+                                               $this->title = $elems->item(0)->textContent;
+                                               $this->debug('hNews: found entry-title: '.$this->title);
+                                               // remove title from document
+                                               $elems->item(0)->parentNode->removeChild($elems->item(0));
+                                               $detect_title = false;
+                                       }
+                               }
+                               
+                               if ($detect_date) {
+                                       // check for time element with pubdate attribute
+                                       $elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
+                                       if ($elems && $elems->length > 0) {
+                                               $this->date = strtotime(trim($elems->item(0)->textContent));
+                                               // remove date from document
+                                               //$elems->item(0)->parentNode->removeChild($elems->item(0));
+                                               if ($this->date) {
+                                                       $this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
+                                                       $detect_date = false;
+                                               } else {
+                                                       $this->date = null;
+                                               }
+                                       }
+                               }
+
+                               if ($detect_author) {
+                                       // check for time element with pubdate attribute
+                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
+                                       if ($elems && $elems->length > 0) {
+                                               $author = $elems->item(0);
+                                               $fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
+                                               if ($fn && $fn->length > 0) {
+                                                       foreach ($fn as $_fn) {
+                                                               if (trim($_fn->textContent) != '') {
+                                                                       $this->author[] = trim($_fn->textContent);
+                                                                       $this->debug('hNews: found author: '.trim($_fn->textContent));
+                                                               }
+                                                       }
+                                               } else {
+                                                       if (trim($author->textContent) != '') {
+                                                               $this->author[] = trim($author->textContent);
+                                                               $this->debug('hNews: found author: '.trim($author->textContent));
+                                                       }
+                                               }
+                                               $detect_author = empty($this->author);
+                                       }
+                               }
+                               
+                               // check for entry-content.
+                               // according to hAtom spec, if there are multiple elements marked entry-content,
+                               // we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
+                               if ($detect_body) {
+                                       $elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
+                                       if ($elems && $elems->length > 0) {
+                                               $this->debug('hNews: found entry-content');
+                                               if ($elems->length == 1) {
+                                                       // what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
+                                                       $e = $elems->item(0);
+                                                       if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
+                                                               $this->body = $elems->item(0);
+                                                               // prune (clean up elements that may not be content)
+                                                               if ($this->config->prune()) {
+                                                                       $this->debug('Pruning content');
+                                                                       $this->readability->prepArticle($this->body);
+                                                               }
+                                                               $detect_body = false;
+                                                       } else {
+                                                               $this->debug('hNews: skipping entry-content - appears not to contain content');
+                                                       }
+                                                       unset($e);
+                                               } else {
+                                                       $this->body = $this->readability->dom->createElement('div');
+                                                       $this->debug($elems->length.' entry-content elems found');
+                                                       foreach ($elems as $elem) {
+                                                               if (!isset($elem->parentNode)) continue;
+                                                               $isDescendant = false;
+                                                               foreach ($this->body->childNodes as $parent) {
+                                                                       if ($this->isDescendant($parent, $elem)) {
+                                                                               $isDescendant = true;
+                                                                               break;
+                                                                       }
+                                                               }
+                                                               if ($isDescendant) {
+                                                                       $this->debug('Element is child of another body element, skipping.');
+                                                               } else {
+                                                                       // prune (clean up elements that may not be content)
+                                                                       if ($this->config->prune()) {
+                                                                               $this->debug('Pruning content');
+                                                                               $this->readability->prepArticle($elem);
+                                                                       }                                                               
+                                                                       $this->debug('Element added to body');                                                                  
+                                                                       $this->body->appendChild($elem);
+                                                               }
+                                                       }
+                                                       $detect_body = false;
+                                               }
+                                       }
+                               }
+                       }
+               }
+
+               // check for elements marked with instapaper_title
+               if ($detect_title) {
+                       // check for instapaper_title
+                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
+                       if ($elems && $elems->length > 0) {
+                               $this->title = $elems->item(0)->textContent;
+                               $this->debug('Title found (.instapaper_title): '.$this->title);
+                               // remove title from document
+                               $elems->item(0)->parentNode->removeChild($elems->item(0));
+                               $detect_title = false;
+                       }
+               }
+               // check for elements marked with instapaper_body
+               if ($detect_body) {
+                       $elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
+                       if ($elems && $elems->length > 0) {
+                               $this->debug('body found (.instapaper_body)');
+                               $this->body = $elems->item(0);
+                               // prune (clean up elements that may not be content)
+                               if ($this->config->prune()) {
+                                       $this->debug('Pruning content');
+                                       $this->readability->prepArticle($this->body);
+                               }
+                               $detect_body = false;
+                       }
+               }
+               
+               // Find author in rel="author" marked element
+               // We only use this if there's exactly one.
+               // If there's more than one, it could indicate more than
+               // one author, but it could also indicate that we're processing
+               // a page listing different articles with different authors.
+               if ($detect_author) {
+                       $elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
+                       if ($elems && $elems->length == 1) {
+                               $author = trim($elems->item(0)->textContent);
+                               if ($author != '') {
+                                       $this->debug("Author found (rel=\"author\"): $author");
+                                       $this->author[] = $author;
+                                       $detect_author = false;
+                               }
+                       }
+               }
+
+               // Find date in pubdate marked time element
+               // For the same reason given above, we only use this
+               // if there's exactly one element.
+               if ($detect_date) {
+                       $elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
+                       if ($elems && $elems->length == 1) {
+                               $this->date = strtotime(trim($elems->item(0)->textContent));
+                               // remove date from document
+                               //$elems->item(0)->parentNode->removeChild($elems->item(0));
+                               if ($this->date) {
+                                       $this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
+                                       $detect_date = false;
+                               } else {
+                                       $this->date = null;
+                               }
+                       }
+               }
+
+               // still missing title or body, so we detect using Readability
+               if ($detect_title || $detect_body) {
+                       $this->debug('Using Readability');
+                       // clone body if we're only using Readability for title (otherwise it may interfere with body element)
+                       if (isset($this->body)) $this->body = $this->body->cloneNode(true);
+                       $success = $this->readability->init();
+               }
+               if ($detect_title) {
+                       $this->debug('Detecting title');
+                       $this->title = $this->readability->getTitle()->textContent;
+               }
+               if ($detect_body && $success) {
+                       $this->debug('Detecting body');
+                       $this->body = $this->readability->getContent();
+                       if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
+                               $this->body = $this->body->firstChild;
+                       }
+                       // prune (clean up elements that may not be content)
+                       if ($this->config->prune()) {
+                               $this->debug('Pruning content');
+                               $this->readability->prepArticle($this->body);
+                       }
+               }
+               if (isset($this->body)) {
+                       // remove scripts
+                       $this->readability->removeScripts($this->body);
+                       // remove any h1-h6 elements that appear as first thing in the body
+                       // and which match our title
+                       if (isset($this->title) && ($this->title != '')) {
+                               $firstChild = $this->body->firstChild;
+                               while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
+                                       $firstChild = $firstChild->nextSibling;
+                               }
+                               if (($firstChild->nodeType === XML_ELEMENT_NODE)
+                                       && in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
+                                       && (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
+                                               $this->body->removeChild($firstChild);
+                               }
+                       }
+                       // prevent self-closing iframes
+                       $elems = $this->body->getElementsByTagName('iframe');
+                       for ($i = $elems->length-1; $i >= 0; $i--) {
+                               $e = $elems->item($i);
+                               if (!$e->hasChildNodes()) {
+                                       $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
+                               }
+                       }
+                       // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
+                       // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
+                       // inside the data-lazy-src attribute. It also places the original image inside a noscript element 
+                       // next to the amended one.
+                       $elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
+                       for ($i = $elems->length-1; $i >= 0; $i--) {
+                               $e = $elems->item($i);
+                               // let's see if we can grab image from noscript
+                               if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
+                                       $_new_elem = $e->ownerDocument->createDocumentFragment();
+                                       @$_new_elem->appendXML($e->nextSibling->innerHTML);
+                                       $e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
+                                       $e->parentNode->removeChild($e);
+                               } else {
+                                       // Use data-lazy-src as src value
+                                       $e->setAttribute('src', $e->getAttribute('data-lazy-src'));
+                                       $e->removeAttribute('data-lazy-src');
+                               }
+                       }
+               
+                       $this->success = true;
+               }
+               
+               // if we've had no success and we've used tidy, there's a chance
+               // that tidy has messed up. So let's try again without tidy...
+               if (!$this->success && $tidied && $smart_tidy) {
+                       $this->debug('Trying again without tidy');
+                       $this->process($original_html, $url, false);
+               }
+
+               return $this->success;
+       }
+       
+       /**
+        * Tell whether $child lies somewhere beneath $parent in the DOM tree.
+        *
+        * The walk starts at the child's parent, so a node is never reported
+        * as a descendant of itself.
+        *
+        * @param DOMElement $parent candidate ancestor
+        * @param DOMElement $child  element whose ancestor chain is inspected
+        * @return bool true if $parent is found among the ancestors of $child
+        */
+       private function isDescendant(DOMElement $parent, DOMElement $child) {
+               // climb the ancestor chain until we either meet $parent or run out of nodes
+               for ($ancestor = $child->parentNode; $ancestor != null; $ancestor = $ancestor->parentNode) {
+                       if ($ancestor->isSameNode($parent)) {
+                               return true;
+                       }
+               }
+               return false;
+       }
+
+       /**
+        * Return the extracted content block held in $this->body.
+        *
+        * process() stores a DOM node here (it calls DOM methods such as
+        * getElementsByTagName() on it). The property may be unset/null when no
+        * body was found - process() itself guards its use with isset().
+        */
+       public function getContent() {
+               return $this->body;
+       }
+       
+       /**
+        * Return the title held in $this->title.
+        *
+        * When process() falls back to Readability for the title, this is the
+        * text content of the title element Readability detected.
+        */
+       public function getTitle() {
+               return $this->title;
+       }
+       
+       /**
+        * Return the author names collected in $this->author.
+        *
+        * process() appends trimmed, non-empty name strings to this property
+        * (for example from a single rel="author" link).
+        */
+       public function getAuthors() {
+               return $this->author;
+       }
+       
+       /**
+        * Return $this->language.
+        *
+        * NOTE(review): the assignment is not in this part of the file; presumably
+        * a language code detected from the page - confirm against process().
+        */
+       public function getLanguage() {
+               return $this->language;
+       }
+       
+       /**
+        * Return the publication date held in $this->date.
+        *
+        * process() stores the result of strtotime() here (a Unix timestamp) and
+        * resets the property to null when the date text could not be parsed.
+        */
+       public function getDate() {
+               return $this->date;
+       }
+       
+       /**
+        * Return $this->config, the configuration object consulted during
+        * extraction (process() calls its prune() method).
+        *
+        * NOTE(review): presumably a SiteConfig instance - confirm where it is assigned.
+        */
+       public function getSiteConfig() {
+               return $this->config;
+       }
+       
+       /**
+        * Return $this->nextPageUrl.
+        *
+        * NOTE(review): the assignment is not in this part of the file; presumably
+        * the URL of the following page of a multi-page article, when one was
+        * found - confirm against process().
+        */
+       public function getNextPageUrl() {
+               return $this->nextPageUrl;
+       }
+}
\ No newline at end of file
index c5e300d7a1e5d0ff7bdf175fe94a91e2796f2bcb..1f6a7603bd2b0b3d9dcdd14d6fb04b37fac1d887 100644 (file)
-<?php\r
-/**\r
- * Site Config\r
- * \r
- * Each instance of this class should hold extraction patterns and other directives\r
- * for a website. See ContentExtractor class to see how it's used.\r
- * \r
- * @version 0.7\r
- * @date 2012-08-27\r
- * @author Keyvan Minoukadeh\r
- * @copyright 2012 Keyvan Minoukadeh\r
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
- */\r
-\r
-class SiteConfig\r
-{\r
-       // Use first matching element as title (0 or more xpath expressions)\r
-       public $title = array();\r
-       \r
-       // Use first matching element as body (0 or more xpath expressions)\r
-       public $body = array();\r
-       \r
-       // Use first matching element as author (0 or more xpath expressions)\r
-       public $author = array();\r
-       \r
-       // Use first matching element as date (0 or more xpath expressions)\r
-       public $date = array();\r
-       \r
-       // Strip elements matching these xpath expressions (0 or more)\r
-       public $strip = array();\r
-       \r
-       // Strip elements which contain these strings (0 or more) in the id or class attribute \r
-       public $strip_id_or_class = array();\r
-       \r
-       // Strip images which contain these strings (0 or more) in the src attribute \r
-       public $strip_image_src = array();\r
-       \r
-       // Additional HTTP headers to send\r
-       // NOT YET USED\r
-       public $http_header = array();\r
-       \r
-       // Process HTML with tidy before creating DOM (bool or null if undeclared)\r
-       public $tidy = null;\r
-       \r
-       protected $default_tidy = true; // used if undeclared\r
-       \r
-       // Autodetect title/body if xpath expressions fail to produce results.\r
-       // Note that this applies to title and body separately, ie. \r
-       //   * if we get a body match but no title match, this option will determine whether we autodetect title \r
-       //   * if neither match, this determines whether we autodetect title and body.\r
-       // Also note that this only applies when there is at least one xpath expression in title or body, ie.\r
-       //   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)\r
-       //   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.\r
-       // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).\r
-       // bool or null if undeclared\r
-       public $autodetect_on_failure = null;\r
-       protected $default_autodetect_on_failure = true; // used if undeclared\r
-       \r
-       // Clean up content block - attempt to remove elements that appear to be superfluous\r
-       // bool or null if undeclared\r
-       public $prune = null;\r
-       protected $default_prune = true; // used if undeclared\r
-       \r
-       // Test URL - if present, can be used to test the config above\r
-       public $test_url = array();\r
-       \r
-       // Single-page link - should identify a link element or URL pointing to the page holding the entire article\r
-       // This is useful for sites which split their articles across multiple pages. Links to such pages tend to \r
-       // display the first page with links to the other pages at the bottom. Often there is also a link to a page\r
-       // which displays the entire article on one page (e.g. 'print view').\r
-       // This should be an XPath expression identifying the link to that page. If present and we find a match,\r
-       // we will retrieve that page and the rest of the options in this config will be applied to the new page.\r
-       public $single_page_link = array();\r
-       \r
-       public $next_page_link = array();\r
-       \r
-       // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed\r
-       public $single_page_link_in_feed = array();\r
-       \r
-       // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')\r
-       // string or null if undeclared\r
-       public $parser = null;\r
-       protected $default_parser = 'libxml'; // used if undeclared\r
-       \r
-       // Strings to search for in HTML before processing begins (used with $replace_string)\r
-       public $find_string = array();\r
-       // Strings to replace those found in $find_string before HTML processing begins\r
-       public $replace_string = array();\r
-       \r
-       // the options below cannot be set in the config files which this class represents\r
-       \r
-       //public $cache_in_apc = false; // used to decide if we should cache in apc or not\r
-       public $cache_key = null;\r
-       public static $debug = false;\r
-       protected static $apc = false;\r
-       protected static $config_path;\r
-       protected static $config_path_fallback;\r
-       protected static $config_cache = array();\r
-       const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';\r
-       \r
-       protected static function debug($msg) {\r
-               if (self::$debug) {\r
-                       //$mem = round(memory_get_usage()/1024, 2);\r
-                       //$memPeak = round(memory_get_peak_usage()/1024, 2);\r
-                       echo '* ',$msg;\r
-                       //echo ' - mem used: ',$mem," (peak: $memPeak)\n";\r
-                       echo "\n";\r
-                       ob_flush();\r
-                       flush();\r
-               }\r
-       }\r
-       \r
-       // enable APC caching of certain site config files?\r
-       // If enabled the following site config files will be \r
-       // cached in APC cache (when requested for first time):\r
-       // * anything in site_config/custom/ and its corresponding file in site_config/standard/\r
-       // * the site config files associated with HTML fingerprints\r
-       // * the global site config file\r
-       // returns true if enabled, false otherwise\r
-       public static function use_apc($apc=true) {\r
-               if (!function_exists('apc_add')) {\r
-                       if ($apc) self::debug('APC will not be used (function apc_add does not exist)');\r
-                       return false;\r
-               }\r
-               self::$apc = $apc;\r
-               return $apc;\r
-       }\r
-       \r
-       // return bool or null\r
-       public function tidy($use_default=true) {\r
-               if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;\r
-               return $this->tidy;\r
-       }\r
-       \r
-       // return bool or null\r
-       public function prune($use_default=true) {\r
-               if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;\r
-               return $this->prune;\r
-       }\r
-       \r
-       // return string or null\r
-       public function parser($use_default=true) {\r
-               if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;\r
-               return $this->parser;\r
-       }\r
-\r
-       // return bool or null\r
-       public function autodetect_on_failure($use_default=true) {\r
-               if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;\r
-               return $this->autodetect_on_failure;\r
-       }\r
-       \r
-       public static function set_config_path($path, $fallback=null) {\r
-               self::$config_path = $path;\r
-               self::$config_path_fallback = $fallback;\r
-       }\r
-       \r
-       public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {\r
-               $key = strtolower($key);\r
-               if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);\r
-               if ($config->cache_key) $key = $config->cache_key;\r
-               self::$config_cache[$key] = $config;\r
-               if (self::$apc && $use_apc) {\r
-                       self::debug("Adding site config to APC cache with key sc.$key");\r
-                       apc_add("sc.$key", $config);\r
-               }\r
-               self::debug("Cached site config with key $key");\r
-       }\r
-       \r
-       public static function is_cached($key) {\r
-               $key = strtolower($key);\r
-               if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);\r
-               if (array_key_exists($key, self::$config_cache)) {\r
-                       return true;\r
-               } elseif (self::$apc && (bool)apc_fetch("sc.$key")) {\r
-                       return true;\r
-               }\r
-               return false;\r
-       }\r
-       \r
-       public function append(SiteConfig $newconfig) {\r
-               // check for commands where we accept multiple statements (no test_url)\r
-               foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) {\r
-                       // append array elements for this config variable from $newconfig to this config\r
-                       //$this->$var = $this->$var + $newconfig->$var;\r
-                       $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));\r
-               }\r
-               // check for single statement commands\r
-               // we do not overwrite existing non null values\r
-               foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {\r
-                       if ($this->$var === null) $this->$var = $newconfig->$var;\r
-               }\r
-       }\r
-       \r
-       // returns SiteConfig instance if an appropriate one is found, false otherwise\r
-       // if $exact_host_match is true, we will not look for wildcard config matches\r
-       // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists\r
-       public static function build($host, $exact_host_match=false) {\r
-               $host = strtolower($host);\r
-               if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);\r
-               if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;\r
-               // check for site configuration\r
-               $try = array($host);\r
-               // should we look for wildcard matches \r
-               if (!$exact_host_match) {\r
-                       $split = explode('.', $host);\r
-                       if (count($split) > 1) {\r
-                               array_shift($split);\r
-                               $try[] = '.'.implode('.', $split);\r
-                       }\r
-               }\r
-               \r
-               // look for site config file in primary folder\r
-               self::debug(". looking for site config for $host in primary folder");\r
-               foreach ($try as $h) {\r
-                       if (array_key_exists($h, self::$config_cache)) {\r
-                               self::debug("... site config for $h already loaded in this request");\r
-                               return self::$config_cache[$h];\r
-                       } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {\r
-                               self::debug("... site config for $h in APC cache");\r
-                               return $sconfig;\r
-                       } elseif (file_exists(self::$config_path."/$h.txt")) {\r
-                               self::debug("... found site config ($h.txt)");\r
-                               $file_primary = self::$config_path."/$h.txt";\r
-                               $matched_name = $h;\r
-                               break;\r
-                       }\r
-               }\r
-               \r
-               // if we found site config, process it\r
-               if (isset($file_primary)) {\r
-                       $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);\r
-                       if (!$config_lines || !is_array($config_lines)) return false;\r
-                       $config = self::build_from_array($config_lines);\r
-                       // if APC caching is available and enabled, mark this for cache\r
-                       //$config->cache_in_apc = true;\r
-                       $config->cache_key = $matched_name;\r
-                       \r
-                       // if autodetec on failure is off (on by default) we do not need to look\r
-                       // in secondary folder\r
-                       if (!$config->autodetect_on_failure()) {\r
-                               self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');\r
-                               return $config;\r
-                       }\r
-               }\r
-               \r
-               // look for site config file in secondary folder\r
-               if (isset(self::$config_path_fallback)) {\r
-                       self::debug(". looking for site config for $host in secondary folder");\r
-                       foreach ($try as $h) {\r
-                               if (file_exists(self::$config_path_fallback."/$h.txt")) {\r
-                                       self::debug("... found site config in secondary folder ($h.txt)");\r
-                                       $file_secondary = self::$config_path_fallback."/$h.txt";\r
-                                       $matched_name = $h;\r
-                                       break;\r
-                               }\r
-                       }\r
-                       if (!isset($file_secondary)) {\r
-                               self::debug("... no site config match in secondary folder");\r
-                       }\r
-               }\r
-               \r
-               // return false if no config file found\r
-               if (!isset($file_primary) && !isset($file_secondary)) {\r
-                       self::debug("... no site config match for $host");\r
-                       return false;\r
-               }\r
-               \r
-               // return primary config if secondary not found\r
-               if (!isset($file_secondary) && isset($config)) {\r
-                       return $config;\r
-               }\r
-               \r
-               // process secondary config file\r
-               $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);\r
-               if (!$config_lines || !is_array($config_lines)) {\r
-                       // failed to process secondary\r
-                       if (isset($config)) {\r
-                               // return primary config\r
-                               return $config;\r
-                       } else {\r
-                               return false;\r
-                       }\r
-               }\r
-               \r
-               // merge with primary and return\r
-               if (isset($config)) {\r
-                       self::debug('. merging config files');\r
-                       $config->append(self::build_from_array($config_lines));\r
-                       return $config;\r
-               } else {\r
-                       // return just secondary\r
-                       $config = self::build_from_array($config_lines);\r
-                       // if APC caching is available and enabled, mark this for cache\r
-                       //$config->cache_in_apc = true;\r
-                       $config->cache_key = $matched_name;\r
-                       return $config;\r
-               }\r
-       }\r
-       \r
-       public static function build_from_array(array $lines) {\r
-               $config = new SiteConfig();\r
-               foreach ($lines as $line) {\r
-                       $line = trim($line);\r
-                       \r
-                       // skip comments, empty lines\r
-                       if ($line == '' || $line[0] == '#') continue;\r
-                       \r
-                       // get command\r
-                       $command = explode(':', $line, 2);\r
-                       // if there's no colon ':', skip this line\r
-                       if (count($command) != 2) continue;\r
-                       $val = trim($command[1]);\r
-                       $command = trim($command[0]);\r
-                       if ($command == '' || $val == '') continue;\r
-                       \r
-                       // check for commands where we accept multiple statements\r
-                       if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {\r
-                               array_push($config->$command, $val);\r
-                       // check for single statement commands that evaluate to true or false\r
-                       } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {\r
-                               $config->$command = ($val == 'yes');\r
-                       // check for single statement commands stored as strings\r
-                       } elseif (in_array($command, array('parser'))) {\r
-                               $config->$command = $val;\r
-                       // check for replace_string(find): replace\r
-                       } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {\r
-                               if (in_array($match[1], array('replace_string'))) {\r
-                                       $command = $match[1];\r
-                                       array_push($config->find_string, $match[2]);\r
-                                       array_push($config->$command, $val);\r
-                               }\r
-                       }\r
-               }\r
-               return $config;\r
-       }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Site Config
+ * 
+ * Each instance of this class should hold extraction patterns and other directives
+ * for a website. See ContentExtractor class to see how it's used.
+ * 
+ * @version 0.8
+ * @date 2013-04-16
+ * @author Keyvan Minoukadeh
+ * @copyright 2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class SiteConfig
+{
+       // Use first matching element as title (0 or more xpath expressions)
+       public $title = array();
+       
+       // Use first matching element as body (0 or more xpath expressions)
+       public $body = array();
+       
+       // Use first matching element as author (0 or more xpath expressions)
+       public $author = array();
+       
+       // Use first matching element as date (0 or more xpath expressions)
+       public $date = array();
+       
+       // Strip elements matching these xpath expressions (0 or more)
+       public $strip = array();
+       
+       // Strip elements which contain these strings (0 or more) in the id or class attribute 
+       public $strip_id_or_class = array();
+       
+       // Strip images which contain these strings (0 or more) in the src attribute 
+       public $strip_image_src = array();
+       
+       // Additional HTTP headers to send
+       // NOT YET USED
+       public $http_header = array();
+       
+       // Process HTML with tidy before creating DOM (bool or null if undeclared)
+       public $tidy = null;
+       
+       protected $default_tidy = true; // used if undeclared
+       
+       // Autodetect title/body if xpath expressions fail to produce results.
+       // Note that this applies to title and body separately, ie. 
+       //   * if we get a body match but no title match, this option will determine whether we autodetect title 
+       //   * if neither match, this determines whether we autodetect title and body.
+       // Also note that this only applies when there is at least one xpath expression in title or body, ie.
+       //   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
+       //   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
+       // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
+       // bool or null if undeclared
+       public $autodetect_on_failure = null;
+       protected $default_autodetect_on_failure = true; // used if undeclared
+       
+       // Clean up content block - attempt to remove elements that appear to be superfluous
+       // bool or null if undeclared
+       public $prune = null;
+       protected $default_prune = true; // used if undeclared
+       
+       // Test URL - if present, can be used to test the config above
+       public $test_url = array();
+       
+       // Single-page link - should identify a link element or URL pointing to the page holding the entire article
+       // This is useful for sites which split their articles across multiple pages. Links to such pages tend to 
+       // display the first page with links to the other pages at the bottom. Often there is also a link to a page
+       // which displays the entire article on one page (e.g. 'print view').
+       // This should be an XPath expression identifying the link to that page. If present and we find a match,
+       // we will retrieve that page and the rest of the options in this config will be applied to the new page.
+       public $single_page_link = array();
+       
+       public $next_page_link = array();
+       
+       // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
+       public $single_page_link_in_feed = array();
+       
+       // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
+       // string or null if undeclared
+       public $parser = null;
+       protected $default_parser = 'libxml'; // used if undeclared
+       
+       // Strings to search for in HTML before processing begins (used with $replace_string)
+       public $find_string = array();
+       // Strings to replace those found in $find_string before HTML processing begins
+       public $replace_string = array();
+       
+       // the options below cannot be set in the config files which this class represents
+       
+       //public $cache_in_apc = false; // used to decide if we should cache in apc or not
+       public $cache_key = null;
+       public static $debug = false;
+       protected static $apc = false;
+       protected static $config_path;
+       protected static $config_path_fallback;
+       protected static $config_cache = array();
+       const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
+       
+       /**
+        * Print a debug line ("* <message>" followed by a newline) and push it
+        * to the client straight away.
+        *
+        * Does nothing unless self::$debug is truthy.
+        *
+        * @param string $msg text to print
+        */
+       protected static function debug($msg) {
+               if (!self::$debug) return;
+               echo '* ',$msg,"\n";
+               // ob_flush() raises an E_NOTICE when no output buffer is active,
+               // so only call it when there is actually a buffer to flush
+               if (ob_get_level() > 0) ob_flush();
+               flush();
+       }
+       
+       // enable APC caching of certain site config files?
+       // If enabled the following site config files will be 
+       // cached in APC cache (when requested for first time):
+       // * anything in site_config/custom/ and its corresponding file in site_config/standard/
+       // * the site config files associated with HTML fingerprints
+       // * the global site config file
+       // returns true if enabled, false otherwise
+       public static function use_apc($apc=true) {
+               if (function_exists('apc_add')) {
+                       // APC is available: remember the caller's choice and echo it back
+                       self::$apc = $apc;
+                       return $apc;
+               }
+               // APC is missing: self::$apc is left untouched; only mention it
+               // when the caller actually asked for caching
+               if ($apc) {
+                       self::debug('APC will not be used (function apc_add does not exist)');
+               }
+               return false;
+       }
+       
+       // return bool or null
+       public function tidy($use_default=true) {
+               // the declared value wins; it is also returned as-is (possibly null)
+               // when the caller does not want the default substituted
+               if (!$use_default || isset($this->tidy)) {
+                       return $this->tidy;
+               }
+               return $this->default_tidy;
+       }
+       
+       // return bool or null
+       // with $use_default=false the raw prune value is returned (may be null);
+       // otherwise $this->default_prune is used whenever prune is unset/null
+       public function prune($use_default=true) {
+               if (!$use_default) {
+                       return $this->prune;
+               }
+               if (isset($this->prune)) {
+                       return $this->prune;
+               }
+               return $this->default_prune;
+       }
+       
+       // return string or null
+       // with $use_default=false the raw parser value is returned (may be null);
+       // otherwise $this->default_parser is used whenever parser is unset/null
+       public function parser($use_default=true) {
+               if (!$use_default) {
+                       return $this->parser;
+               }
+               if (isset($this->parser)) {
+                       return $this->parser;
+               }
+               return $this->default_parser;
+       }
+
+       // return bool or null
+       // with $use_default=false the raw autodetect_on_failure value is returned (may be null);
+       // otherwise $this->default_autodetect_on_failure is used whenever it is unset/null
+       public function autodetect_on_failure($use_default=true) {
+               if (!$use_default) {
+                       return $this->autodetect_on_failure;
+               }
+               if (isset($this->autodetect_on_failure)) {
+                       return $this->autodetect_on_failure;
+               }
+               return $this->default_autodetect_on_failure;
+       }
+       
+       // remember the primary site config folder and an optional fallback folder;
+       // build() looks for "<host>.txt" files in these two locations
+       public static function set_config_path($path, $fallback=null) {
+               self::$config_path_fallback = $fallback;
+               self::$config_path = $path;
+       }
+       
+       // store $config in the per-request cache and, when APC is enabled and
+       // $use_apc is true, in APC as well (under "sc.<key>")
+       public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
+               // normalise the requested key: lower case, no leading 'www.'
+               $cache_id = strtolower($key);
+               if (substr($cache_id, 0, 4) == 'www.') {
+                       $cache_id = substr($cache_id, 4);
+               }
+               // a cache key recorded on the config itself takes precedence
+               if ($config->cache_key) {
+                       $cache_id = $config->cache_key;
+               }
+               self::$config_cache[$cache_id] = $config;
+               if (self::$apc && $use_apc) {
+                       self::debug("Adding site config to APC cache with key sc.$cache_id");
+                       apc_add("sc.$cache_id", $config);
+               }
+               self::debug("Cached site config with key $cache_id");
+       }
+       
+       // true if a site config is held for $key, either in the per-request
+       // cache or (when APC is enabled) in APC under "sc.<key>"
+       public static function is_cached($key) {
+               // same normalisation as add_to_cache(): lower case, no leading 'www.'
+               $lookup = strtolower($key);
+               if (substr($lookup, 0, 4) == 'www.') {
+                       $lookup = substr($lookup, 4);
+               }
+               if (array_key_exists($lookup, self::$config_cache)) {
+                       return true;
+               }
+               // APC is only consulted when the per-request cache misses
+               return (self::$apc && (bool)apc_fetch("sc.$lookup"));
+       }
+       
+       // merge the values of $newconfig into this config (this config wins
+       // for single-value settings that are already set)
+       public function append(SiteConfig $newconfig) {
+               // commands accepting multiple statements (test_url is deliberately left out):
+               // concatenate both lists and drop duplicate entries
+               $unique_lists = array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header');
+               foreach ($unique_lists as $name) {
+                       $combined = array_merge($this->$name, $newconfig->$name);
+                       $this->$name = array_unique($combined);
+               }
+               // single statement commands: only fill in values which are still null here
+               $single_values = array('tidy', 'prune', 'parser', 'autodetect_on_failure');
+               foreach ($single_values as $name) {
+                       if ($this->$name !== null) continue;
+                       $this->$name = $newconfig->$name;
+               }
+               // find_string and replace_string are concatenated without array_unique
+               // (thanks fabrizio!)
+               $paired_lists = array('find_string', 'replace_string');
+               foreach ($paired_lists as $name) {
+                       $this->$name = array_merge($this->$name, $newconfig->$name);
+               }
+       }
+       
+       // returns SiteConfig instance if an appropriate one is found, false otherwise
+       // if $exact_host_match is true, we will not look for wildcard config matches
+       // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
+       public static function build($host, $exact_host_match=false) {
+               // normalise the host: lower case, no leading 'www.'
+               $host = strtolower($host);
+               if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
+               // reject empty, overlong (> 200 chars) or malformed host names
+               // (a leading '.' is tolerated by stripping it before the regex check)
+               if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
+               // check for site configuration
+               // $try holds the candidate file names (without '.txt'), most specific first
+               $try = array($host);
+               // should we look for wildcard matches 
+               if (!$exact_host_match) {
+                       $split = explode('.', $host);
+                       if (count($split) > 1) {
+                               // drop the left-most label: 'test.example.org' becomes '.example.org'
+                               array_shift($split);
+                               $try[] = '.'.implode('.', $split);
+                       }
+               }
+               
+               // look for site config file in primary folder
+               self::debug(". looking for site config for $host in primary folder");
+               foreach ($try as $h) {
+                       // a hit in the per-request cache or in APC is returned immediately,
+                       // without consulting the secondary folder
+                       if (array_key_exists($h, self::$config_cache)) {
+                               self::debug("... site config for $h already loaded in this request");
+                               return self::$config_cache[$h];
+                       } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
+                               self::debug("... site config for $h in APC cache");
+                               return $sconfig;
+                       } elseif (file_exists(self::$config_path."/$h.txt")) {
+                               self::debug("... found site config ($h.txt)");
+                               $file_primary = self::$config_path."/$h.txt";
+                               $matched_name = $h;
+                               break;
+                       }
+               }
+               
+               // if we found site config, process it
+               if (isset($file_primary)) {
+                       $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+                       // an unreadable or empty primary file aborts the lookup (secondary folder is not tried)
+                       if (!$config_lines || !is_array($config_lines)) return false;
+                       $config = self::build_from_array($config_lines);
+                       // if APC caching is available and enabled, mark this for cache
+                       //$config->cache_in_apc = true;
+                       $config->cache_key = $matched_name;
+                       
+                       // if autodetect on failure is off (on by default) we do not need to look
+                       // in secondary folder
+                       if (!$config->autodetect_on_failure()) {
+                               self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
+                               return $config;
+                       }
+               }
+               
+               // look for site config file in secondary folder
+               if (isset(self::$config_path_fallback)) {
+                       self::debug(". looking for site config for $host in secondary folder");
+                       foreach ($try as $h) {
+                               if (file_exists(self::$config_path_fallback."/$h.txt")) {
+                                       self::debug("... found site config in secondary folder ($h.txt)");
+                                       $file_secondary = self::$config_path_fallback."/$h.txt";
+                                       // only used as cache_key below when no primary config was loaded
+                                       $matched_name = $h;
+                                       break;
+                               }
+                       }
+                       if (!isset($file_secondary)) {
+                               self::debug("... no site config match in secondary folder");
+                       }
+               }
+               
+               // return false if no config file found
+               if (!isset($file_primary) && !isset($file_secondary)) {
+                       self::debug("... no site config match for $host");
+                       return false;
+               }
+               
+               // return primary config if secondary not found
+               if (!isset($file_secondary) && isset($config)) {
+                       return $config;
+               }
+               
+               // process secondary config file
+               $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+               if (!$config_lines || !is_array($config_lines)) {
+                       // failed to process secondary
+                       if (isset($config)) {
+                               // return primary config
+                               return $config;
+                       } else {
+                               return false;
+                       }
+               }
+               
+               // merge with primary and return
+               if (isset($config)) {
+                       self::debug('. merging config files');
+                       // append() keeps the primary's single-value settings and adds the secondary's list entries
+                       $config->append(self::build_from_array($config_lines));
+                       return $config;
+               } else {
+                       // return just secondary
+                       $config = self::build_from_array($config_lines);
+                       // if APC caching is available and enabled, mark this for cache
+                       //$config->cache_in_apc = true;
+                       $config->cache_key = $matched_name;
+                       return $config;
+               }
+       }
+       
+       // parse the lines of a site config file into a new SiteConfig instance;
+       // each useful line has the form "command: value" - blank lines, lines starting
+       // with '#', lines without a colon and lines with an empty command or value are ignored
+       public static function build_from_array(array $lines) {
+               // commands where we accept multiple statements
+               $list_commands = array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string');
+               // single statement commands that evaluate to true or false
+               $bool_commands = array('tidy', 'prune', 'autodetect_on_failure');
+               // single statement commands stored as strings
+               $string_commands = array('parser');
+
+               $config = new SiteConfig();
+               foreach ($lines as $raw) {
+                       $raw = trim($raw);
+                       if ($raw == '' || $raw[0] == '#') continue;
+
+                       // split on the first colon only, so values may contain ':' themselves
+                       $parts = explode(':', $raw, 2);
+                       if (count($parts) != 2) continue;
+                       $val = trim($parts[1]);
+                       $command = trim($parts[0]);
+                       if ($command == '' || $val == '') continue;
+
+                       if (in_array($command, $list_commands)) {
+                               array_push($config->$command, $val);
+                               continue;
+                       }
+                       if (in_array($command, $bool_commands)) {
+                               // anything other than the literal 'yes' counts as false
+                               $config->$command = ($val == 'yes');
+                               continue;
+                       }
+                       if (in_array($command, $string_commands)) {
+                               $config->$command = $val;
+                               continue;
+                       }
+
+                       // remaining supported form is "replace_string(find): replace"
+                       if (substr($command, -1) != ')') continue;
+                       if (!preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) continue;
+                       if ($match[1] == 'replace_string') {
+                               // the text in brackets is the search string, the value its replacement
+                               array_push($config->find_string, $match[2]);
+                               array_push($config->replace_string, $val);
+                       }
+               }
+               return $config;
+       }
+}
\ No newline at end of file
old mode 100644 (file)
new mode 100755 (executable)
index 54a56f2..4078659
@@ -1,7 +1,7 @@
 <?php\r
  /**\r
  * Univarsel Feed Writer\r
- * \r
+ *\r
  * FeedItem class - Used as feed element in FeedWriter class\r
  *\r
  * @package         UnivarselFeedWriter\r
  {\r
     private $elements = array();    //Collection of feed elements\r
     private $version;\r
-    \r
+\r
     /**\r
-    * Constructor \r
-    * \r
-    * @param    contant     (RSS1/RSS2/ATOM) RSS2 is default. \r
-    */ \r
+    * Constructor\r
+    *\r
+    * @param    contant     (RSS1/RSS2/ATOM) RSS2 is default.\r
+    */\r
     function __construct($version = RSS2)\r
-    {    \r
+    {\r
         $this->version = $version;\r
     }\r
 \r
     /**\r
     * Set element (overwrites existing elements with $elementName)\r
-    * \r
+    *\r
     * @access   public\r
     * @param    srting  The tag name of an element\r
     * @param    srting  The content of tag\r
             unset($this->elements[$elementName]);\r
         }\r
         $this->addElement($elementName, $content, $attributes);\r
-    }    \r
-    \r
+    }\r
+\r
     /**\r
     * Add an element to elements array\r
-    * \r
+    *\r
     * @access   public\r
     * @param    srting  The tag name of an element\r
     * @param    srting  The content of tag\r
         $this->elements[$elementName][$i]['content']    = $content;\r
         $this->elements[$elementName][$i]['attributes'] = $attributes;\r
     }\r
-    \r
+\r
     /**\r
-    * Set multiple feed elements from an array. \r
+    * Set multiple feed elements from an array.\r
     * Elements which have attributes cannot be added by this method\r
-    * \r
+    *\r
     * @access   public\r
     * @param    array   array of elements in 'tagName' => 'tagContent' format.\r
     * @return   void\r
     public function addElementArray($elementArray)\r
     {\r
         if(! is_array($elementArray)) return;\r
-        foreach ($elementArray as $elementName => $content) \r
+        foreach ($elementArray as $elementName => $content)\r
         {\r
             $this->addElement($elementName, $content);\r
         }\r
     }\r
-    \r
+\r
     /**\r
     * Return the collection of elements in this feed item\r
-    * \r
+    *\r
     * @access   public\r
     * @return   array\r
     */\r
     {\r
         return $this->elements;\r
     }\r
-    \r
+\r
     // Wrapper functions ------------------------------------------------------\r
-    \r
+\r
     /**\r
     * Set the 'dscription' element of feed item\r
-    * \r
+    *\r
     * @access   public\r
     * @param    string  The content of 'description' element\r
     * @return   void\r
     */\r
-    public function setDescription($description) \r
+    public function setDescription($description)\r
     {\r
-        $this->setElement('description', $description);\r
+        $tag = ($this->version == ATOM)? 'summary' : 'description';\r
+        $this->setElement($tag, $description);\r
     }\r
-    \r
+\r
     /**\r
     * @desc     Set the 'title' element of feed item\r
     * @access   public\r
     * @param    string  The content of 'title' element\r
     * @return   void\r
     */\r
-    public function setTitle($title) \r
+    public function setTitle($title)\r
     {\r
-        $this->setElement('title', $title);      \r
+        $this->setElement('title', $title);\r
     }\r
-    \r
+\r
     /**\r
     * Set the 'date' element of feed item\r
-    * \r
+    *\r
     * @access   public\r
     * @param    string  The content of 'date' element\r
     * @return   void\r
     */\r
-    public function setDate($date) \r
+    public function setDate($date)\r
     {\r
         if(! is_numeric($date))\r
         {\r
             $date = strtotime($date);\r
         }\r
-      \r
-        if($this->version == RSS2) \r
+\r
+        if($this->version == ATOM)\r
+        {\r
+               $tag    = 'updated';\r
+               $value  = date(DATE_ATOM, $date);\r
+        }\r
+        elseif($this->version == RSS2)\r
         {\r
-            $tag    = 'pubDate';\r
-            $value  = date(DATE_RSS, $date);\r
+               $tag    = 'pubDate';\r
+               $value  = date(DATE_RSS, $date);\r
         }\r
-        else                                \r
+        else\r
         {\r
-            $tag    = 'dc:date';\r
-            $value  = date("Y-m-d", $date);\r
+               $tag    = 'dc:date';\r
+               $value  = date("Y-m-d", $date);\r
         }\r
-        \r
-        $this->setElement($tag, $value);    \r
+\r
+        $this->setElement($tag, $value);\r
     }\r
-    \r
+\r
     /**\r
     * Set the 'link' element of feed item\r
-    * \r
+    *\r
     * @access   public\r
     * @param    string  The content of 'link' element\r
     * @return   void\r
     */\r
-    public function setLink($link) \r
+    public function setLink($link)\r
     {\r
         if($this->version == RSS2 || $this->version == RSS1)\r
         {\r
         {\r
             $this->setElement('link','',array('href'=>$link));\r
             $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));\r
-        } \r
-        \r
+        }\r
+\r
     }\r
 \r
     /**\r
     * Set the 'source' element of feed item\r
-    * \r
+    *\r
     * @access   public\r
     * @param    string  The content of 'source' element\r
     * @return   void\r
     */\r
-    public function setSource($link) \r
+    public function setSource($link)\r
     {\r
         $attributes = array('url'=>$link);\r
         $this->setElement('source', "wallabag",$attributes);\r
     }\r
-    \r
+\r
     /**\r
     * Set the 'encloser' element of feed item\r
     * For RSS 2.0 only\r
-    * \r
+    *\r
     * @access   public\r
     * @param    string  The url attribute of encloser tag\r
     * @param    string  The length attribute of encloser tag\r
         $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);\r
         $this->setElement('enclosure','',$attributes);\r
     }\r
-    \r
+\r
  } // end of class FeedItem\r
 ?>
\ No newline at end of file
index d708e99b0ec36e12dce2dcc6309410f3f8e598dc..7775569033cc9e3c1acf81832f7c1b2dc36946dd 100755 (executable)
@@ -97,15 +97,12 @@ define('JSONP', 3, true);
               header('X-content-type-options: nosniff');\r
           } elseif ($this->version == JSON) {\r
               header('Content-type: application/json; charset=UTF-8');\r
+              $this->json = new stdClass();\r
           } elseif ($this->version == JSONP) {\r
               header('Content-type: application/javascript; charset=UTF-8');\r
+              $this->json = new stdClass();\r
           }\r
         }\r
-      \r
-        if ($this->version == JSON || $this->version == JSONP) {\r
-          $this->json = new stdClass();\r
-        }\r
-      \r
 \r
         $this->printHead();\r
         $this->printChannels();\r
@@ -116,6 +113,11 @@ define('JSONP', 3, true);
         }\r
     }\r
 \r
+    public function &getItems()\r
+    {\r
+       return $this->items;\r
+    }\r
+\r
     /**\r
     * Create a new FeedItem.\r
     *\r
@@ -199,7 +201,8 @@ define('JSONP', 3, true);
     */\r
     public function setDescription($description)\r
     {\r
-        $this->setChannelElement('description', $description);\r
+        $tag = ($this->version == ATOM)? 'subtitle' : 'description';\r
+        $this->setChannelElement($tag, $desciption);\r
     }\r
 \r
     /**\r
@@ -244,7 +247,7 @@ define('JSONP', 3, true);
         {\r
             $out  = '<?xml version="1.0" encoding="utf-8"?>'."\n";\r
             if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;\r
-            $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;\r
+            $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;\r
             echo $out;\r
         }\r
         elseif ($this->version == JSON || $this->version == JSONP)\r
index 2f5244f9fd744d1543fa83d4c5432dc7a7ff8773..c4a48b2152d09778c264b0864253371e65fd91b3 100644 (file)
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
 
     // Namespaces for foreign content
     const NS_HTML   = null; // to prevent DOM from requiring NS on everything
+    const NS_XHTML  = 'http://www.w3.org/1999/xhtml';
     const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
     const NS_SVG    = 'http://www.w3.org/2000/svg';
     const NS_XLINK  = 'http://www.w3.org/1999/xlink';
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
         }
 
     private function insertElement($token, $append = true) {
-        $el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
+        //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
+        $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
+        $el = $this->dom->createElementNS($namespaceURI, $token['name']);
 
         if (!empty($token['attr'])) {
             foreach($token['attr'] as $attr) {
-                if(!$el->hasAttribute($attr['name'])) {
+
+                               // mike@macgirvin.com 2011-11-17, check attribute name for
+                               // validity (ignoring extenders and combiners) as illegal chars in names
+                               // causes everything to abort
+
+                               $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
+                if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
                     $el->setAttribute($attr['name'], $attr['value']);
                 }
             }
index 83e94f14035c5dd9053bb4150592a6807fa77f39..e4d5f495b355a1dc2dc5e4079266cd654cea7f42 100644 (file)
-<?php\r
-/**\r
- * Cookie Jar\r
- * \r
- * PHP class for handling cookies, as defined by the Netscape spec: \r
- * <http://curl.haxx.se/rfc/cookie_spec.html>\r
- *\r
- * This class should be used to handle cookies (storing cookies from HTTP response messages, and\r
- * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org \r
- * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/\r
- * \r
- * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/\r
- * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.\r
- * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.\r
- * \r
- * @version 0.5\r
- * @date 2011-03-15\r
- * @see http://php.net/HttpRequestPool\r
- * @author Keyvan Minoukadeh\r
- * @copyright 2011 Keyvan Minoukadeh\r
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
- */\r
-\r
-class CookieJar\r
-{\r
-    /**\r
-    * Cookies - array containing all cookies.\r
-    *\r
-    * <pre>\r
-    * Cookies are stored like this:\r
-    *   [domain][path][name] = array\r
-    * where array is:\r
-    *   0 => value, 1 => secure, 2 => expires\r
-    * </pre>\r
-    * @var array\r
-    * @access private\r
-    */\r
-    public $cookies = array();\r
-       public $debug = false;\r
-\r
-    /**\r
-    * Constructor\r
-    */\r
-    function __construct() {\r
-    }\r
-\r
-       protected function debug($msg, $file=null, $line=null) {\r
-               if ($this->debug) {\r
-                       $mem = round(memory_get_usage()/1024, 2);\r
-                       $memPeak = round(memory_get_peak_usage()/1024, 2);\r
-                       echo '* ',$msg;\r
-                       if (isset($file, $line)) echo " ($file line $line)";\r
-                       echo ' - mem used: ',$mem," (peak: $memPeak)\n";        \r
-                       ob_flush();\r
-                       flush();\r
-               }\r
-       }       \r
-       \r
-    /**\r
-    * Get matching cookies\r
-    *\r
-    * Only use this method if you cannot use add_cookie_header(), for example, if you want to use\r
-    * this cookie jar class without using the request class.\r
-    *\r
-    * @param array $param associative array containing 'domain', 'path', 'secure' keys\r
-    * @return string\r
-    * @see add_cookie_header()\r
-    */\r
-    public function getMatchingCookies($url)\r
-    {\r
-               if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) {\r
-                       $param['domain'] = $parts['host'];\r
-                       $param['path'] = $parts['path'];\r
-                       $param['secure'] = (strtolower($parts['scheme']) == 'https');\r
-                       unset($parts);\r
-               } else {\r
-                       return false;\r
-               }\r
-        // RFC 2965 notes:\r
-        //  If multiple cookies satisfy the criteria above, they are ordered in\r
-        //  the Cookie header such that those with more specific Path attributes\r
-        //  precede those with less specific.  Ordering with respect to other\r
-        //  attributes (e.g., Domain) is unspecified.\r
-        $domain = $param['domain'];\r
-        if (strpos($domain, '.') === false) $domain .= '.local';\r
-        $request_path = $param['path'];\r
-        if ($request_path == '') $request_path = '/';\r
-        $request_secure = $param['secure'];\r
-        $now = time();\r
-        $matched_cookies = array();\r
-        // domain - find matching domains\r
-        $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);\r
-        while (strpos($domain, '.') !== false) {\r
-            if (isset($this->cookies[$domain])) {\r
-                $this->debug(' domain match found: '.$domain);\r
-                $cookies =& $this->cookies[$domain];\r
-            } else {\r
-                $domain = $this->_reduce_domain($domain);\r
-                continue;\r
-            }\r
-            // paths - find matching paths starting from most specific\r
-            $this->debug('  - Finding matching paths for '.$request_path);\r
-            $paths = array_keys($cookies);\r
-            usort($paths, array($this, '_cmp_length'));\r
-            foreach ($paths as $path) {\r
-                // continue to next cookie if request path does not path-match cookie path\r
-                if (!$this->_path_match($request_path, $path)) continue;\r
-                // loop through cookie names\r
-                $this->debug('     path match found: '.$path);\r
-                foreach ($cookies[$path] as $name => $values) {\r
-                    // if this cookie is secure but request isn't, continue to next cookie\r
-                    if ($values[1] && !$request_secure) continue;\r
-                    // if cookie is not a session cookie and has expired, continue to next cookie\r
-                    if (is_int($values[2]) && ($values[2] < $now)) continue;\r
-                    // cookie matches request\r
-                    $this->debug('      cookie match: '.$name.'='.$values[0]);\r
-                    $matched_cookies[] = $name.'='.$values[0];\r
-                }\r
-            }\r
-            $domain = $this->_reduce_domain($domain);\r
-        }\r
-        // return cookies\r
-        return implode('; ', $matched_cookies);\r
-    }\r
-\r
-    /**\r
-    * Parse Set-Cookie values.\r
-    *\r
-    * Only use this method if you cannot use extract_cookies(), for example, if you want to use\r
-    * this cookie jar class without using the response class.\r
-    *\r
-    * @param array $set_cookies array holding 1 or more "Set-Cookie" header values\r
-    * @param array $param associative array containing 'host', 'path' keys\r
-    * @return void\r
-    * @see extract_cookies()\r
-    */\r
-    public function storeCookies($url, $set_cookies)\r
-    {\r
-        if (count($set_cookies) == 0) return;\r
-               $param = @parse_url($url);\r
-               if (!is_array($param) || !isset($param['host'])) return;\r
-        $request_host = $param['host'];\r
-        if (strpos($request_host, '.') === false) $request_host .= '.local';\r
-        $request_path = @$param['path'];\r
-        if ($request_path == '') $request_path = '/';\r
-        //\r
-        // loop through set-cookie headers\r
-        //\r
-        foreach ($set_cookies as $set_cookie) {\r
-            $this->debug('Parsing: '.$set_cookie);\r
-            // temporary cookie store (before adding to jar)\r
-            $tmp_cookie = array();\r
-            $param = explode(';', $set_cookie);\r
-            // loop through params\r
-            for ($x=0; $x<count($param); $x++) {\r
-                $key_val = explode('=', $param[$x], 2);\r
-                if (count($key_val) != 2) {\r
-                    // if the first param isn't a name=value pair, continue to the next set-cookie\r
-                    // header\r
-                    if ($x == 0) continue 2;\r
-                    // check for secure flag\r
-                    if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;\r
-                    // continue to next param\r
-                    continue;\r
-                }\r
-                list($key, $val) = array_map('trim', $key_val);\r
-                // first name=value pair is the cookie name and value\r
-                // the name and value are stored under 'name' and 'value' to avoid conflicts\r
-                // with later parameters.\r
-                if ($x == 0) {\r
-                    $tmp_cookie = array('name'=>$key, 'value'=>$val);\r
-                    continue;\r
-                }\r
-                $key = strtolower($key);\r
-                if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {\r
-                    $tmp_cookie[$key] = $val;\r
-                }\r
-            }\r
-            //\r
-            // set cookie\r
-            //\r
-            // check domain\r
-            if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&\r
-                    ($tmp_cookie['domain'] != ".$request_host")) {\r
-                $domain = $tmp_cookie['domain'];\r
-                if ((strpos($domain, '.') === false) && ($domain != 'local')) {\r
-                    $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');\r
-                    continue;\r
-                }\r
-                if (preg_match('/\.[0-9]+$/', $domain)) {\r
-                    $this->debug(' - domain "'.$domain.'" appears to be an ip address');\r
-                    continue;\r
-                }\r
-                if (substr($domain, 0, 1) != '.') $domain = ".$domain";\r
-                if (!$this->_domain_match($request_host, $domain)) {\r
-                    $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');\r
-                    continue;\r
-                }\r
-            } else {\r
-                // if domain is not specified in the set-cookie header, domain will default to\r
-                // the request host\r
-                $domain = $request_host;\r
-            }\r
-            // check path\r
-            if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {\r
-                $path = urldecode($tmp_cookie['path']);\r
-                if (!$this->_path_match($request_path, $path)) {\r
-                    $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');\r
-                    continue;\r
-                }\r
-            } else {\r
-                $path = $request_path;\r
-                $path = substr($path, 0, strrpos($path, '/'));\r
-                if ($path == '') $path = '/';\r
-            }\r
-            // check if secure\r
-            $secure = (isset($tmp_cookie['secure'])) ? true : false;\r
-            // check expiry\r
-            if (isset($tmp_cookie['expires'])) {\r
-                if (($expires = strtotime($tmp_cookie['expires'])) < 0) {\r
-                    $expires = null;\r
-                }\r
-            } else {\r
-                $expires = null;\r
-            }\r
-            // set cookie\r
-            $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);\r
-        }\r
-    }\r
-       \r
-       // return array of set-cookie values extracted from HTTP response headers (string $h)\r
-       public function extractCookies($h) {\r
-        $x = 0;\r
-        $lines = 0;\r
-        $headers = array();\r
-        $last_match = false;\r
-               $h = explode("\n", $h);\r
-        foreach ($h as $line) {\r
-                       $line = rtrim($line);\r
-            $lines++;\r
-\r
-            $trimmed_line = trim($line);\r
-            if (isset($line_last)) {\r
-                // check if we have \r\n\r\n (indicating the end of headers)\r
-                // some servers will not use CRLF (\r\n), so we make CR (\r) optional.\r
-                // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) {\r
-                //     break;\r
-                // }\r
-                // As an alternative, we can check if the current trimmed line is empty\r
-                if ($trimmed_line == '') {\r
-                    break;\r
-                }\r
-\r
-                // check for continuation line...\r
-                // RFC 2616 Section 2.2 "Basic Rules":\r
-                // HTTP/1.1 header field values can be folded onto multiple lines if the\r
-                // continuation line begins with a space or horizontal tab. All linear\r
-                // white space, including folding, has the same semantics as SP. A\r
-                // recipient MAY replace any linear white space with a single SP before\r
-                // interpreting the field value or forwarding the message downstream.\r
-                if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {\r
-                    // append to previous header value\r
-                    $headers[$x-1] .= ' '.rtrim($match[1]);\r
-                    continue;\r
-                }\r
-            }\r
-            $line_last = $line;\r
-\r
-            // split header name and value\r
-            if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {\r
-                $headers[$x++] = rtrim($match[1]);\r
-                $last_match = true;\r
-            } else {\r
-                $last_match = false;\r
-            }\r
-        }\r
-        return $headers;\r
-       }\r
-\r
-    /**\r
-    * Set Cookie\r
-    * @param string $domain\r
-    * @param string $path\r
-    * @param string $name cookie name\r
-    * @param string $value cookie value\r
-    * @param bool $secure\r
-    * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)\r
-    * @return void\r
-    */\r
-    function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)\r
-    {\r
-        if ($domain == '') return;\r
-        if ($path == '') return;\r
-        if ($name == '') return;\r
-        // check if cookie needs to go\r
-        if (isset($expires) && ($expires <= 0)) {\r
-            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);\r
-            return;\r
-        }\r
-        if ($value == '') return;\r
-        $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);\r
-        return;\r
-    }\r
-\r
-    /**\r
-    * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.\r
-    * @param string $domain\r
-    * @param string $path\r
-    * @param string $name\r
-    * @return void\r
-    */\r
-    function clear($domain=null, $path=null, $name=null)\r
-    {\r
-        if (!isset($domain)) {\r
-            $this->cookies = array();\r
-        } elseif (!isset($path)) {\r
-            if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);\r
-        } elseif (!isset($name)) {\r
-            if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);\r
-        } elseif (isset($name)) {\r
-            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);\r
-        }\r
-    }\r
-\r
-    /**\r
-    * Compare string length - used for sorting\r
-    * @access private\r
-    * @return int\r
-    */\r
-    function _cmp_length($a, $b)\r
-    {\r
-        $la = strlen($a); $lb = strlen($b);\r
-        if ($la == $lb) return 0;\r
-        return ($la > $lb) ? -1 : 1;\r
-    }\r
-\r
-    /**\r
-    * Reduce domain\r
-    * @param string $domain\r
-    * @return string\r
-    * @access private\r
-    */\r
-    function _reduce_domain($domain)\r
-    {\r
-        if ($domain == '') return '';\r
-        if (substr($domain, 0, 1) == '.') return substr($domain, 1);\r
-        return substr($domain, strpos($domain, '.'));\r
-    }\r
-\r
-    /**\r
-    * Path match - check if path1 path-matches path2\r
-    *\r
-    * From RFC 2965: \r
-    *   <i>For two strings that represent paths, P1 and P2, P1 path-matches P2\r
-    *   if P2 is a prefix of P1 (including the case where P1 and P2 string-\r
-    *   compare equal).  Thus, the string /tec/waldo path-matches /tec.</i>\r
-    * @param string $path1\r
-    * @param string $path2\r
-    * @return bool\r
-    * @access private\r
-    */\r
-    function _path_match($path1, $path2)\r
-    {\r
-        return (substr($path1, 0, strlen($path2)) == $path2);\r
-    }\r
-\r
-    /**\r
-    * Domain match - check if domain1 domain-matches domain2\r
-    *\r
-    * A few extracts from RFC 2965: \r
-    *  -  A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com\r
-    *     would be rejected, because H is y.x and contains a dot.\r
-    *\r
-    *  -  A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com\r
-    *     would be accepted.\r
-    *\r
-    *  -  A Set-Cookie2 with Domain=.com or Domain=.com., will always be\r
-    *     rejected, because there is no embedded dot.\r
-    *\r
-    *  -  A Set-Cookie2 from request-host example for Domain=.local will\r
-    *     be accepted, because the effective host name for the request-\r
-    *     host is example.local, and example.local domain-matches .local.\r
-    *\r
-    * I'm ignoring the first point for now (must check to see how other browsers handle\r
-    * this rule for Set-Cookie headers)\r
-    *\r
-    * @param string $domain1\r
-    * @param string $domain2\r
-    * @return bool\r
-    * @access private\r
-    */\r
-    function _domain_match($domain1, $domain2)\r
-    {\r
-        $domain1 = strtolower($domain1);\r
-        $domain2 = strtolower($domain2);\r
-        while (strpos($domain1, '.') !== false) {\r
-            if ($domain1 == $domain2) return true;\r
-            $domain1 = $this->_reduce_domain($domain1);\r
-            continue;\r
-        }\r
-        return false;\r
-    }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Cookie Jar
+ * 
+ * PHP class for handling cookies, as defined by the Netscape spec: 
+ * <http://curl.haxx.se/rfc/cookie_spec.html>
+ *
+ * This class should be used to handle cookies (storing cookies from HTTP response messages, and
+ * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 
+ * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
+ * 
+ * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
+ * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
+ * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
+ * 
+ * @version 0.5
+ * @date 2011-03-15
+ * @see http://php.net/HttpRequestPool
+ * @author Keyvan Minoukadeh
+ * @copyright 2011 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class CookieJar
+{
+    /**
+    * Cookies - array containing all cookies.
+    *
+    * <pre>
+    * Cookies are stored like this:
+    *   [domain][path][name] = array
+    * where array is:
+    *   0 => value, 1 => secure, 2 => expires
+    * </pre>
+    * @var array
+    */
+    public $cookies = array();
+
+    /**
+    * When true, debug() echoes progress messages.
+    * @var bool
+    */
+    public $debug = false;
+
+    /**
+    * Constructor
+    */
+    public function __construct() {
+    }
+
+    /**
+    * Echo a debug message (with current and peak memory usage in KB) if debugging is enabled.
+    *
+    * @param string $msg message to print
+    * @param string $file optional file name, printed only together with $line
+    * @param int $line optional line number, printed only together with $file
+    * @return void
+    */
+    protected function debug($msg, $file=null, $line=null) {
+        if (!$this->debug) return;
+        $mem = round(memory_get_usage()/1024, 2);
+        $memPeak = round(memory_get_peak_usage()/1024, 2);
+        echo '* ',$msg;
+        if (isset($file, $line)) echo " ($file line $line)";
+        echo ' - mem used: ',$mem," (peak: $memPeak)\n";
+        // ob_flush() raises a notice when no output buffer is active
+        if (ob_get_level() > 0) ob_flush();
+        flush();
+    }
+
+    /**
+    * Get matching cookies
+    *
+    * Builds the value of a "Cookie" request header for the given URL from the cookies
+    * currently held in the jar.
+    *
+    * @param string $url absolute URL of the request (scheme and host are required; a missing
+    *                    path is treated as '/')
+    * @return string|false matching cookies as "name=value; name2=value2" (empty string if none),
+    *                      or false if $url cannot be parsed into a scheme and host
+    */
+    public function getMatchingCookies($url)
+    {
+        $parts = @parse_url($url);
+        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
+            return false;
+        }
+        // RFC 2965 notes:
+        //  If multiple cookies satisfy the criteria above, they are ordered in
+        //  the Cookie header such that those with more specific Path attributes
+        //  precede those with less specific.  Ordering with respect to other
+        //  attributes (e.g., Domain) is unspecified.
+        $domain = $parts['host'];
+        if (strpos($domain, '.') === false) $domain .= '.local';
+        // a URL such as http://example.com has no path component: treat it as the root path
+        $request_path = (isset($parts['path']) && $parts['path'] != '') ? $parts['path'] : '/';
+        $request_secure = (strtolower($parts['scheme']) == 'https');
+        $now = time();
+        $matched_cookies = array();
+        // domain - find matching domains, walking from the full host name up through its parents
+        $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
+        while (strpos($domain, '.') !== false) {
+            if (isset($this->cookies[$domain])) {
+                $this->debug(' domain match found: '.$domain);
+                $cookies = $this->cookies[$domain];
+                // paths - find matching paths starting from most specific (longest)
+                $this->debug('  - Finding matching paths for '.$request_path);
+                $paths = array_keys($cookies);
+                usort($paths, array($this, '_cmp_length'));
+                foreach ($paths as $path) {
+                    // skip cookie paths the request path does not path-match
+                    if (!$this->_path_match($request_path, $path)) continue;
+                    $this->debug('     path match found: '.$path);
+                    foreach ($cookies[$path] as $name => $values) {
+                        // secure cookies are only sent over https
+                        if ($values[1] && !$request_secure) continue;
+                        // skip expired cookies (session cookies have a null expiry)
+                        if (is_int($values[2]) && ($values[2] < $now)) continue;
+                        $this->debug('      cookie match: '.$name.'='.$values[0]);
+                        $matched_cookies[] = $name.'='.$values[0];
+                    }
+                }
+            }
+            $domain = $this->_reduce_domain($domain);
+        }
+        return implode('; ', $matched_cookies);
+    }
+
+    /**
+    * Parse Set-Cookie values and store the resulting cookies in the jar.
+    *
+    * Cookies whose Domain or Path attribute does not match the request URL are rejected.
+    * A cookie whose Expires attribute cannot be parsed is stored as a session cookie; a
+    * cookie whose Expires attribute is at or before the Unix epoch is deleted from the jar.
+    *
+    * @param string $url absolute URL of the request which produced the Set-Cookie headers
+    * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
+    * @return void
+    * @see extractCookies()
+    */
+    public function storeCookies($url, $set_cookies)
+    {
+        if (!is_array($set_cookies) || count($set_cookies) == 0) return;
+        $url_parts = @parse_url($url);
+        if (!is_array($url_parts) || !isset($url_parts['host'])) return;
+        $request_host = $url_parts['host'];
+        if (strpos($request_host, '.') === false) $request_host .= '.local';
+        $request_path = isset($url_parts['path']) ? $url_parts['path'] : '';
+        if ($request_path == '') $request_path = '/';
+        //
+        // loop through set-cookie headers
+        //
+        foreach ($set_cookies as $set_cookie) {
+            $this->debug('Parsing: '.$set_cookie);
+            // temporary cookie store (before adding to jar)
+            $tmp_cookie = array();
+            // loop through params
+            foreach (explode(';', $set_cookie) as $x => $param) {
+                $key_val = explode('=', $param, 2);
+                if (count($key_val) != 2) {
+                    // if the first param isn't a name=value pair, continue to the next set-cookie
+                    // header
+                    if ($x == 0) continue 2;
+                    // check for secure flag
+                    if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
+                    // continue to next param
+                    continue;
+                }
+                list($key, $val) = array_map('trim', $key_val);
+                // first name=value pair is the cookie name and value
+                // the name and value are stored under 'name' and 'value' to avoid conflicts
+                // with later parameters.
+                if ($x == 0) {
+                    $tmp_cookie = array('name'=>$key, 'value'=>$val);
+                    continue;
+                }
+                $key = strtolower($key);
+                if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
+                    $tmp_cookie[$key] = $val;
+                }
+            }
+            //
+            // set cookie
+            //
+            // check domain
+            if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
+                    ($tmp_cookie['domain'] != ".$request_host")) {
+                $domain = $tmp_cookie['domain'];
+                if ((strpos($domain, '.') === false) && ($domain != 'local')) {
+                    $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
+                    continue;
+                }
+                if (preg_match('/\.[0-9]+$/', $domain)) {
+                    $this->debug(' - domain "'.$domain.'" appears to be an ip address');
+                    continue;
+                }
+                if (substr($domain, 0, 1) != '.') $domain = ".$domain";
+                if (!$this->_domain_match($request_host, $domain)) {
+                    $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
+                    continue;
+                }
+            } else {
+                // if domain is not specified in the set-cookie header, domain will default to
+                // the request host
+                $domain = $request_host;
+            }
+            // check path
+            if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
+                $path = urldecode($tmp_cookie['path']);
+                if (!$this->_path_match($request_path, $path)) {
+                    $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
+                    continue;
+                }
+            } else {
+                // default path: the request path up to (not including) its last slash
+                $path = substr($request_path, 0, strrpos($request_path, '/'));
+                if ($path == '') $path = '/';
+            }
+            // check if secure
+            $secure = isset($tmp_cookie['secure']);
+            // check expiry
+            $expires = null;
+            if (isset($tmp_cookie['expires'])) {
+                $expires = strtotime($tmp_cookie['expires']);
+                // strtotime() returns false for an unparseable date. Passing false on to
+                // set_cookie() would be treated as "expired" and silently drop the cookie,
+                // so fall back to a session cookie instead. Valid timestamps at or before
+                // the epoch are passed through so that set_cookie() deletes the cookie.
+                if ($expires === false) $expires = null;
+            }
+            // set cookie
+            $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
+        }
+    }
+
+    /**
+    * Extract Set-Cookie header values from a block of raw HTTP response headers.
+    *
+    * Parsing stops at the first empty line following at least one line (end of headers).
+    * Folded continuation lines (starting with whitespace) directly following a Set-Cookie
+    * header are appended to that header's value, separated by a single space.
+    *
+    * @param string $h raw HTTP response headers
+    * @return array list of Set-Cookie values, in the order they appear
+    */
+    public function extractCookies($h) {
+        $headers = array();
+        $last_match = false;
+        $seen_line = false;
+        foreach (explode("\n", $h) as $line) {
+            // strips the CR of a CRLF line ending along with any other trailing whitespace
+            $line = rtrim($line);
+            if ($seen_line) {
+                // an empty line marks the end of the headers
+                if ($line === '') {
+                    break;
+                }
+                // check for continuation line...
+                // RFC 2616 Section 2.2 "Basic Rules":
+                // HTTP/1.1 header field values can be folded onto multiple lines if the
+                // continuation line begins with a space or horizontal tab. All linear
+                // white space, including folding, has the same semantics as SP.
+                if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
+                    // append to previous header value
+                    $headers[count($headers)-1] .= ' '.rtrim($match[1]);
+                    continue;
+                }
+            }
+            $seen_line = true;
+
+            // split header name and value
+            if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
+                $headers[] = rtrim($match[1]);
+                $last_match = true;
+            } else {
+                $last_match = false;
+            }
+        }
+        return $headers;
+    }
+
+    /**
+    * Set Cookie
+    *
+    * Does nothing if $domain, $path or $name is empty, or if $value is empty and the
+    * cookie is not being deleted.
+    *
+    * @param string $domain
+    * @param string $path
+    * @param string $name cookie name
+    * @param string $value cookie value
+    * @param bool $secure
+    * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
+    * @return void
+    */
+    public function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
+    {
+        if ($domain == '' || $path == '' || $name == '') return;
+        // check if cookie needs to go
+        if (isset($expires) && ($expires <= 0)) {
+            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
+            return;
+        }
+        if ($value == '') return;
+        $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
+    }
+
+    /**
+    * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
+    * @param string $domain
+    * @param string $path
+    * @param string $name
+    * @return void
+    */
+    public function clear($domain=null, $path=null, $name=null)
+    {
+        if (!isset($domain)) {
+            $this->cookies = array();
+        } elseif (!isset($path)) {
+            if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
+        } elseif (!isset($name)) {
+            if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
+        } else {
+            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
+        }
+    }
+
+    /**
+    * Compare string length - used for sorting (longest string first)
+    * @param string $a
+    * @param string $b
+    * @access private
+    * @return int -1 if $a is longer than $b, 1 if shorter, 0 if equal length
+    */
+    function _cmp_length($a, $b)
+    {
+        $la = strlen($a); $lb = strlen($b);
+        if ($la == $lb) return 0;
+        return ($la > $lb) ? -1 : 1;
+    }
+
+    /**
+    * Reduce domain
+    *
+    * Strips one step from the front of a domain: a leading dot if there is one, otherwise
+    * everything before the first dot (e.g. www.example.com -> .example.com -> example.com).
+    * A domain containing no dot is returned unchanged.
+    *
+    * @param string $domain
+    * @return string
+    * @access private
+    */
+    function _reduce_domain($domain)
+    {
+        if ($domain == '') return '';
+        if (substr($domain, 0, 1) == '.') return substr($domain, 1);
+        $dot = strpos($domain, '.');
+        if ($dot === false) return $domain;
+        return substr($domain, $dot);
+    }
+
+    /**
+    * Path match - check if path1 path-matches path2
+    *
+    * From RFC 2965: 
+    *   <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
+    *   if P2 is a prefix of P1 (including the case where P1 and P2 string-
+    *   compare equal).  Thus, the string /tec/waldo path-matches /tec.</i>
+    * @param string $path1
+    * @param string $path2
+    * @return bool
+    * @access private
+    */
+    function _path_match($path1, $path2)
+    {
+        // byte-wise prefix comparison (avoids loose == treating numeric-looking strings as equal)
+        return (strncmp($path1, $path2, strlen($path2)) === 0);
+    }
+
+    /**
+    * Domain match - check if domain1 domain-matches domain2
+    *
+    * The comparison is case-insensitive. domain1 is repeatedly reduced (see _reduce_domain())
+    * and compared against domain2 until it no longer contains a dot.
+    *
+    * A few extracts from RFC 2965: 
+    *  -  A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
+    *     would be rejected, because H is y.x and contains a dot.
+    *
+    *  -  A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
+    *     would be accepted.
+    *
+    *  -  A Set-Cookie2 with Domain=.com or Domain=.com., will always be
+    *     rejected, because there is no embedded dot.
+    *
+    *  -  A Set-Cookie2 from request-host example for Domain=.local will
+    *     be accepted, because the effective host name for the request-
+    *     host is example.local, and example.local domain-matches .local.
+    *
+    * I'm ignoring the first point for now (must check to see how other browsers handle
+    * this rule for Set-Cookie headers)
+    *
+    * @param string $domain1
+    * @param string $domain2
+    * @return bool
+    * @access private
+    */
+    function _domain_match($domain1, $domain2)
+    {
+        $candidate = strtolower($domain1);
+        $target = strtolower($domain2);
+        while (strpos($candidate, '.') !== false) {
+            if ($candidate === $target) return true;
+            $candidate = $this->_reduce_domain($candidate);
+        }
+        return false;
+    }
+}
\ No newline at end of file
index e4f1b3b3834c989cb22925fc17e14b4e6b70891a..963f0c05296e00f8bc9d38ef8aac9e5b950ea2b3 100644 (file)
-<?php\r
-/**\r
- * Humble HTTP Agent\r
- * \r
- * This class is designed to take advantage of parallel HTTP requests\r
- * offered by PHP's PECL HTTP extension or the curl_multi_* functions. \r
- * For environments which do not have these options, it reverts to standard sequential \r
- * requests (using file_get_contents())\r
- * \r
- * @version 1.1\r
- * @date 2012-08-20\r
- * @see http://php.net/HttpRequestPool\r
- * @author Keyvan Minoukadeh\r
- * @copyright 2011-2012 Keyvan Minoukadeh\r
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
- */\r
-\r
-class HumbleHttpAgent\r
-{\r
-       const METHOD_REQUEST_POOL = 1;\r
-       const METHOD_CURL_MULTI = 2;\r
-       const METHOD_FILE_GET_CONTENTS = 4;\r
-       //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';\r
-       const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';\r
-       const UA_PHP = 'PHP/5.2';\r
-       const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';\r
-       \r
-       protected $requests = array();\r
-       protected $redirectQueue = array();\r
-       protected $requestOptions;\r
-       protected $maxParallelRequests = 5;\r
-       protected $cache = null; //TODO\r
-       protected $httpContext;\r
-       protected $minimiseMemoryUse = false; //TODO\r
-       protected $method;\r
-       protected $cookieJar;\r
-       public $debug = false;\r
-       public $debugVerbose = false;\r
-       public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html\r
-       public $maxRedirects = 5;\r
-       public $userAgentMap = array();\r
-       public $rewriteUrls = array();\r
-       public $userAgentDefault;\r
-       public $referer;\r
-       //public $userAgent = 'Mozilla/5.0';\r
-       \r
-       // Prevent certain file/mime types\r
-       // HTTP responses which match these content types will\r
-       // be returned without body.\r
-       public $headerOnlyTypes = array();\r
-       // URLs ending with one of these extensions will\r
-       // prompt Humble HTTP Agent to send a HEAD request first\r
-       // to see if returned content type matches $headerOnlyTypes.\r
-       public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');\r
-       // AJAX triggers to search for.\r
-       // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
-       public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');\r
-       \r
-       //TODO: set max file size\r
-       //TODO: normalise headers\r
-       \r
-       function __construct($requestOptions=null, $method=null) {\r
-               $this->userAgentDefault = self::UA_BROWSER;\r
-               $this->referer = self::REF_GOOGLE;\r
-               // set the request method\r
-               if (in_array($method, array(1,2,4))) {\r
-                       $this->method = $method;\r
-               } else {\r
-                       if (class_exists('HttpRequestPool')) {\r
-                               $this->method = self::METHOD_REQUEST_POOL;\r
-                       } elseif (function_exists('curl_multi_init')) {\r
-                               $this->method = self::METHOD_CURL_MULTI;\r
-                       } else {\r
-                               $this->method = self::METHOD_FILE_GET_CONTENTS;\r
-                       }\r
-               }\r
-               if ($this->method == self::METHOD_CURL_MULTI) {\r
-                       require_once(dirname(__FILE__).'/RollingCurl.php');\r
-               }\r
-               // create cookie jar\r
-               $this->cookieJar = new CookieJar();\r
-               // set request options (redirect must be 0)\r
-               $this->requestOptions = array(\r
-                       'timeout' => 15,\r
-                       'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web\r
-                       // TODO: test onprogress?\r
-               );\r
-               if (is_array($requestOptions)) {\r
-                       $this->requestOptions = array_merge($this->requestOptions, $requestOptions);\r
-               }\r
-               $this->httpContext = array(\r
-                       'http' => array(\r
-                               'ignore_errors' => true,\r
-                               'timeout' => $this->requestOptions['timeout'],\r
-                               'max_redirects' => $this->requestOptions['redirect'],\r
-                               'header' => "Accept: */*\r\n"\r
-                               )\r
-                       );\r
-       }\r
-       \r
-       protected function debug($msg) {\r
-               if ($this->debug) {\r
-                       $mem = round(memory_get_usage()/1024, 2);\r
-                       $memPeak = round(memory_get_peak_usage()/1024, 2);\r
-                       echo '* ',$msg;\r
-                       if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";\r
-                       echo "\n";\r
-                       ob_flush();\r
-                       flush();\r
-               }\r
-       }\r
-       \r
-       protected function getUserAgent($url, $asArray=false) {\r
-               $host = @parse_url($url, PHP_URL_HOST);\r
-               if (strtolower(substr($host, 0, 4)) == 'www.') {\r
-                       $host = substr($host, 4);\r
-               }\r
-               if ($host) {\r
-                       $try = array($host);\r
-                       $split = explode('.', $host);\r
-                       if (count($split) > 1) {\r
-                               array_shift($split);\r
-                               $try[] = '.'.implode('.', $split);\r
-                       }\r
-                       foreach ($try as $h) {\r
-                               if (isset($this->userAgentMap[$h])) {\r
-                                       $ua = $this->userAgentMap[$h];\r
-                                       break;\r
-                               }\r
-                       }\r
-               }\r
-               if (!isset($ua)) $ua = $this->userAgentDefault;\r
-               if ($asArray) {\r
-                       return array('User-Agent' => $ua);\r
-               } else {\r
-                       return 'User-Agent: '.$ua;\r
-               }\r
-       }\r
-       \r
-       public function rewriteHashbangFragment($url) {\r
-               // return $url if there's no '#!'\r
-               if (strpos($url, '#!') === false) return $url;\r
-               // split $url and rewrite\r
-               // TODO: is SimplePie_IRI included?\r
-               $iri = new SimplePie_IRI($url);\r
-               $fragment = substr($iri->fragment, 1); // strip '!'\r
-               $iri->fragment = null;\r
-               if (isset($iri->query)) {\r
-                       parse_str($iri->query, $query);\r
-               } else {\r
-                       $query = array();\r
-               }\r
-               $query['_escaped_fragment_'] = (string)$fragment;\r
-               $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites\r
-               return $iri->get_iri();\r
-       }\r
-       \r
-       public function getUglyURL($url, $html) {\r
-               if ($html == '') return false;\r
-               $found = false;\r
-               foreach ($this->ajaxTriggers as $string) {\r
-                       if (stripos($html, $string)) {\r
-                               $found = true;\r
-                               break;\r
-                       }\r
-               }\r
-               if (!$found) return false;\r
-               $iri = new SimplePie_IRI($url);\r
-               if (isset($iri->query)) {\r
-                       parse_str($iri->query, $query);\r
-               } else {\r
-                       $query = array();\r
-               }\r
-               $query['_escaped_fragment_'] = '';\r
-               $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites\r
-               return $iri->get_iri();\r
-       }\r
-       \r
-       public function removeFragment($url) {\r
-               $pos = strpos($url, '#');\r
-               if ($pos === false) {\r
-                       return $url;\r
-               } else {\r
-                       return substr($url, 0, $pos);\r
-               }\r
-       }\r
-       \r
-       public function rewriteUrls($url) {\r
-               foreach ($this->rewriteUrls as $find => $action) {\r
-                       if (strpos($url, $find) !== false) {\r
-                               if (is_array($action)) {\r
-                                       return strtr($url, $action);\r
-                               }\r
-                       }\r
-               }\r
-               return $url;\r
-       }\r
-       \r
-       public function enableDebug($bool=true) {\r
-               $this->debug = (bool)$bool;\r
-       }\r
-       \r
-       public function minimiseMemoryUse($bool = true) {\r
-               $this->minimiseMemoryUse = $bool;\r
-       }\r
-       \r
-       public function setMaxParallelRequests($max) {\r
-               $this->maxParallelRequests = $max;\r
-       }\r
-       \r
-       public function validateUrl($url) {\r
-               $url = filter_var($url, FILTER_SANITIZE_URL);\r
-               $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);\r
-               // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)\r
-               if ($test === false) {\r
-                       $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);\r
-               }\r
-               if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {\r
-                       return $url;\r
-               } else {\r
-                       return false;\r
-               }\r
-       }\r
-       \r
-       public function fetchAll(array $urls) {\r
-               $this->fetchAllOnce($urls, $isRedirect=false);\r
-               $redirects = 0;\r
-               while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {\r
-                       $this->debug("Following redirects #$redirects...");\r
-                       $this->fetchAllOnce($this->redirectQueue, $isRedirect=true);\r
-               }\r
-       }\r
-       \r
-       // fetch all URLs without following redirects\r
-       public function fetchAllOnce(array $urls, $isRedirect=false) {\r
-               if (!$isRedirect) $urls = array_unique($urls);\r
-               if (empty($urls)) return;\r
-               \r
-               //////////////////////////////////////////////////////\r
-               // parallel (HttpRequestPool)\r
-               if ($this->method == self::METHOD_REQUEST_POOL) {\r
-                       $this->debug('Starting parallel fetch (HttpRequestPool)');\r
-                       try {\r
-                               while (count($urls) > 0) {\r
-                                       $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));\r
-                                       $subset = array_splice($urls, 0, $this->maxParallelRequests);\r
-                                       $pool = new HttpRequestPool();\r
-                                       foreach ($subset as $orig => $url) {\r
-                                               if (!$isRedirect) $orig = $url;\r
-                                               unset($this->redirectQueue[$orig]);\r
-                                               $this->debug("...$url");\r
-                                               if (!$isRedirect && isset($this->requests[$url])) {\r
-                                                       $this->debug("......in memory");\r
-                                               /*\r
-                                               } elseif ($this->isCached($url)) {\r
-                                                       $this->debug("......is cached");\r
-                                                       if (!$this->minimiseMemoryUse) {\r
-                                                               $this->requests[$url] = $this->getCached($url);\r
-                                                       }\r
-                                               */\r
-                                               } else {\r
-                                                       $this->debug("......adding to pool");\r
-                                                       $req_url = $this->rewriteUrls($url);\r
-                                                       $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
-                                                       $req_url = $this->removeFragment($req_url);\r
-                                                       if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {\r
-                                                               $_meth = HttpRequest::METH_HEAD;\r
-                                                       } else {\r
-                                                               $_meth = HttpRequest::METH_GET;\r
-                                                               unset($this->requests[$orig]['wrongGuess']);\r
-                                                       }\r
-                                                       $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);\r
-                                                       // send cookies, if we have any\r
-                                                       if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
-                                                               $this->debug("......sending cookies: $cookies");\r
-                                                               $httpRequest->addHeaders(array('Cookie' => $cookies));\r
-                                                       }\r
-                                                       //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));\r
-                                                       $httpRequest->addHeaders($this->getUserAgent($req_url, true));\r
-                                                       // add referer for picky sites\r
-                                                       $httpRequest->addheaders(array('Referer' => $this->referer));\r
-                                                       $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);\r
-                                                       $this->requests[$orig]['original_url'] = $orig;\r
-                                                       $pool->attach($httpRequest);\r
-                                               }\r
-                                       }\r
-                                       // did we get anything into the pool?\r
-                                       if (count($pool) > 0) {\r
-                                               $this->debug('Sending request...');\r
-                                               try {\r
-                                                       $pool->send();\r
-                                               } catch (HttpRequestPoolException $e) {\r
-                                                       // do nothing\r
-                                               }\r
-                                               $this->debug('Received responses');\r
-                                               foreach($subset as $orig => $url) {\r
-                                                       if (!$isRedirect) $orig = $url;\r
-                                                       $request = $this->requests[$orig]['httpRequest'];\r
-                                                       //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());\r
-                                                       // getResponseHeader() doesn't return status line, so, for consistency...\r
-                                                       $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));\r
-                                                       // check content type\r
-                                                       // TODO: use getResponseHeader('content-type') or getResponseInfo()\r
-                                                       if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
-                                                               $this->requests[$orig]['body'] = '';\r
-                                                               $_header_only_type = true;\r
-                                                               $this->debug('Header only type returned');\r
-                                                       } else {\r
-                                                               $this->requests[$orig]['body'] = $request->getResponseBody();\r
-                                                               $_header_only_type = false;\r
-                                                       }\r
-                                                       $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');\r
-                                                       $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();\r
-                                                       // is redirect?\r
-                                                       if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {\r
-                                                               $redirectURL = $request->getResponseHeader('location');\r
-                                                               if (!preg_match('!^https?://!i', $redirectURL)) {\r
-                                                                       $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
-                                                               }\r
-                                                               if ($this->validateURL($redirectURL)) {\r
-                                                                       $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
-                                                                       // store any cookies\r
-                                                                       $cookies = $request->getResponseHeader('set-cookie');\r
-                                                                       if ($cookies && !is_array($cookies)) $cookies = array($cookies);\r
-                                                                       if ($cookies) $this->cookieJar->storeCookies($url, $cookies);\r
-                                                                       $this->redirectQueue[$orig] = $redirectURL;\r
-                                                               } else {\r
-                                                                       $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
-                                                               }\r
-                                                       } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {\r
-                                                               // the response content-type did not match our 'header only' types, \r
-                                                               // but we'd issues a HEAD request because we assumed it would. So\r
-                                                               // let's queue a proper GET request for this item...\r
-                                                               $this->debug('Wrong guess at content-type, queing GET request');\r
-                                                               $this->requests[$orig]['wrongGuess'] = true;\r
-                                                               $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];\r
-                                                       } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
-                                                               // check for <meta name='fragment' content='!'/>\r
-                                                               // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
-                                                               // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
-                                                               if (isset($this->requests[$orig]['body'])) {\r
-                                                                       $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
-                                                                       if ($redirectURL) {\r
-                                                                               $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
-                                                                               $this->redirectQueue[$orig] = $redirectURL;\r
-                                                                       }\r
-                                                               }\r
-                                                       }\r
-                                                       //die($url.' -multi- '.$request->getResponseInfo('effective_url'));\r
-                                                       $pool->detach($request);\r
-                                                       unset($this->requests[$orig]['httpRequest'], $request);\r
-                                                       /*\r
-                                                       if ($this->minimiseMemoryUse) {\r
-                                                               if ($this->cache($url)) {\r
-                                                                       unset($this->requests[$url]);\r
-                                                               }\r
-                                                       }\r
-                                                       */\r
-                                               }\r
-                                       }\r
-                               }\r
-                       } catch (HttpException $e) {\r
-                               $this->debug($e);\r
-                               return false;\r
-                       }\r
-               }\r
-               \r
-               //////////////////////////////////////////////////////////\r
-               // parallel (curl_multi_*)\r
-               elseif ($this->method == self::METHOD_CURL_MULTI) {\r
-                       $this->debug('Starting parallel fetch (curl_multi_*)');\r
-                       while (count($urls) > 0) {\r
-                               $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));\r
-                               $subset = array_splice($urls, 0, $this->maxParallelRequests);\r
-                               $pool = new RollingCurl(array($this, 'handleCurlResponse'));\r
-                               $pool->window_size = count($subset);            \r
-                               \r
-                               foreach ($subset as $orig => $url) {\r
-                                       if (!$isRedirect) $orig = $url;\r
-                                       unset($this->redirectQueue[$orig]);\r
-                                       $this->debug("...$url");\r
-                                       if (!$isRedirect && isset($this->requests[$url])) {\r
-                                               $this->debug("......in memory");\r
-                                       /*\r
-                                       } elseif ($this->isCached($url)) {\r
-                                               $this->debug("......is cached");\r
-                                               if (!$this->minimiseMemoryUse) {\r
-                                                       $this->requests[$url] = $this->getCached($url);\r
-                                               }\r
-                                       */\r
-                                       } else {\r
-                                               $this->debug("......adding to pool");\r
-                                               $req_url = $this->rewriteUrls($url);\r
-                                               $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
-                                               $req_url = $this->removeFragment($req_url);\r
-                                               if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {\r
-                                                       $_meth = 'HEAD';\r
-                                               } else {\r
-                                                       $_meth = 'GET';\r
-                                                       unset($this->requests[$orig]['wrongGuess']);\r
-                                               }                                               \r
-                                               $headers = array();\r
-                                               //$headers[] = 'User-Agent: '.$this->userAgent;\r
-                                               $headers[] = $this->getUserAgent($req_url);\r
-                                               // add referer for picky sites\r
-                                               $headers[] = 'Referer: '.$this->referer;\r
-                                               // send cookies, if we have any\r
-                                               if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
-                                                       $this->debug("......sending cookies: $cookies");\r
-                                                       $headers[] = 'Cookie: '.$cookies;\r
-                                               }\r
-                                               $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(\r
-                                                       CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],\r
-                                                       CURLOPT_TIMEOUT => $this->requestOptions['timeout']\r
-                                                       ));\r
-                                               $httpRequest->set_original_url($orig);\r
-                                               $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);\r
-                                               $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?\r
-                                               $pool->add($httpRequest);\r
-                                       }\r
-                               }\r
-                               // did we get anything into the pool?\r
-                               if (count($pool) > 0) {\r
-                                       $this->debug('Sending request...');\r
-                                       $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]\r
-                                       $this->debug('Received responses');\r
-                                       foreach($subset as $orig => $url) {\r
-                                               if (!$isRedirect) $orig = $url;\r
-                                               // $this->requests[$orig]['headers']\r
-                                               // $this->requests[$orig]['body']\r
-                                               // $this->requests[$orig]['effective_url']\r
-                                               // check content type\r
-                                               if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
-                                                       $this->requests[$orig]['body'] = '';\r
-                                                       $_header_only_type = true;\r
-                                                       $this->debug('Header only type returned');\r
-                                               } else {\r
-                                                       $_header_only_type = false;\r
-                                               }\r
-                                               $status_code = $this->requests[$orig]['status_code'];\r
-                                               if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {\r
-                                                       $redirectURL = $this->requests[$orig]['location'];\r
-                                                       if (!preg_match('!^https?://!i', $redirectURL)) {\r
-                                                               $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
-                                                       }\r
-                                                       if ($this->validateURL($redirectURL)) {\r
-                                                               $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
-                                                               // store any cookies\r
-                                                               $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);\r
-                                                               if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);                                                   \r
-                                                               $this->redirectQueue[$orig] = $redirectURL;\r
-                                                       } else {\r
-                                                               $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
-                                                       }\r
-                                               } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {\r
-                                                       // the response content-type did not match our 'header only' types, \r
-                                                       // but we'd issues a HEAD request because we assumed it would. So\r
-                                                       // let's queue a proper GET request for this item...\r
-                                                       $this->debug('Wrong guess at content-type, queing GET request');\r
-                                                       $this->requests[$orig]['wrongGuess'] = true;\r
-                                                       $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];\r
-                                               } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
-                                                       // check for <meta name='fragment' content='!'/>\r
-                                                       // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
-                                                       // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
-                                                       if (isset($this->requests[$orig]['body'])) {\r
-                                                               $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
-                                                               if ($redirectURL) {\r
-                                                                       $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
-                                                                       $this->redirectQueue[$orig] = $redirectURL;\r
-                                                               }\r
-                                                       }\r
-                                               }\r
-                                               // die($url.' -multi- '.$request->getResponseInfo('effective_url'));\r
-                                               unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-\r
-               //////////////////////////////////////////////////////\r
-               // sequential (file_get_contents)\r
-               else {\r
-                       $this->debug('Starting sequential fetch (file_get_contents)');\r
-                       $this->debug('Processing set of '.count($urls));\r
-                       foreach ($urls as $orig => $url) {\r
-                               if (!$isRedirect) $orig = $url;\r
-                               unset($this->redirectQueue[$orig]);\r
-                               $this->debug("...$url");\r
-                               if (!$isRedirect && isset($this->requests[$url])) {\r
-                                       $this->debug("......in memory");\r
-                               /*\r
-                               } elseif ($this->isCached($url)) {\r
-                                       $this->debug("......is cached");\r
-                                       if (!$this->minimiseMemoryUse) {\r
-                                               $this->requests[$url] = $this->getCached($url);\r
-                                       }\r
-                               */\r
-                               } else {\r
-                                       $this->debug("Sending request for $url");\r
-                                       $this->requests[$orig]['original_url'] = $orig;\r
-                                       $req_url = $this->rewriteUrls($url);\r
-                                       $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;\r
-                                       $req_url = $this->removeFragment($req_url);\r
-                                       // send cookies, if we have any\r
-                                       $httpContext = $this->httpContext;\r
-                                       $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";\r
-                                       // add referer for picky sites\r
-                                       $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";\r
-                                       if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {\r
-                                               $this->debug("......sending cookies: $cookies");\r
-                                               $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";\r
-                                       }\r
-                                       if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {\r
-                                               $this->debug('Received response');\r
-                                               // get status code\r
-                                               if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {\r
-                                                       $this->debug('Error: no status code found');\r
-                                                       // TODO: handle error - no status code\r
-                                               } else {\r
-                                                       $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);\r
-                                                       // check content type\r
-                                                       if ($this->headerOnlyType($this->requests[$orig]['headers'])) {\r
-                                                               $this->requests[$orig]['body'] = '';\r
-                                                       } else {\r
-                                                               $this->requests[$orig]['body'] = $html;\r
-                                                       }\r
-                                                       $this->requests[$orig]['effective_url'] = $req_url;\r
-                                                       $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];\r
-                                                       unset($match);\r
-                                                       // handle redirect\r
-                                                       if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {\r
-                                                               $this->requests[$orig]['location'] =  trim($match[1]);\r
-                                                       }\r
-                                                       if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {\r
-                                                               $redirectURL = $this->requests[$orig]['location'];\r
-                                                               if (!preg_match('!^https?://!i', $redirectURL)) {\r
-                                                                       $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);\r
-                                                               }\r
-                                                               if ($this->validateURL($redirectURL)) {\r
-                                                                       $this->debug('Redirect detected. Valid URL: '.$redirectURL);\r
-                                                                       // store any cookies\r
-                                                                       $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);\r
-                                                                       if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);\r
-                                                                       $this->redirectQueue[$orig] = $redirectURL;\r
-                                                               } else {\r
-                                                                       $this->debug('Redirect detected. Invalid URL: '.$redirectURL);\r
-                                                               }\r
-                                                       } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {\r
-                                                               // check for <meta name='fragment' content='!'/>\r
-                                                               // for AJAX sites, e.g. Blogger with its dynamic views templates.\r
-                                                               // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification\r
-                                                               if (isset($this->requests[$orig]['body'])) {\r
-                                                                       $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));\r
-                                                                       if ($redirectURL) {\r
-                                                                               $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);\r
-                                                                               $this->redirectQueue[$orig] = $redirectURL;\r
-                                                                       }\r
-                                                               }\r
-                                                       }\r
-                                               }\r
-                                       } else {\r
-                                               $this->debug('Error retrieving URL');\r
-                                               //print_r($req_url);\r
-                                               //print_r($http_response_header);\r
-                                               //print_r($html);\r
-                                               \r
-                                               // TODO: handle error - failed to retrieve URL\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-       }\r
-       \r
-       public function handleCurlResponse($response, $info, $request) {\r
-               $orig = $request->url_original;\r
-               $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);\r
-               $this->requests[$orig]['body'] = substr($response, $info['header_size']);\r
-               $this->requests[$orig]['method'] = $request->method;\r
-               $this->requests[$orig]['effective_url'] = $info['url'];\r
-               $this->requests[$orig]['status_code'] = (int)$info['http_code'];\r
-               if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {\r
-                       $this->requests[$orig]['location'] =  trim($match[1]);\r
-               }\r
-       }\r
-       \r
-       protected function headersToString(array $headers, $associative=true) {\r
-               if (!$associative) {\r
-                       return implode("\n", $headers);\r
-               } else {\r
-                       $str = '';\r
-                       foreach ($headers as $key => $val) {\r
-                               if (is_array($val)) {\r
-                                       foreach ($val as $v) $str .= "$key: $v\n";\r
-                               } else {\r
-                                       $str .= "$key: $val\n";\r
-                               }\r
-                       }\r
-                       return rtrim($str);\r
-               }\r
-       }\r
-       \r
-       public function get($url, $remove=false, $gzdecode=true) {\r
-               $url = "$url";\r
-               if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {\r
-                       $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");\r
-                       $response = $this->requests[$url];\r
-               /*\r
-               } elseif ($this->isCached($url)) {\r
-                       $this->debug("URL already fetched - in disk cache ($url)");\r
-                       $response = $this->getCached($url);\r
-                       $this->requests[$url] = $response;\r
-               */\r
-               } else {\r
-                       $this->debug("Fetching URL ($url)");\r
-                       $this->fetchAll(array($url));\r
-                       if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {\r
-                               $response = $this->requests[$url];\r
-                       } else {\r
-                               $this->debug("Request failed");\r
-                               $response = false;\r
-                       }\r
-               }\r
-               /*\r
-               if ($this->minimiseMemoryUse && $response) {\r
-                       $this->cache($url);\r
-                       unset($this->requests[$url]);\r
-               }\r
-               */\r
-               if ($remove && $response) unset($this->requests[$url]);\r
-               if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {\r
-                       if ($html = gzdecode($response['body'])) {\r
-                               $response['body'] = $html;\r
-                       }\r
-               }\r
-               return $response;\r
-       }\r
-       \r
-       public function parallelSupport() {\r
-               return class_exists('HttpRequestPool') || function_exists('curl_multi_init');\r
-       }\r
-       \r
-       private function headerOnlyType($headers) {\r
-               if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {\r
-                       // look for full mime type (e.g. image/jpeg) or just type (e.g. image)\r
-                       $match[1] = strtolower(trim($match[1]));\r
-                       $match[2] = strtolower(trim($match[2]));\r
-                       foreach (array($match[1], $match[2]) as $mime) {\r
-                               if (in_array($mime, $this->headerOnlyTypes)) return true;\r
-                       }\r
-               }\r
-               return false;\r
-       }\r
-       \r
-       private function possibleUnsupportedType($url) {\r
-               $path = @parse_url($url, PHP_URL_PATH);\r
-               if ($path && strpos($path, '.') !== false) {\r
-                       $ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));\r
-                       return in_array($ext, $this->headerOnlyClues);\r
-               }\r
-               return false;\r
-       }\r
-}\r
-\r
-// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930\r
-if (!function_exists('gzdecode')) {\r
-       function gzdecode($data,&$filename='',&$error='',$maxlength=null) \r
-       {\r
-               $len = strlen($data);\r
-               if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {\r
-                       $error = "Not in GZIP format.";\r
-                       return null;  // Not GZIP format (See RFC 1952)\r
-               }\r
-               $method = ord(substr($data,2,1));  // Compression method\r
-               $flags  = ord(substr($data,3,1));  // Flags\r
-               if ($flags & 31 != $flags) {\r
-                       $error = "Reserved bits not allowed.";\r
-                       return null;\r
-               }\r
-               // NOTE: $mtime may be negative (PHP integer limitations)\r
-               $mtime = unpack("V", substr($data,4,4));\r
-               $mtime = $mtime[1];\r
-               $xfl   = substr($data,8,1);\r
-               $os    = substr($data,8,1);\r
-               $headerlen = 10;\r
-               $extralen  = 0;\r
-               $extra     = "";\r
-               if ($flags & 4) {\r
-                       // 2-byte length prefixed EXTRA data in header\r
-                       if ($len - $headerlen - 2 < 8) {\r
-                               return false;  // invalid\r
-                       }\r
-                       $extralen = unpack("v",substr($data,8,2));\r
-                       $extralen = $extralen[1];\r
-                       if ($len - $headerlen - 2 - $extralen < 8) {\r
-                               return false;  // invalid\r
-                       }\r
-                       $extra = substr($data,10,$extralen);\r
-                       $headerlen += 2 + $extralen;\r
-               }\r
-               $filenamelen = 0;\r
-               $filename = "";\r
-               if ($flags & 8) {\r
-                       // C-style string\r
-                       if ($len - $headerlen - 1 < 8) {\r
-                               return false; // invalid\r
-                       }\r
-                       $filenamelen = strpos(substr($data,$headerlen),chr(0));\r
-                       if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {\r
-                               return false; // invalid\r
-                       }\r
-                       $filename = substr($data,$headerlen,$filenamelen);\r
-                       $headerlen += $filenamelen + 1;\r
-               }\r
-               $commentlen = 0;\r
-               $comment = "";\r
-               if ($flags & 16) {\r
-                       // C-style string COMMENT data in header\r
-                       if ($len - $headerlen - 1 < 8) {\r
-                               return false;    // invalid\r
-                       }\r
-                       $commentlen = strpos(substr($data,$headerlen),chr(0));\r
-                       if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {\r
-                               return false;    // Invalid header format\r
-                       }\r
-                       $comment = substr($data,$headerlen,$commentlen);\r
-                       $headerlen += $commentlen + 1;\r
-               }\r
-               $headercrc = "";\r
-               if ($flags & 2) {\r
-                       // 2-bytes (lowest order) of CRC32 on header present\r
-                       if ($len - $headerlen - 2 < 8) {\r
-                               return false;    // invalid\r
-                       }\r
-                       $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;\r
-                       $headercrc = unpack("v", substr($data,$headerlen,2));\r
-                       $headercrc = $headercrc[1];\r
-                       if ($headercrc != $calccrc) {\r
-                               $error = "Header checksum failed.";\r
-                               return false;    // Bad header CRC\r
-                       }\r
-                       $headerlen += 2;\r
-               }\r
-               // GZIP FOOTER\r
-               $datacrc = unpack("V",substr($data,-8,4));\r
-               $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);\r
-               $isize = unpack("V",substr($data,-4));\r
-               $isize = $isize[1];\r
-               // decompression:\r
-               $bodylen = $len-$headerlen-8;\r
-               if ($bodylen < 1) {\r
-                       // IMPLEMENTATION BUG!\r
-                       return null;\r
-               }\r
-               $body = substr($data,$headerlen,$bodylen);\r
-               $data = "";\r
-               if ($bodylen > 0) {\r
-                       switch ($method) {\r
-                       case 8:\r
-                               // Currently the only supported compression method:\r
-                               $data = gzinflate($body,$maxlength);\r
-                               break;\r
-                       default:\r
-                               $error = "Unknown compression method.";\r
-                               return false;\r
-                       }\r
-               }  // zero-byte body content is allowed\r
-               // Verifiy CRC32\r
-               $crc   = sprintf("%u",crc32($data));\r
-               $crcOK = $crc == $datacrc;\r
-               $lenOK = $isize == strlen($data);\r
-               if (!$lenOK || !$crcOK) {\r
-                       $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');\r
-                       return false;\r
-               }\r
-               return $data;\r
-       }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Humble HTTP Agent
+ * 
+ * This class is designed to take advantage of parallel HTTP requests
+ * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 
+ * For environments which do not have these options, it reverts to standard sequential 
+ * requests (using file_get_contents())
+ * 
+ * @version 1.4
+ * @date 2013-05-10
+ * @see http://php.net/HttpRequestPool
+ * @author Keyvan Minoukadeh
+ * @copyright 2011-2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class HumbleHttpAgent
+{
+       const METHOD_REQUEST_POOL = 1;
+       const METHOD_CURL_MULTI = 2;
+       const METHOD_FILE_GET_CONTENTS = 4;
+       //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
+       const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
+       const UA_PHP = 'PHP/5.4';
+       const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
+       
+       protected $requests = array();
+       protected $redirectQueue = array();
+       protected $requestOptions;
+       protected $maxParallelRequests = 5;
+       protected $cache = null; //TODO
+       protected $httpContext;
+       protected $minimiseMemoryUse = false; //TODO
+       protected $method;
+       protected $cookieJar;
+       public $debug = false;
+       public $debugVerbose = false;
+       public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
+       public $maxRedirects = 5;
+       public $userAgentMap = array();
+       public $rewriteUrls = array();
+       public $userAgentDefault;
+       public $referer;
+       //public $userAgent = 'Mozilla/5.0';
+       
+       // Prevent certain file/mime types
+       // HTTP responses which match these content types will
+       // be returned without body.
+       public $headerOnlyTypes = array();
+       // URLs ending with one of these extensions will
+       // prompt Humble HTTP Agent to send a HEAD request first
+       // to see if returned content type matches $headerOnlyTypes.
+       public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
+       // AJAX triggers to search for.
+       // for AJAX sites, e.g. Blogger with its dynamic views templates.
+       public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
+       
+       //TODO: set max file size
+       //TODO: normalise headers
+       
+       /**
+        * Set up default user agent/referer, choose the transport, create the
+        * cookie jar and prepare request options and the stream context array.
+        *
+        * @param array|null $requestOptions options merged over the built-in defaults
+        * @param int|null   $method         one of the METHOD_* constants; any other
+        *                                   value triggers auto-detection
+        */
+       function __construct($requestOptions=null, $method=null) {
+               $this->userAgentDefault = self::UA_BROWSER;
+               $this->referer = self::REF_GOOGLE;
+               // set the request method: explicit choice first, then best available
+               $known = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
+               if (in_array($method, $known)) {
+                       $this->method = $method;
+               } elseif (class_exists('HttpRequestPool')) {
+                       $this->method = self::METHOD_REQUEST_POOL;
+               } elseif (function_exists('curl_multi_init')) {
+                       $this->method = self::METHOD_CURL_MULTI;
+               } else {
+                       $this->method = self::METHOD_FILE_GET_CONTENTS;
+               }
+               // the curl_multi transport needs the RollingCurl helper
+               if ($this->method == self::METHOD_CURL_MULTI) {
+                       require_once(dirname(__FILE__).'/RollingCurl.php');
+               }
+               // create cookie jar
+               $this->cookieJar = new CookieJar();
+               // default request options (redirect must be 0: redirects are followed
+               // manually so that hashbang URLs can be rewritten along the way)
+               $defaults = array(
+                       'timeout' => 15,
+                       'connecttimeout' => 15,
+                       'dns_cache_timeout' => 300,
+                       'redirect' => 0
+               );
+               $this->requestOptions = is_array($requestOptions) ? array_merge($defaults, $requestOptions) : $defaults;
+               // stream context settings derived from the request options
+               $http = array(
+                       'ignore_errors' => true,
+                       'timeout' => $this->requestOptions['timeout'],
+                       'max_redirects' => $this->requestOptions['redirect'],
+                       'header' => "Accept: */*\r\n"
+               );
+               $this->httpContext = array('http' => $http);
+       }
+       
+       /**
+        * Print a debug line ("* message") when debugging is enabled.
+        *
+        * With $debugVerbose set, current and peak memory usage (bytes / 1024)
+        * are appended to the line. Output is flushed straight away.
+        *
+        * @param string $msg message to print
+        * @return void
+        */
+       protected function debug($msg) {
+               if (!$this->debug) return;
+               echo '* ',$msg;
+               if ($this->debugVerbose) {
+                       // only measure memory when the figures are actually printed
+                       $mem = round(memory_get_usage()/1024, 2);
+                       $memPeak = round(memory_get_peak_usage()/1024, 2);
+                       echo ' - mem used: ',$mem," (peak: $memPeak)";
+               }
+               echo "\n";
+               // ob_flush() raises a notice when no output buffer is active
+               if (ob_get_level() > 0) ob_flush();
+               flush();
+       }
+       
+       /**
+        * Pick the User-Agent for a URL.
+        *
+        * The host (without a leading "www.") is looked up in $userAgentMap first,
+        * then the host with its first label replaced by a leading dot
+        * (e.g. ".example.org"). $userAgentDefault is used when neither matches.
+        *
+        * @param string $url     URL about to be requested
+        * @param bool   $asArray true to get array('User-Agent' => ...), false for a header line
+        * @return array|string
+        */
+       protected function getUserAgent($url, $asArray=false) {
+               $host = @parse_url($url, PHP_URL_HOST);
+               if (strtolower(substr($host, 0, 4)) == 'www.') {
+                       $host = substr($host, 4);
+               }
+               $ua = $this->userAgentDefault;
+               if ($host) {
+                       $candidates = array($host);
+                       $labels = explode('.', $host);
+                       if (count($labels) > 1) {
+                               // drop the first label: sub.example.org -> .example.org
+                               $candidates[] = '.'.implode('.', array_slice($labels, 1));
+                       }
+                       foreach ($candidates as $candidate) {
+                               if (isset($this->userAgentMap[$candidate])) {
+                                       $ua = $this->userAgentMap[$candidate];
+                                       break;
+                               }
+                       }
+               }
+               return $asArray ? array('User-Agent' => $ua) : 'User-Agent: '.$ua;
+       }
+       
+       /**
+        * Rewrite a hashbang URL (containing "#!") so the fragment is carried in an
+        * _escaped_fragment_ query parameter instead.
+        *
+        * URLs without "#!" are returned unchanged.
+        *
+        * @param string $url
+        * @return string original URL, or the rewritten one from SimplePie_IRI::get_iri()
+        */
+       public function rewriteHashbangFragment($url) {
+               // nothing to do if there's no '#!'
+               if (strpos($url, '#!') === false) return $url;
+               // TODO: is SimplePie_IRI included?
+               $iri = new SimplePie_IRI($url);
+               $escaped = (string)substr($iri->fragment, 1); // strip '!'
+               $iri->fragment = null;
+               $params = array();
+               if (isset($iri->query)) {
+                       parse_str($iri->query, $params);
+               }
+               $params['_escaped_fragment_'] = $escaped;
+               // keep slashes unencoded - needed for some sites
+               $iri->query = str_replace('%2F', '/', http_build_query($params));
+               return $iri->get_iri();
+       }
+       
+       /**
+        * Look for a redirect target inside an HTML document: a meta refresh
+        * first, then the AJAX-crawling ("ugly URL") trigger.
+        *
+        * @param string $url  URL the HTML was fetched from
+        * @param string $html HTML to inspect
+        * @return mixed result of getMetaRefreshURL() if truthy, otherwise the result of getUglyURL()
+        */
+       public function getRedirectURLfromHTML($url, $html) {
+               $target = $this->getMetaRefreshURL($url, $html);
+               if ($target) {
+                       return $target;
+               }
+               return $this->getUglyURL($url, $html);
+       }
+       
+       /**
+        * Extract the target of a <meta http-equiv="refresh"> tag.
+        *
+        * Only tags with a single-digit delay and http-equiv placed before content
+        * are matched. Relative targets are resolved against $url.
+        *
+        * @param string $url  URL the HTML was fetched from
+        * @param string $html HTML to inspect
+        * @return mixed absolute redirect URL, or false if none was found or it could not be resolved
+        */
+       public function getMetaRefreshURL($url, $html) {
+               if ($html == '') return false;
+               // <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
+               $pattern = '!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i';
+               if (!preg_match($pattern, $html, $found)) {
+                       return false;
+               }
+               $target = $found[1];
+               if (!preg_match('!^https?://!i', $target)) {
+                       // relative target: absolutize against the page URL
+                       $base = new SimplePie_IRI($url);
+                       // remove '//' in URL path (causes URLs not to resolve properly)
+                       if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
+                       $target = SimplePie_IRI::absolutize($base, $target);
+                       if (!$target) {
+                               return false;
+                       }
+               }
+               $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$target);
+               return $target;
+       }
+       
+       /**
+        * Build the "ugly" (_escaped_fragment_) URL for a page which contains one
+        * of the $ajaxTriggers meta tags.
+        *
+        * @param string $url  URL the HTML was fetched from
+        * @param string $html HTML (or its leading part) to scan for a trigger
+        * @return mixed URL from SimplePie_IRI::get_iri() with an empty _escaped_fragment_
+        *               parameter added, or false if $html is empty or holds no trigger
+        */
+       public function getUglyURL($url, $html) {
+               if ($html == '') return false;
+               $found = false;
+               foreach ($this->ajaxTriggers as $string) {
+                       // compare against false: a trigger at offset 0 is still a match
+                       if (stripos($html, $string) !== false) {
+                               $found = true;
+                               break;
+                       }
+               }
+               if (!$found) return false;
+               $iri = new SimplePie_IRI($url);
+               if (isset($iri->query)) {
+                       parse_str($iri->query, $query);
+               } else {
+                       $query = array();
+               }
+               $query['_escaped_fragment_'] = '';
+               $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
+               $ugly_url = $iri->get_iri();
+               $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
+               return $ugly_url;
+       }
+       
+       /**
+        * Strip the fragment (everything from the first '#') from a URL.
+        *
+        * @param string $url
+        * @return string $url unchanged when it contains no '#'
+        */
+       public function removeFragment($url) {
+               $hashAt = strpos($url, '#');
+               return ($hashAt === false) ? $url : substr($url, 0, $hashAt);
+       }
+       
+       /**
+        * Apply the first matching rule from $this->rewriteUrls to a URL.
+        *
+        * A rule matches when its key occurs in $url and its value is an array;
+        * the array is then used as the replacement map for strtr().
+        *
+        * @param string $url
+        * @return string rewritten URL, or $url unchanged if no rule applied
+        */
+       public function rewriteUrls($url) {
+               foreach ($this->rewriteUrls as $needle => $replacements) {
+                       if (strpos($url, $needle) !== false && is_array($replacements)) {
+                               return strtr($url, $replacements);
+                       }
+               }
+               return $url;
+       }
+       
+       /**
+        * Switch debug output on or off (see debug(), which prints only when set).
+        *
+        * @param mixed $bool cast to bool before being stored
+        */
+       public function enableDebug($bool=true) {
+               $this->debug = (bool)$bool;
+       }
+       
+       /**
+        * Store the minimise-memory flag.
+        *
+        * NOTE(review): the property is marked TODO and the code that would read it
+        * appears to be commented out, so this presumably has no effect yet - confirm.
+        *
+        * @param bool $bool stored as given (no cast)
+        */
+       public function minimiseMemoryUse($bool = true) {
+               $this->minimiseMemoryUse = $bool;
+       }
+       
+       /**
+        * Set how many URLs fetchAllOnce() takes from the list per batch.
+        *
+        * @param int $max stored as given (no validation)
+        */
+       public function setMaxParallelRequests($max) {
+               $this->maxParallelRequests = $max;
+       }
+       
+       /**
+        * Sanitise a URL and check that it is a valid http(s) URL.
+        *
+        * FILTER_FLAG_SCHEME_REQUIRED is deliberately not passed: FILTER_VALIDATE_URL
+        * has implied it since PHP 5.2.1, and the constant no longer exists in PHP 8,
+        * where referencing it is a fatal error.
+        *
+        * @param string $url
+        * @return string|false the sanitised URL, or false if it is not a valid http(s) URL
+        */
+       public function validateUrl($url) {
+               $url = filter_var($url, FILTER_SANITIZE_URL);
+               $test = filter_var($url, FILTER_VALIDATE_URL);
+               // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
+               if ($test === false) {
+                       $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
+               }
+               if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
+                       return $url;
+               }
+               return false;
+       }
+       
+       /**
+        * Fetch all URLs, then keep processing the redirect queue until it is
+        * empty or $maxRedirects rounds have been made.
+        *
+        * @param array $urls URLs to fetch
+        */
+       public function fetchAll(array $urls) {
+               // first pass: the URLs as given
+               $this->fetchAllOnce($urls, false);
+               // further passes: whatever the previous pass queued as redirects
+               for ($redirects = 1; !empty($this->redirectQueue) && $redirects <= $this->maxRedirects; $redirects++) {
+                       $this->debug("Following redirects #$redirects...");
+                       $this->fetchAllOnce($this->redirectQueue, true);
+               }
+       }
+       
+       // fetch all URLs without following redirects
+       public function fetchAllOnce(array $urls, $isRedirect=false) {
+               if (!$isRedirect) $urls = array_unique($urls);
+               if (empty($urls)) return;
+               
+               //////////////////////////////////////////////////////
+               // parallel (HttpRequestPool)
+               if ($this->method == self::METHOD_REQUEST_POOL) {
+                       $this->debug('Starting parallel fetch (HttpRequestPool)');
+                       try {
+                               while (count($urls) > 0) {
+                                       $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+                                       $subset = array_splice($urls, 0, $this->maxParallelRequests);
+                                       $pool = new HttpRequestPool();
+                                       foreach ($subset as $orig => $url) {
+                                               if (!$isRedirect) $orig = $url;
+                                               unset($this->redirectQueue[$orig]);
+                                               $this->debug("...$url");
+                                               if (!$isRedirect && isset($this->requests[$url])) {
+                                                       $this->debug("......in memory");
+                                               /*
+                                               } elseif ($this->isCached($url)) {
+                                                       $this->debug("......is cached");
+                                                       if (!$this->minimiseMemoryUse) {
+                                                               $this->requests[$url] = $this->getCached($url);
+                                                       }
+                                               */
+                                               } else {
+                                                       $this->debug("......adding to pool");
+                                                       $req_url = $this->rewriteUrls($url);
+                                                       $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                                                       $req_url = $this->removeFragment($req_url);
+                                                       if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+                                                               $_meth = HttpRequest::METH_HEAD;
+                                                       } else {
+                                                               $_meth = HttpRequest::METH_GET;
+                                                               unset($this->requests[$orig]['wrongGuess']);
+                                                       }
+                                                       $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
+                                                       // send cookies, if we have any
+                                                       if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                                                               $this->debug("......sending cookies: $cookies");
+                                                               $httpRequest->addHeaders(array('Cookie' => $cookies));
+                                                       }
+                                                       //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
+                                                       $httpRequest->addHeaders($this->getUserAgent($req_url, true));
+                                                       // add referer for picky sites
+                                                       $httpRequest->addheaders(array('Referer' => $this->referer));
+                                                       $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+                                                       $this->requests[$orig]['original_url'] = $orig;
+                                                       $pool->attach($httpRequest);
+                                               }
+                                       }
+                                       // did we get anything into the pool?
+                                       if (count($pool) > 0) {
+                                               $this->debug('Sending request...');
+                                               try {
+                                                       $pool->send();
+                                               } catch (HttpRequestPoolException $e) {
+                                                       // do nothing
+                                               }
+                                               $this->debug('Received responses');
+                                               foreach($subset as $orig => $url) {
+                                                       if (!$isRedirect) $orig = $url;
+                                                       $request = $this->requests[$orig]['httpRequest'];
+                                                       //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
+                                                       // getResponseHeader() doesn't return status line, so, for consistency...
+                                                       $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
+                                                       // check content type
+                                                       // TODO: use getResponseHeader('content-type') or getResponseInfo()
+                                                       if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                                                               $this->requests[$orig]['body'] = '';
+                                                               $_header_only_type = true;
+                                                               $this->debug('Header only type returned');
+                                                       } else {
+                                                               $this->requests[$orig]['body'] = $request->getResponseBody();
+                                                               $_header_only_type = false;
+                                                       }
+                                                       $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
+                                                       $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
+                                                       // is redirect?
+                                                       if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
+                                                               $redirectURL = $request->getResponseHeader('location');
+                                                               if (!preg_match('!^https?://!i', $redirectURL)) {
+                                                                       $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                                                               }
+                                                               if ($this->validateURL($redirectURL)) {
+                                                                       $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                                                                       // store any cookies
+                                                                       $cookies = $request->getResponseHeader('set-cookie');
+                                                                       if ($cookies && !is_array($cookies)) $cookies = array($cookies);
+                                                                       if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
+                                                                       $this->redirectQueue[$orig] = $redirectURL;
+                                                               } else {
+                                                                       $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                                                               }
+                                                       } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
+                                                               // the response content-type did not match our 'header only' types, 
+                                                               // but we'd issues a HEAD request because we assumed it would. So
+                                                               // let's queue a proper GET request for this item...
+                                                               $this->debug('Wrong guess at content-type, queing GET request');
+                                                               $this->requests[$orig]['wrongGuess'] = true;
+                                                               $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+                                                       } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                                                               // check for <meta name='fragment' content='!'/>
+                                                               // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                                                               // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                                                               if (isset($this->requests[$orig]['body'])) {
+                                                                       $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                                                                       if ($redirectURL) {
+                                                                               $this->redirectQueue[$orig] = $redirectURL;
+                                                                       }
+                                                               }
+                                                       }
+                                                       //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+                                                       $pool->detach($request);
+                                                       unset($this->requests[$orig]['httpRequest'], $request);
+                                                       /*
+                                                       if ($this->minimiseMemoryUse) {
+                                                               if ($this->cache($url)) {
+                                                                       unset($this->requests[$url]);
+                                                               }
+                                                       }
+                                                       */
+                                               }
+                                       }
+                               }
+                       } catch (HttpException $e) {
+                               $this->debug($e);
+                               return false;
+                       }
+               }
+               
+               //////////////////////////////////////////////////////////
+               // parallel (curl_multi_*)
+               elseif ($this->method == self::METHOD_CURL_MULTI) {
+                       $this->debug('Starting parallel fetch (curl_multi_*)');
+                       while (count($urls) > 0) {
+                               $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+                               $subset = array_splice($urls, 0, $this->maxParallelRequests);
+                               $pool = new RollingCurl(array($this, 'handleCurlResponse'));
+                               $pool->window_size = count($subset);            
+                               
+                               foreach ($subset as $orig => $url) {
+                                       if (!$isRedirect) $orig = $url;
+                                       unset($this->redirectQueue[$orig]);
+                                       $this->debug("...$url");
+                                       if (!$isRedirect && isset($this->requests[$url])) {
+                                               $this->debug("......in memory");
+                                       /*
+                                       } elseif ($this->isCached($url)) {
+                                               $this->debug("......is cached");
+                                               if (!$this->minimiseMemoryUse) {
+                                                       $this->requests[$url] = $this->getCached($url);
+                                               }
+                                       */
+                                       } else {
+                                               $this->debug("......adding to pool");
+                                               $req_url = $this->rewriteUrls($url);
+                                               $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                                               $req_url = $this->removeFragment($req_url);
+                                               if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+                                                       $_meth = 'HEAD';
+                                               } else {
+                                                       $_meth = 'GET';
+                                                       unset($this->requests[$orig]['wrongGuess']);
+                                               }                                               
+                                               $headers = array();
+                                               //$headers[] = 'User-Agent: '.$this->userAgent;
+                                               $headers[] = $this->getUserAgent($req_url);
+                                               // add referer for picky sites
+                                               $headers[] = 'Referer: '.$this->referer;
+                                               // send cookies, if we have any
+                                               if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                                                       $this->debug("......sending cookies: $cookies");
+                                                       $headers[] = 'Cookie: '.$cookies;
+                                               }
+                                               $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
+                                                       CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
+                                                       CURLOPT_TIMEOUT => $this->requestOptions['timeout']
+                                                       ));
+                                               $httpRequest->set_original_url($orig);
+                                               $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+                                               $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
+                                               $pool->add($httpRequest);
+                                       }
+                               }
+                               // did we get anything into the pool?
+                               if (count($pool) > 0) {
+                                       $this->debug('Sending request...');
+                                       $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
+                                       $this->debug('Received responses');
+                                       foreach($subset as $orig => $url) {
+                                               if (!$isRedirect) $orig = $url;
+                                               // $this->requests[$orig]['headers']
+                                               // $this->requests[$orig]['body']
+                                               // $this->requests[$orig]['effective_url']
+                                               // check content type
+                                               if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                                                       $this->requests[$orig]['body'] = '';
+                                                       $_header_only_type = true;
+                                                       $this->debug('Header only type returned');
+                                               } else {
+                                                       $_header_only_type = false;
+                                               }
+                                               $status_code = $this->requests[$orig]['status_code'];
+                                               if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+                                                       $redirectURL = $this->requests[$orig]['location'];
+                                                       if (!preg_match('!^https?://!i', $redirectURL)) {
+                                                               $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                                                       }
+                                                       if ($this->validateURL($redirectURL)) {
+                                                               $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                                                               // store any cookies
+                                                               $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+                                                               if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);                                                   
+                                                               $this->redirectQueue[$orig] = $redirectURL;
+                                                       } else {
+                                                               $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                                                       }
+                                               } elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
+                                                       // the response content-type did not match our 'header only' types, 
+                                                       // but we'd issues a HEAD request because we assumed it would. So
+                                                       // let's queue a proper GET request for this item...
+                                                       $this->debug('Wrong guess at content-type, queing GET request');
+                                                       $this->requests[$orig]['wrongGuess'] = true;
+                                                       $this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+                                               } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                                                       // check for <meta name='fragment' content='!'/>
+                                                       // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                                                       // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                                                       if (isset($this->requests[$orig]['body'])) {
+                                                               $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                                                               if ($redirectURL) {
+                                                                       $this->redirectQueue[$orig] = $redirectURL;
+                                                               }
+                                                       }
+                                               }
+                                               // die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+                                               unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
+                                       }
+                               }
+                       }
+               }
+
+               //////////////////////////////////////////////////////
+               // sequential (file_get_contents)
+               else {
+                       $this->debug('Starting sequential fetch (file_get_contents)');
+                       $this->debug('Processing set of '.count($urls));
+                       foreach ($urls as $orig => $url) {
+                               if (!$isRedirect) $orig = $url;
+                               unset($this->redirectQueue[$orig]);
+                               $this->debug("...$url");
+                               if (!$isRedirect && isset($this->requests[$url])) {
+                                       $this->debug("......in memory");
+                               /*
+                               } elseif ($this->isCached($url)) {
+                                       $this->debug("......is cached");
+                                       if (!$this->minimiseMemoryUse) {
+                                               $this->requests[$url] = $this->getCached($url);
+                                       }
+                               */
+                               } else {
+                                       $this->debug("Sending request for $url");
+                                       $this->requests[$orig]['original_url'] = $orig;
+                                       $req_url = $this->rewriteUrls($url);
+                                       $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+                                       $req_url = $this->removeFragment($req_url);
+                                       // send cookies, if we have any
+                                       $httpContext = $this->httpContext;
+                                       $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
+                                       // add referer for picky sites
+                                       $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
+                                       if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+                                               $this->debug("......sending cookies: $cookies");
+                                               $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
+                                       }
+                                       if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
+                                               $this->debug('Received response');
+                                               // get status code
+                                               if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
+                                                       $this->debug('Error: no status code found');
+                                                       // TODO: handle error - no status code
+                                               } else {
+                                                       $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
+                                                       // check content type
+                                                       if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+                                                               $this->requests[$orig]['body'] = '';
+                                                       } else {
+                                                               $this->requests[$orig]['body'] = $html;
+                                                       }
+                                                       $this->requests[$orig]['effective_url'] = $req_url;
+                                                       $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
+                                                       unset($match);
+                                                       // handle redirect
+                                                       if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
+                                                               $this->requests[$orig]['location'] =  trim($match[1]);
+                                                       }
+                                                       if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+                                                               $redirectURL = $this->requests[$orig]['location'];
+                                                               if (!preg_match('!^https?://!i', $redirectURL)) {
+                                                                       $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+                                                               }
+                                                               if ($this->validateURL($redirectURL)) {
+                                                                       $this->debug('Redirect detected. Valid URL: '.$redirectURL);
+                                                                       // store any cookies
+                                                                       $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+                                                                       if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+                                                                       $this->redirectQueue[$orig] = $redirectURL;
+                                                               } else {
+                                                                       $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+                                                               }
+                                                       } elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+                                                               // check for <meta name='fragment' content='!'/>
+                                                               // for AJAX sites, e.g. Blogger with its dynamic views templates.
+                                                               // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+                                                               if (isset($this->requests[$orig]['body'])) {
+                                                                       $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+                                                                       if ($redirectURL) {
+                                                                               $this->redirectQueue[$orig] = $redirectURL;
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       } else {
+                                               $this->debug('Error retrieving URL');
+                                               //print_r($req_url);
+                                               //print_r($http_response_header);
+                                               //print_r($html);
+                                               
+                                               // TODO: handle error - failed to retrieve URL
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * RollingCurl completion callback: stores the outcome of one finished
+        * request in $this->requests, keyed by the request's original URL.
+        *
+        * @param string $response raw response, header block immediately followed by the body
+        * @param array $info transfer details; 'header_size', 'url' and 'http_code' are read
+        * @param object $request the finished request; its url_original and method properties are read
+        * @return void
+        */
+       public function handleCurlResponse($response, $info, $request) {
+               $key = $request->url_original;
+               $headerSize = $info['header_size'];
+               // the first $headerSize bytes are the headers, everything after is the body
+               $this->requests[$key]['headers'] = substr($response, 0, $headerSize);
+               $this->requests[$key]['body'] = substr($response, $headerSize);
+               $this->requests[$key]['method'] = $request->method;
+               $this->requests[$key]['effective_url'] = $info['url'];
+               $this->requests[$key]['status_code'] = (int)$info['http_code'];
+               // keep the first Location header (if any) alongside the other response details
+               $found = preg_match('/^Location:(.*?)$/mi', $this->requests[$key]['headers'], $location);
+               if ($found) {
+                       $this->requests[$key]['location'] =  trim($location[1]);
+               }
+       }
+       
+       /**
+        * Flatten response headers into a single newline-separated string.
+        *
+        * @param array $headers either a list of header lines, or a name => value map
+        *                       in which a value may itself be an array of values
+        * @param bool $associative true to treat $headers as a name => value map,
+        *                          false to join its elements as they are
+        * @return string
+        */
+       protected function headersToString(array $headers, $associative=true) {
+               // list form: the elements are joined untouched
+               if (!$associative) return implode("\n", $headers);
+               $lines = array();
+               foreach ($headers as $name => $values) {
+                       // a header holding several values produces one line per value
+                       foreach ((is_array($values) ? $values : array($values)) as $value) {
+                               $lines[] = "$name: $value";
+                       }
+               }
+               return rtrim(implode("\n", $lines));
+       }
+       
+       /**
+        * Return the response for a URL, fetching it first when it is not already in memory.
+        *
+        * @param string $url URL to retrieve (cast to string before use)
+        * @param bool $remove if true, drop the in-memory copy once the response has been read
+        * @param bool $gzdecode if true, decode the body when the headers announce gzip content encoding
+        * @return array|false the entry from $this->requests (keys such as 'headers', 'body',
+        *                     'effective_url', 'status_code'), or false if the request failed
+        */
+       public function get($url, $remove=false, $gzdecode=true) {
+               $url = "$url";
+               if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+                       $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
+                       $response = $this->requests[$url];
+               /*
+               } elseif ($this->isCached($url)) {
+                       $this->debug("URL already fetched - in disk cache ($url)");
+                       $response = $this->getCached($url);
+                       $this->requests[$url] = $response;
+               */
+               } else {
+                       $this->debug("Fetching URL ($url)");
+                       $this->fetchAll(array($url));
+                       if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+                               $response = $this->requests[$url];
+                       } else {
+                               $this->debug("Request failed");
+                               $response = false;
+                       }
+               }
+               /*
+               if ($this->minimiseMemoryUse && $response) {
+                       $this->cache($url);
+                       unset($this->requests[$url]);
+               }
+               */
+               if ($remove && $response) unset($this->requests[$url]);
+               // A failed request leaves $response === false, so only inspect headers/body
+               // when we really hold a response. The stripos() result is compared against
+               // false because a match at offset 0 is a valid (but falsy) position.
+               if ($gzdecode && is_array($response) && isset($response['headers'], $response['body'])
+                       && $response['body'] !== ''
+                       && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
+                       $html = gzdecode($response['body']);
+                       // keep the original body if decoding failed (false/null); accept any
+                       // decoded string, including falsy ones such as '' or '0'
+                       if (is_string($html)) {
+                               $response['body'] = $html;
+                       }
+               }
+               return $response;
+       }
+       
+       /**
+        * Report whether a parallel fetching backend is available.
+        *
+        * @return bool true if the HttpRequestPool class or the curl_multi_init() function exists
+        */
+       public function parallelSupport() {
+               if (class_exists('HttpRequestPool')) return true;
+               return function_exists('curl_multi_init');
+       }
+       
+       /**
+        * Check whether the Content-Type found in a header string is listed in
+        * $this->headerOnlyTypes.
+        *
+        * @param string $headers response headers as one string
+        * @return bool true if the full mime type (e.g. image/jpeg) or just its
+        *              top-level type (e.g. image) is in the list
+        */
+       private function headerOnlyType($headers) {
+               if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $parts)) {
+                       return false;
+               }
+               $fullType = strtolower(trim($parts[1]));
+               $mainType = strtolower(trim($parts[2]));
+               // the full mime type is tried first, then the bare top-level type
+               return in_array($fullType, $this->headerOnlyTypes) || in_array($mainType, $this->headerOnlyTypes);
+       }
+       
+       /**
+        * Guess from the file extension in a URL's path whether the URL belongs to
+        * one of the extensions listed in $this->headerOnlyClues.
+        *
+        * @param string $url URL to inspect
+        * @return bool true if the path has an extension and it is in the list
+        */
+       private function possibleUnsupportedType($url) {
+               $path = @parse_url($url, PHP_URL_PATH);
+               // no usable path, or no dot in it: nothing to guess from
+               if (!$path || strpos($path, '.') === false) {
+                       return false;
+               }
+               $extension = pathinfo($path, PATHINFO_EXTENSION);
+               return in_array(strtolower(trim($extension)), $this->headerOnlyClues);
+       }
+}
+
+// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
+if (!function_exists('gzdecode')) {
+       /**
+        * Fallback gzdecode() for PHP builds that do not provide one: parses a
+        * GZIP (RFC 1952) member and inflates its payload.
+        *
+        * @param string $data complete gzip-encoded data
+        * @param string $filename receives the original file name stored in the header ('' if none)
+        * @param string $error receives a description of the failure, where one is available
+        * @param int|null $maxlength maximum decoded length handed to gzinflate() (null for no limit)
+        * @return string|false|null decoded data; null if $data is not gzip, uses reserved
+        *                           flag bits or has no payload; false for any other failure
+        */
+       function gzdecode($data,&$filename='',&$error='',$maxlength=null)
+       {
+               $len = strlen($data);
+               if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
+                       $error = "Not in GZIP format.";
+                       return null;  // Not GZIP format (See RFC 1952)
+               }
+               $method = ord(substr($data,2,1));  // Compression method
+               $flags  = ord(substr($data,3,1));  // Flags
+               // Parentheses are required: != binds tighter than &, so the unparenthesised
+               // form tested bit 0 (FTEXT) instead of the three reserved bits.
+               if (($flags & 31) != $flags) {
+                       $error = "Reserved bits not allowed.";
+                       return null;
+               }
+               // Bytes 4-9 (MTIME, XFL, OS) are not needed for decoding and are skipped.
+               $headerlen = 10;
+               if ($flags & 4) {
+                       // 2-byte length prefixed EXTRA data, located right after the fixed 10-byte header
+                       if ($len - $headerlen - 2 < 8) {
+                               return false;  // invalid
+                       }
+                       $extralen = unpack("v",substr($data,$headerlen,2));
+                       $extralen = $extralen[1];
+                       if ($len - $headerlen - 2 - $extralen < 8) {
+                               return false;  // invalid
+                       }
+                       $headerlen += 2 + $extralen;
+               }
+               $filename = "";
+               if ($flags & 8) {
+                       // C-style string: original file name
+                       if ($len - $headerlen - 1 < 8) {
+                               return false; // invalid
+                       }
+                       $filenamelen = strpos(substr($data,$headerlen),chr(0));
+                       if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
+                               return false; // invalid
+                       }
+                       $filename = substr($data,$headerlen,$filenamelen);
+                       $headerlen += $filenamelen + 1;
+               }
+               if ($flags & 16) {
+                       // C-style string COMMENT data in header (skipped, only its length matters)
+                       if ($len - $headerlen - 1 < 8) {
+                               return false;    // invalid
+                       }
+                       $commentlen = strpos(substr($data,$headerlen),chr(0));
+                       if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
+                               return false;    // Invalid header format
+                       }
+                       $headerlen += $commentlen + 1;
+               }
+               if ($flags & 2) {
+                       // 2-bytes (lowest order) of CRC32 on header present
+                       if ($len - $headerlen - 2 < 8) {
+                               return false;    // invalid
+                       }
+                       $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
+                       $headercrc = unpack("v", substr($data,$headerlen,2));
+                       $headercrc = $headercrc[1];
+                       if ($headercrc != $calccrc) {
+                               $error = "Header checksum failed.";
+                               return false;    // Bad header CRC
+                       }
+                       $headerlen += 2;
+               }
+               // GZIP FOOTER: CRC32 of the decoded data followed by its length (ISIZE)
+               $datacrc = unpack("V",substr($data,-8,4));
+               $datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
+               $isize = unpack("V",substr($data,-4));
+               $isize = $isize[1];
+               // decompression:
+               $bodylen = $len-$headerlen-8;
+               if ($bodylen < 1) {
+                       // IMPLEMENTATION BUG!
+                       return null;
+               }
+               if ($method != 8) {
+                       // deflate (8) is currently the only supported compression method
+                       $error = "Unknown compression method.";
+                       return false;
+               }
+               // gzinflate() expects an integer length; 0 means "no limit"
+               $data = gzinflate(substr($data,$headerlen,$bodylen),(int)$maxlength);
+               if ($data === false) {
+                       $error = "Decompression failed.";
+                       return false;
+               }
+               // Verify CRC32 and length against the footer
+               $crc   = sprintf("%u",crc32($data));
+               $crcOK = $crc == $datacrc;
+               $lenOK = $isize == strlen($data);
+               if (!$lenOK || !$crcOK) {
+                       $error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
+                       return false;
+               }
+               return $data;
+       }
+}
\ No newline at end of file
index ecd46d5f8af37f38f56bdd23d7b23a0a945fd090..c524a1ee11edef33a5424b914cd3423638ce41db 100644 (file)
@@ -1,79 +1,78 @@
-<?php\r
-/**\r
- * Humble HTTP Agent extension for SimplePie_File\r
- * \r
- * This class is designed to extend and override SimplePie_File\r
- * in order to prevent duplicate HTTP requests being sent out.\r
- * The idea is to initialise an instance of Humble HTTP Agent\r
- * and attach it, to a static class variable, of this class.\r
- * SimplePie will then automatically initialise this class\r
- * \r
- * @date 2011-02-28\r
- */\r
-\r
-class SimplePie_HumbleHttpAgent extends SimplePie_File\r
-{\r
-       protected static $agent;\r
-       var $url;\r
-       var $useragent;\r
-       var $success = true;\r
-       var $headers = array();\r
-       var $body;\r
-       var $status_code;\r
-       var $redirects = 0;\r
-       var $error;\r
-       var $method = SIMPLEPIE_FILE_SOURCE_NONE;\r
-\r
-       public static function set_agent(HumbleHttpAgent $agent) {\r
-               self::$agent = $agent;\r
-       }\r
-       \r
-       public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {\r
-               if (class_exists('idna_convert'))\r
-               {\r
-                       $idn = new idna_convert();\r
-                       $parsed = SimplePie_Misc::parse_url($url);\r
-                       $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);\r
-               }\r
-               $this->url = $url;\r
-               $this->useragent = $useragent;\r
-               if (preg_match('/^http(s)?:\/\//i', $url))\r
-               {\r
-                       if (!is_array($headers))\r
-                       {\r
-                               $headers = array();\r
-                       }\r
-                       $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;\r
-                       $headers2 = array();\r
-                       foreach ($headers as $key => $value) {\r
-                               $headers2[] = "$key: $value";\r
-                       }\r
-                       //TODO: allow for HTTP headers\r
-                       // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);\r
-\r
-                       $response = self::$agent->get($url);\r
-                       \r
-                       if ($response === false || !isset($response['status_code'])) {\r
-                               $this->error = 'failed to fetch URL';\r
-                               $this->success = false;\r
-                       } else {\r
-                               // The extra lines at the end are there to satisfy SimplePie's HTTP parser.\r
-                               // The class expects a full HTTP message, whereas we're giving it only\r
-                               // headers - the new lines indicate the start of the body.\r
-                               $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");\r
-                               if ($parser->parse()) {\r
-                                       $this->headers = $parser->headers;\r
-                                       //$this->body = $parser->body;\r
-                                       $this->body = $response['body'];\r
-                                       $this->status_code = $parser->status_code;\r
-                               }\r
-                       }\r
-               }\r
-               else\r
-               {\r
-                       $this->error = 'invalid URL';\r
-                       $this->success = false;\r
-               }\r
-       }\r
-}\r
-?>
\ No newline at end of file
+<?php
+/**
+ * Humble HTTP Agent extension for SimplePie_File
+ * 
+ * This class is designed to extend and override SimplePie_File
+ * in order to prevent duplicate HTTP requests being sent out.
+ * The idea is to initialise an instance of Humble HTTP Agent
+ * and attach it, to a static class variable, of this class.
+ * SimplePie will then automatically initialise this class
+ * 
+ * @date 2011-02-28
+ */
+
+class SimplePie_HumbleHttpAgent extends SimplePie_File
+{
+       protected static $agent;
+       var $url;
+       var $useragent;
+       var $success = true;
+       var $headers = array();
+       var $body;
+       var $status_code;
+       var $redirects = 0;
+       var $error;
+       var $method = SIMPLEPIE_FILE_SOURCE_NONE;
+
+       public static function set_agent(HumbleHttpAgent $agent) {
+               self::$agent = $agent;
+       }
+       
+       public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
+               if (class_exists('idna_convert'))
+               {
+                       $idn = new idna_convert();
+                       $parsed = SimplePie_Misc::parse_url($url);
+                       $url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
+               }
+               $this->url = $url;
+               $this->useragent = $useragent;
+               if (preg_match('/^http(s)?:\/\//i', $url))
+               {
+                       if (!is_array($headers))
+                       {
+                               $headers = array();
+                       }
+                       $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
+                       $headers2 = array();
+                       foreach ($headers as $key => $value) {
+                               $headers2[] = "$key: $value";
+                       }
+                       //TODO: allow for HTTP headers
+                       // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
+
+                       $response = self::$agent->get($url);
+                       
+                       if ($response === false || !isset($response['status_code'])) {
+                               $this->error = 'failed to fetch URL';
+                               $this->success = false;
+                       } else {
+                               // The extra lines at the end are there to satisfy SimplePie's HTTP parser.
+                               // The class expects a full HTTP message, whereas we're giving it only
+                               // headers - the new lines indicate the start of the body.
+                               $parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
+                               if ($parser->parse()) {
+                                       $this->headers = $parser->headers;
+                                       //$this->body = $parser->body;
+                                       $this->body = $response['body'];
+                                       $this->status_code = $parser->status_code;
+                               }
+                       }
+               }
+               else
+               {
+                       $this->error = 'invalid URL';
+                       $this->success = false;
+               }
+       }
+}
\ No newline at end of file
index 09b115467121eb3e2ad79d022772a1290118b92e..382d869c4432fdff55b2315796b2fb636f95726c 100644 (file)
@@ -6,23 +6,24 @@
  * Attempts to detect the language of a sample of text by correlating ranked
  * 3-gram frequencies to a table of 3-gram frequencies of known languages.
  *
- * Implements a version of a technique originally proposed by Cavnar & Trenkle 
- * (1994): "N-Gram-Based Text Categorization" 
+ * Implements a version of a technique originally proposed by Cavnar & Trenkle
+ * (1994): "N-Gram-Based Text Categorization"
  *
- * PHP versions 4 and 5
+ * PHP version 5
  *
- * @category   Text
- * @package    Text_LanguageDetect
- * @author     Nicholas Pisarro <infinityminusnine+pear@gmail.com>
- * @copyright  2005-2006 Nicholas Pisarro
- * @license    http://www.debian.org/misc/bsd.license BSD
- * @version    CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $
- * @link       http://pear.php.net/package/Text_LanguageDetect/
- * @link       http://langdetect.blogspot.com/
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2005-2006 Nicholas Pisarro
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ * @link      http://langdetect.blogspot.com/
  */
 
-//require_once 'PEAR.php';
-require_once 'Parser.php';
+require_once 'LanguageDetect/Exception.php';
+require_once 'LanguageDetect/Parser.php';
+require_once 'LanguageDetect/ISO639.php';
 
 /**
  * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
  *
  * echo "Supported languages:\n";
  *
- * $langs = $l->getLanguages();
- * if (PEAR::isError($langs)) {
- *     die($langs->getMessage());
+ * try {
+ *     $langs = $l->getLanguages();
+ * } catch (Text_LanguageDetect_Exception $e) {
+ *     die($e->getMessage());
  * }
  *
  * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
  * }
  * </code>
  *
- * @category   Text
- * @package    Text_LanguageDetect
- * @author     Nicholas Pisarro <infinityminusnine+pear@gmail.com>
- * @copyright  2005 Nicholas Pisarro
- * @license    http://www.debian.org/misc/bsd.license BSD
- * @version    Release: @package_version@
- * @todo       allow users to generate their own language models
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2005 Nicholas Pisarro
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   Release: @package_version@
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ * @todo      allow users to generate their own language models
  */
 class Text_LanguageDetect
 {
-    /** 
+    /**
      * The filename that stores the trigram data for the detector
      *
-     * If this value starts with a slash (/) or a dot (.) the value of 
+     * If this value starts with a slash (/) or a dot (.) the value of
      * $this->_data_dir will be ignored
-     * 
+     *
      * @var      string
      * @access   private
      */
-    var $_db_filename = './lang.dat';
+    var $_db_filename = 'lang.dat';
 
     /**
      * The filename that stores the unicode block definitions
      *
-     * If this value starts with a slash (/) or a dot (.) the value of 
+     * If this value starts with a slash (/) or a dot (.) the value of
      * $this->_data_dir will be ignored
-     * 
+     *
      * @var string
      * @access private
      */
-    var $_unicode_db_filename = './unicode_blocks.dat';
+    var $_unicode_db_filename = 'unicode_blocks.dat';
 
     /**
      * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
 
     /**
      * The trigram data for comparison
-     * 
-     * Will be loaded on start from $this->_db_filename
      *
-     * May be set to a PEAR_Error object if there is an error during its 
-     * initialization
+     * Will be loaded on start from $this->_db_filename
      *
      * @var      array
      * @access   private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
 
     /**
      * The size of the trigram data arrays
-     * 
+     *
      * @var      int
      * @access   private
      */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
 
     /**
      * Whether or not to simulate perl's Language::Guess exactly
-     * 
+     *
      * @access  private
      * @var     bool
      * @see     setPerlCompatible()
@@ -164,19 +163,25 @@ class Text_LanguageDetect
      */
     var $_clusters;
 
+    /**
+     * Which type of "language names" are accepted and returned:
+     *
+     * 0 - language name ("english")
+     * 2 - 2-letter ISO 639-1 code ("en")
+     * 3 - 3-letter ISO 639-2 code ("eng")
+     */
+    var $_name_mode = 0;
+
     /**
      * Constructor
      *
      * Will attempt to load the language database. If it fails, you will get
-     * a PEAR_Error object returned when you try to use detect()
-     *
+     * an exception.
      */
-    function Text_LanguageDetect($db=null, $unicode_db=null)
+    function __construct()
     {
-               if (isset($db)) $this->_db_filename = $db;
-               if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
-               
         $data = $this->_readdb($this->_db_filename);
+        $this->_checkTrigram($data['trigram']);
         $this->_lang_db = $data['trigram'];
 
         if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
         // Not yet implemented:
         if (isset($data['trigram-clusters'])) {
             $this->_clusters = $data['trigram-clusters'];
-        }              
+        }
     }
 
     /**
      * Returns the path to the location of the database
      *
-     * @access    private
-     * @return    string    expected path to the language model database
+     * @param string $fname File name to load
+     *
+     * @return string expected path to the language model database
+     * @access private
      */
     function _get_data_loc($fname)
     {
-        return $fname;
+        return dirname(__FILE__).'/'.$fname;
     }
 
     /**
      * Loads the language trigram database from filename
      *
      * Trigram datbase should be a serialize()'d array
-     * 
-     * @access    private
-     * @param     string      $fname   the filename where the data is stored
-     * @return    array                the language model data
-     * @throws    PEAR_Error
+     *
+     * @param string $fname the filename where the data is stored
+     *
+     * @return array the language model data
+     * @throws Text_LanguageDetect_Exception
+     * @access private
      */
     function _readdb($fname)
     {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
 
         // input check
         if (!file_exists($fname)) {
-            throw new Exception('Language database does not exist.');
+            throw new Text_LanguageDetect_Exception(
+                'Language database does not exist: ' . $fname,
+                Text_LanguageDetect_Exception::DB_NOT_FOUND
+            );
         } elseif (!is_readable($fname)) {
-            throw new Exception('Language database is not readable.');
+            throw new Text_LanguageDetect_Exception(
+                'Language database is not readable: ' . $fname,
+                Text_LanguageDetect_Exception::DB_NOT_READABLE
+            );
         }
 
-        if (function_exists('file_get_contents')) {
-            return unserialize(file_get_contents($fname));
-        } else {
-            // if you don't have file_get_contents(), 
-            // then this is the next fastest way
-            ob_start();
-            readfile($fname);
-            $contents = ob_get_contents();
-            ob_end_clean();
-            return unserialize($contents);
-        }
+        return unserialize(file_get_contents($fname));
     }
 
 
     /**
      * Checks if this object is ready to detect languages
-     * 
-     * @access   private
-     * @param    mixed   &$err  error object to be returned by reference, if any
-     * @return   bool           true if no errors
+     *
+     * @param array $trigram Trigram data from database
+     *
+     * @return void
+     * @access private
      */
-    function _setup_ok(&$err)
+    function _checkTrigram($trigram)
     {
-        if (!is_array($this->_lang_db)) {
+        if (!is_array($trigram)) {
             if (ini_get('magic_quotes_runtime')) {
-                throw new Exception('Error loading database. Try turning magic_quotes_runtime off.');
-            } else {
-                throw new Exception('Language database is not an array.');
+                throw new Text_LanguageDetect_Exception(
+                    'Error loading database. Try turning magic_quotes_runtime off.',
+                    Text_LanguageDetect_Exception::MAGIC_QUOTES
+                );
             }
-            return false;
-
-        } elseif (empty($this->_lang_db)) {
-            throw new Exception('Language database has no elements.');
-            return false;
-
-        } else {
-            return true;
+            throw new Text_LanguageDetect_Exception(
+                'Language database is not an array.',
+                Text_LanguageDetect_Exception::DB_NOT_ARRAY
+            );
+        } elseif (empty($trigram)) {
+            throw new Text_LanguageDetect_Exception(
+                'Language database has no elements.',
+                Text_LanguageDetect_Exception::DB_EMPTY
+            );
         }
     }
 
     /**
      * Omits languages
      *
-     * Pass this function the name of or an array of names of 
+     * Pass this function the name of or an array of names of
      * languages that you don't want considered
      *
-     * If you're only expecting a limited set of languages, this can greatly 
+     * If you're only expecting a limited set of languages, this can greatly
      * speed up processing
      *
-     * @access   public
-     * @param    mixed  $omit_list      language name or array of names to omit
-     * @param    bool   $include_only   if true will include (rather than 
-     *                                  exclude) only those in the list
-     * @return   int                    number of languages successfully deleted
-     * @throws   PEAR_Error
+     * @param mixed $omit_list    language name or array of names to omit
+     * @param bool  $include_only if true will include (rather than
+     *                            exclude) only those in the list
+     *
+     * @return int number of languages successfully deleted
+     * @throws Text_LanguageDetect_Exception
      */
-    function omitLanguages($omit_list, $include_only = false)
+    public function omitLanguages($omit_list, $include_only = false)
     {
-
-        // setup check
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         $deleted = 0;
 
-        // deleting the given languages
+        $omit_list = $this->_convertFromNameMode($omit_list);
+
         if (!$include_only) {
+            // deleting the given languages
             if (!is_array($omit_list)) {
                 $omit_list = strtolower($omit_list); // case desensitize
                 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
                     if (isset($this->_lang_db[$omit_lang])) {
                         unset($this->_lang_db[$omit_lang]);
                         $deleted++;
-                    } 
+                    }
                 }
             }
 
-        // deleting all except the given languages
         } else {
+            // deleting all except the given languages
             if (!is_array($omit_list)) {
                 $omit_list = array($omit_list);
             }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
         // reset the cluster cache if the number of languages changes
         // this will then have to be recalculated
         if (isset($this->_clusters) && $deleted > 0) {
-            unset($this->_clusters);
+            $this->_clusters = null;
         }
 
         return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
      *
      * @access public
      * @return int            the number of languages
-     * @throws PEAR_Error
+     * @throws   Text_LanguageDetect_Exception
      */
     function getLanguageCount()
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            return count($this->_lang_db);
-        }
+        return count($this->_lang_db);
     }
 
     /**
-     * Returns true if a given language exists
+     * Checks if the language with the given name exists in the database
      *
-     * If passed an array of names, will return true only if all exist
+     * @param mixed $lang Language name or array of language names
      *
-     * @access    public
-     * @param     mixed       $lang    language name or array of language names
-     * @return    bool                 true if language model exists
-     * @throws    PEAR_Error
+     * @return bool true if language model exists
      */
-    function languageExists($lang)
+    public function languageExists($lang)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            // string
-            if (is_string($lang)) {
-                return isset($this->_lang_db[strtolower($lang)]);
-
-            // array
-            } elseif (is_array($lang)) {
-                foreach ($lang as $test_lang) {
-                    if (!isset($this->_lang_db[strtolower($test_lang)])) {
-                        return false;
-                    } 
-                }
-                return true;
+        $lang = $this->_convertFromNameMode($lang);
 
-            // other (error)
-            } else {
-                throw new Exception('Unknown type passed to languageExists()');
+        if (is_string($lang)) {
+            return isset($this->_lang_db[strtolower($lang)]);
+
+        } elseif (is_array($lang)) {
+            foreach ($lang as $test_lang) {
+                if (!isset($this->_lang_db[strtolower($test_lang)])) {
+                    return false;
+                }
             }
+            return true;
+
+        } else {
+            throw new Text_LanguageDetect_Exception(
+                'Unsupported parameter type passed to languageExists()',
+                Text_LanguageDetect_Exception::PARAM_TYPE
+            );
         }
     }
 
@@ -389,25 +383,24 @@ class Text_LanguageDetect
      * Returns the list of detectable languages
      *
      * @access public
-     * @return array        the names of the languages known to this object
-     * @throws PEAR_Error
+     * @return array        the names of the languages known to this object
+     * @throws   Text_LanguageDetect_Exception
      */
     function getLanguages()
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            return array_keys($this->_lang_db);
-        }
+        return $this->_convertToNameMode(
+            array_keys($this->_lang_db)
+        );
     }
 
     /**
      * Make this object behave like Language::Guess
-     * 
-     * @access    public
-     * @param     bool     $setting     false to turn off perl compatibility
+     *
+     * @param bool $setting false to turn off perl compatibility
+     *
+     * @return void
      */
-    function setPerlCompatible($setting = true)
+    public function setPerlCompatible($setting = true)
     {
         if (is_bool($setting)) { // input check
             $this->_perl_compatible = $setting;
@@ -421,6 +414,21 @@ class Text_LanguageDetect
 
     }
 
+    /**
+     * Sets how language names are accepted and returned.
+     *
+     * @param integer $name_mode One of the following modes:
+     *                           0 - language name ("english")
+     *                           2 - 2-letter ISO 639-1 code ("en")
+     *                           3 - 3-letter ISO 639-2 code ("eng")
+     *
+     * @return void
+     */
+    function setNameMode($name_mode)
+    {
+        $this->_name_mode = $name_mode;
+    }
+
     /**
      * Whether to use unicode block ranges in detection
      *
@@ -429,10 +437,11 @@ class Text_LanguageDetect
      * in languages that use latin scripts. In other cases it should speed up
      * detection noticeably.
      *
-     * @access  public
-     * @param   bool    $setting    false to turn off
+     * @param bool $setting false to turn off
+     *
+     * @return void
      */
-    function useUnicodeBlocks($setting = true)
+    public function useUnicodeBlocks($setting = true)
     {
         if (is_bool($setting)) {
             $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
     /**
      * Converts a piece of text into trigrams
      *
-     * Superceded by the Text_LanguageDetect_Parser class 
+     * @param string $text text to convert
      *
-     * @access    private
-     * @param     string    $text    text to convert
-     * @return    array              array of trigram frequencies
+     * @return     array array of trigram frequencies
+     * @access     private
+     * @deprecated Superseded by the Text_LanguageDetect_Parser class
      */
     function _trigram($text)
     {
-        $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename);
+        $s = new Text_LanguageDetect_Parser($text);
         $s->prepareTrigram();
         $s->prepareUnicode(false);
         $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
      *
      * Thresholds (cuts off) the list at $this->_threshold
      *
-     * @access    protected
-     * @param     array     $arr     array of trgram 
-     * @return    array              ranks of trigrams
+     * @param array $arr array of trigram
+     *
+     * @return array ranks of trigrams
+     * @access protected
      */
-    function _arr_rank(&$arr)
+    function _arr_rank($arr)
     {
 
         // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
 
     /**
      * Sorts an array by value breaking ties alphabetically
-     * 
-     * @access   private
-     * @param    array     &$arr     the array to sort
+     *
+     * @param array &$arr the array to sort
+     *
+     * @return void
+     * @access private
      */
     function _bub_sort(&$arr)
     {
         // should do the same as this perl statement:
-        // sort { $trigrams{$b} == $trigrams{$a} ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
+        // sort { $trigrams{$b} == $trigrams{$a}
+        //   ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 
         // needs to sort by both key and value at once
         // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
     /**
      * Sort function used by bubble sort
      *
-     * Callback function for usort(). 
+     * Callback function for usort().
      *
-     * @access   private
-     * @param    array        first param passed by usort()
-     * @param    array        second param passed by usort()
-     * @return   int          1 if $a is greater, -1 if not
-     * @see      _bub_sort()
+     * @param array $a first param passed by usort()
+     * @param array $b second param passed by usort()
+     *
+     * @return int 1 if $a is greater, -1 if not
+     * @see    _bub_sort()
+     * @access private
      */
     function _sort_func($a, $b)
     {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
         list($a_key, $a_value) = $a;
         list($b_key, $b_value) = $b;
 
-        // if the values are the same, break ties using the key
         if ($a_value == $b_value) {
+            // if the values are the same, break ties using the key
             return strcmp($a_key, $b_key);
 
-        // if not, just sort normally
         } else {
+            // if not, just sort normally
             if ($a_value > $b_value) {
                 return -1;
             } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
     }
 
     /**
-     * Calculates a linear rank-order distance statistic between two sets of 
+     * Calculates a linear rank-order distance statistic between two sets of
      * ranked trigrams
      *
-     * Sums the differences in rank for each trigram. If the trigram does not 
+     * Sums the differences in rank for each trigram. If the trigram does not
      * appear in both, consider it a difference of $this->_threshold.
      *
      * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
      * its simplicity it has been shown to be highly accurate for language
      * identification tasks.
      *
-     * @access  private
-     * @param   array    $arr1  the reference set of trigram ranks
-     * @param   array    $arr2  the target set of trigram ranks
-     * @return  int             the sum of the differences between the ranks of
-     *                          the two trigram sets
+     * @param array $arr1 the reference set of trigram ranks
+     * @param array $arr2 the target set of trigram ranks
+     *
+     * @return int the sum of the differences between the ranks of
+     *             the two trigram sets
+     * @access private
      */
-    function _distance(&$arr1, &$arr2)
+    function _distance($arr1, $arr2)
     {
         $sumdist = 0;
 
@@ -598,14 +613,15 @@ class Text_LanguageDetect
 
     /**
      * Normalizes the score returned by _distance()
-     * 
+     *
      * Different if perl compatible or not
      *
-     * @access    private
-     * @param     int    $score          the score from _distance()
-     * @param     int    $base_count     the number of trigrams being considered
-     * @return    float                  the normalized score
-     * @see       _distance()
+     * @param int $score      the score from _distance()
+     * @param int $base_count the number of trigrams being considered
+     *
+     * @return float the normalized score
+     * @see    _distance()
+     * @access private
      */
     function _normalize_score($score, $base_count = null)
     {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
      *
      * If perl compatible, the score is 300-0, 0 being most similar.
      * Otherwise, it's 0-1 with 1 being most similar.
-     * 
+     *
      * The $sample text should be at least a few sentences in length;
      * should be ascii-7 or utf8 encoded, if another and the mbstring extension
      * is present it will try to detect and convert. However, experience has
-     * shown that mb_detect_encoding() *does not work very well* with at least 
+     * shown that mb_detect_encoding() *does not work very well* with at least
      * some types of encoding.
      *
-     * @access  public
-     * @param   string  $sample a sample of text to compare.
-     * @param   int     $limit  if specified, return an array of the most likely
-     *                           $limit languages and their scores.
-     * @return  mixed       sorted array of language scores, blank array if no 
-     *                      useable text was found, or PEAR_Error if error 
-     *                      with the object setup
-     * @see     _distance()
-     * @throws  PEAR_Error
+     * @param string $sample a sample of text to compare.
+     * @param int    $limit  if specified, return an array of the most likely
+     *                       $limit languages and their scores.
+     *
+     * @return mixed sorted array of language scores, blank array if no
+     *               useable text was found
+     * @see    _distance()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detect($sample, $limit = 0)
+    public function detect($sample, $limit = 0)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         // input check
         if (!Text_LanguageDetect_Parser::validateString($sample)) {
             return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
 
         // check char encoding
         // (only if mbstring extension is compiled and PHP > 4.0.6)
-        if (function_exists('mb_detect_encoding') 
-            && function_exists('mb_convert_encoding')) {
-
+        if (function_exists('mb_detect_encoding')
+            && function_exists('mb_convert_encoding')
+        ) {
             // mb_detect_encoding isn't very reliable, to say the least
-            // detection should still work with a sufficient sample of ascii characters
+            // detection should still work with a sufficient sample
+            //  of ascii characters
             $encoding = mb_detect_encoding($sample);
 
             // mb_detect_encoding() will return FALSE if detection fails
             // don't attempt conversion if that's the case
-            if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) {
-            
-                if (function_exists('mb_list_encodings')) {
-                    // verify the encoding exists in mb_list_encodings
-                    if (in_array($encoding, mb_list_encodings())) {
-                        $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
-                    }
-
-                    // if the previous condition failed:
-                    // somehow we detected an encoding that also we don't support
-
-                } else {
-                    // php 4 doesnt have mb_list_encodings()
-                    // so attempt with error suppression
-                    $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
+            if ($encoding != 'ASCII' && $encoding != 'UTF-8'
+                && $encoding !== false
+            ) {
+                // verify the encoding exists in mb_list_encodings
+                if (in_array($encoding, mb_list_encodings())) {
+                    $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
                 }
             }
         }
 
-        $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($sample);
         $sample_obj->prepareTrigram();
         if ($this->_use_unicode_narrowing) {
             $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
             if (is_array($blocks)) {
                 $present_blocks = array_keys($blocks);
             } else {
-                throw new Exception('Error during block detection');
+                throw new Text_LanguageDetect_Exception(
+                    'Error during block detection',
+                    Text_LanguageDetect_Exception::BLOCK_DETECTION
+                );
             }
 
             $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
             }
 
             // could also try an intersect operation rather than a union
-            // in other words, choose languages whose trigrams contain 
+            // in other words, choose languages whose trigrams contain
             // ALL of the unicode blocks found in this sample
             // would improve speed but would be completely thrown off by an
             // unexpected character, like an umlaut appearing in english text
 
             $possible_langs = array_intersect(
-                        array_keys($this->_lang_db),
-                        array_unique($possible_langs)
+                array_keys($this->_lang_db),
+                array_unique($possible_langs)
             );
 
-            // needs to intersect it with the keys of _lang_db in case 
+            // needs to intersect it with the keys of _lang_db in case
             // languages have been omitted
 
-        // or just try 'em all
         } else {
+            // or just try 'em all
             $possible_langs = array_keys($this->_lang_db);
         }
 
 
         foreach ($possible_langs as $lang) {
-            $scores[$lang] =
-                $this->_normalize_score(
-                        $this->_distance($this->_lang_db[$lang], $trigram_freqs),
-                        $trigram_count);
+            $scores[$lang] = $this->_normalize_score(
+                $this->_distance($this->_lang_db[$lang], $trigram_freqs),
+                $trigram_count
+            );
         }
 
         unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
             $limited_scores = array();
 
             $i = 0;
-
             foreach ($scores as $key => $value) {
                 if ($i++ >= $limit) {
                     break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
                 $limited_scores[$key] = $value;
             }
 
-            return $limited_scores;
+            return $this->_convertToNameMode($limited_scores, true);
         } else {
-            return $scores;
+            return $this->_convertToNameMode($scores, true);
         }
     }
 
@@ -791,35 +795,33 @@ class Text_LanguageDetect
      * Returns only the most similar language to the text sample
      *
      * Calls $this->detect() and returns only the top result
-     * 
-     * @access   public
-     * @param    string    $sample    text to detect the language of
-     * @return   string               the name of the most likely language
-     *                                or null if no language is similar
-     * @see      detect()
-     * @throws   PEAR_Error
+     *
+     * @param string $sample text to detect the language of
+     *
+     * @return string the name of the most likely language
+     *                or null if no language is similar
+     * @see    detect()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectSimple($sample)
+    public function detectSimple($sample)
     {
         $scores = $this->detect($sample, 1);
 
         // if top language has the maximum possible score,
         // then the top score will have been picked at random
-        if (    !is_array($scores) 
-                || empty($scores) 
-                || current($scores) == $this->_max_score) {
-
+        if (!is_array($scores) || empty($scores)
+            || current($scores) == $this->_max_score
+        ) {
             return null;
-
         } else {
-            return ucfirst(key($scores));
+            return key($scores);
         }
     }
 
     /**
      * Returns an array containing the most similar language and a confidence
      * rating
-     * 
+     *
      * Confidence is a simple measure calculated from the similarity score
      * minus the similarity score from the next most similar language
      * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
      * confidence scores.
      *
      * The similarity score answers the question "How likely is the text the
-     * returned language regardless of the other languages considered?" The 
+     * returned language regardless of the other languages considered?" The
      * confidence score is one way of answering the question "how likely is the
      * text the detected language relative to the rest of the language model
      * set?"
      *
      * To see how similar languages are a priori, see languageSimilarity()
-     * 
-     * @access   public
-     * @param    string    $sample    text for which language will be detected
-     * @return   array     most similar language, score and confidence rating
-     *                     or null if no language is similar
-     * @see      detect()
-     * @throws   PEAR_Error
+     *
+     * @param string $sample text for which language will be detected
+     *
+     * @return array most similar language, score and confidence rating
+     *               or null if no language is similar
+     * @see    detect()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectConfidence($sample)
+    public function detectConfidence($sample)
     {
         $scores = $this->detect($sample, 2);
 
-        // if most similar language has the max score, it 
+        // if most similar language has the max score, it
         // will have been picked at random
-        if (    !is_array($scores) 
-                || empty($scores) 
-                || current($scores) == $this->_max_score) {
-
+        if (!is_array($scores) || empty($scores)
+            || current($scores) == $this->_max_score
+        ) {
             return null;
         }
 
-        $arr['language'] = ucfirst(key($scores));
+        $arr['language'] = key($scores);
         $arr['similarity'] = current($scores);
         if (next($scores) !== false) { // if false then no next element
             // the goal is to return a higher value if the distance between
             // the similarity of the first score and the second score is high
 
             if ($this->_perl_compatible) {
-
-                $arr['confidence'] =
-                    (current($scores) - $arr['similarity']) / $this->_max_score;
+                $arr['confidence'] = (current($scores) - $arr['similarity'])
+                    / $this->_max_score;
 
             } else {
-
                 $arr['confidence'] = $arr['similarity'] - current($scores);
 
             }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
      * Returns the distribution of unicode blocks in a given utf8 string
      *
      * For the block name of a single char, use unicodeBlockName()
-     * 
-     * @access public
-     * @param string $str input string. Must be ascii or utf8
-     * @param bool $skip_symbols if true, skip ascii digits, symbols and 
-     *                           non-printing characters. Includes spaces,
-     *                           newlines and common punctutation characters.
+     *
+     * @param string $str          input string. Must be ascii or utf8
+     * @param bool   $skip_symbols if true, skip ascii digits, symbols and
+     *                             non-printing characters. Includes spaces,
+     *                             newlines and common punctuation characters.
+     *
      * @return array
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectUnicodeBlocks($str, $skip_symbols)
+    public function detectUnicodeBlocks($str, $skip_symbols)
     {
-        // input check
-        if (!is_bool($skip_symbols)) {
-            throw new Exception('Second parameter must be boolean');
-        } 
-
-        if (!is_string($str)) {
-            throw new Exception('First parameter was not a string');
-        }
+        $skip_symbols = (bool)$skip_symbols;
+        $str          = (string)$str;
 
-        $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($str);
         $sample_obj->prepareUnicode();
         $sample_obj->prepareTrigram(false);
         $sample_obj->setUnicodeSkipSymbols($skip_symbols);
         $sample_obj->analyze();
-        $blocks =& $sample_obj->getUnicodeBlocks();
+        $blocks = $sample_obj->getUnicodeBlocks();
         unset($sample_obj);
         return $blocks;
     }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
     /**
      * Returns the block name for a given unicode value
      *
-     * If passed a string, will assume it is being passed a UTF8-formatted 
+     * If passed a string, will assume it is being passed a UTF8-formatted
      * character and will automatically convert. Otherwise it will assume it
      * is being passed a numeric unicode value.
      *
      * Make sure input is of the correct type!
      *
-     * @access public
      * @param mixed $unicode unicode value or utf8 char
+     *
      * @return mixed the block name string or false if not found
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
      */
-    function unicodeBlockName($unicode) {
+    public function unicodeBlockName($unicode)
+    {
         if (is_string($unicode)) {
             // assume it is being passed a utf8 char, so convert it
-
-            // input check
-            if ($this->utf8strlen($unicode) > 1) {
-                throw new Exception('Pass this function only a single char');
+            if (self::utf8strlen($unicode) > 1) {
+                throw new Text_LanguageDetect_Exception(
+                    'Pass a single char only to this method',
+                    Text_LanguageDetect_Exception::PARAM_TYPE
+                );
             }
-
             $unicode = $this->_utf8char2unicode($unicode);
 
-            if ($unicode == -1) {
-                throw new Exception('Malformatted char');
-            }
-
-        // input check
         } elseif (!is_int($unicode)) {
-            throw new Exception('Input must be of type string or int.');
+            throw new Text_LanguageDetect_Exception(
+                'Input must be of type string or int.',
+                Text_LanguageDetect_Exception::PARAM_TYPE
+            );
         }
 
-        $blocks =& $this->_read_unicode_block_db();
+        $blocks = $this->_read_unicode_block_db();
 
         $result = $this->_unicode_block_name($unicode, $blocks);
 
@@ -964,14 +956,17 @@ class Text_LanguageDetect
      * the public interface for this function, which does input checks which
      * this function omits for speed.
      *
-     * @access  protected
-     * @param   int     $unicode the unicode value
-     * @param   array   &$blocks the block database
-     * @param   int     $block_count the number of defined blocks in the database
-     * @see     unicodeBlockName()
+     * @param int   $unicode     the unicode value
+     * @param array $blocks      the block database
+     * @param int   $block_count the number of defined blocks in the database
+     *
+     * @return mixed Block name, -1 if it failed
+     * @see    unicodeBlockName()
+     * @access protected
      */
-    function _unicode_block_name($unicode, &$blocks, $block_count = -1) {
-        // for a reference, see 
+    function _unicode_block_name($unicode, $blocks, $block_count = -1)
+    {
+        // for a reference, see
         // http://www.unicode.org/Public/UNIDATA/Blocks.txt
 
         // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
         while ($low <= $high) {
             $mid = floor(($low + $high) / 2);
 
-            // if it's lower than the lower bound
             if ($unicode < $blocks[$mid][0]) {
+                // if it's lower than the lower bound
                 $high = $mid - 1;
 
-            // if it's higher than the upper bound
             } elseif ($unicode > $blocks[$mid][1]) {
+                // if it's higher than the upper bound
                 $low = $mid + 1;
 
-            // found it
             } else {
+                // found it
                 return $blocks[$mid];
             }
         }
 
-        // failed to find the block 
+        // failed to find the block
         return -1;
 
-        // todo: differentiate when it's out of range or when it falls 
+        // todo: differentiate when it's out of range or when it falls
         //       into an unassigned range?
     }
 
     /**
      * Brings up the unicode block database
      *
-     * @access protected
      * @return array the database of unicode block definitions
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
+     * @access protected
      */
-    function &_read_unicode_block_db() {
+    function _read_unicode_block_db()
+    {
         // since the unicode definitions are always going to be the same,
         // might as well share the memory for the db with all other instances
         // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
 
     /**
      * Calculate the similarities between the language models
-     * 
+     *
      * Use this function to see how similar languages are to each other.
      *
      * If passed 2 language names, will return just those languages compared.
      * If passed 1 language name, will return that language compared to
      * all others.
-     * If passed none, will return an array of every language model compared 
+     * If passed none, will return an array of every language model compared
      * to every other one.
      *
-     * @access  public
-     * @param   string   $lang1   the name of the first language to be compared
-     * @param   string   $lang2   the name of the second language to be compared
-     * @return  array    scores of every language compared
-     *                   or the score of just the provided languages
-     *                   or null if one of the supplied languages does not exist
-     * @throws  PEAR_Error
+     * @param string $lang1 the name of the first language to be compared
+     * @param string $lang2 the name of the second language to be compared
+     *
+     * @return array scores of every language compared
+     *               or the score of just the provided languages
+     *               or null if one of the supplied languages does not exist
+     * @throws Text_LanguageDetect_Exception
      */
-    function languageSimilarity($lang1 = null, $lang2 = null)
+    public function languageSimilarity($lang1 = null, $lang2 = null)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
+        $lang1 = $this->_convertFromNameMode($lang1);
+        $lang2 = $this->_convertFromNameMode($lang2);
         if ($lang1 != null) {
             $lang1 = strtolower($lang1);
 
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
             }
 
             if ($lang2 != null) {
-
-                // can't only set the second param
-                if ($lang1 == null) {
-                    return null;
-                // check if language model exists
-                } elseif (!isset($this->_lang_db[$lang2])) {
+                if (!isset($this->_lang_db[$lang2])) {
+                    // check if language model exists
                     return null;
                 }
 
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
                     )
                 );
 
-
-            // compare just $lang1 to all languages
             } else {
+                // compare just $lang1 to all languages
                 $return_arr = array();
                 foreach ($this->_lang_db as $key => $value) {
-                    if ($key != $lang1) { // don't compare a language to itself
+                    if ($key != $lang1) {
+                        // don't compare a language to itself
                         $return_arr[$key] = $this->_normalize_score(
-                            $this->_distance($this->_lang_db[$lang1], $value));
+                            $this->_distance($this->_lang_db[$lang1], $value)
+                        );
                     }
                 }
                 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
             }
 
 
-        // compare all languages to each other
         } else {
+            // compare all languages to each other
             $return_arr = array();
             foreach (array_keys($this->_lang_db) as $lang1) {
                 foreach (array_keys($this->_lang_db) as $lang2) {
-
                     // skip comparing languages to themselves
-                    if ($lang1 != $lang2) { 
-                    
-                        // don't re-calculate what's already been done
-                        if (isset($return_arr[$lang2][$lang1])) {
+                    if ($lang1 != $lang2) {
 
-                            $return_arr[$lang1][$lang2] =
-                                $return_arr[$lang2][$lang1];
+                        if (isset($return_arr[$lang2][$lang1])) {
+                            // don't re-calculate what's already been done
+                            $return_arr[$lang1][$lang2]
+                                = $return_arr[$lang2][$lang1];
 
-                        // calculate
                         } else {
-
-                            $return_arr[$lang1][$lang2] = 
-                                $this->_normalize_score(
-                                        $this->_distance(
-                                            $this->_lang_db[$lang1],
-                                            $this->_lang_db[$lang2]
-                                        )
+                            // calculate
+                            $return_arr[$lang1][$lang2]
+                                = $this->_normalize_score(
+                                    $this->_distance(
+                                        $this->_lang_db[$lang1],
+                                        $this->_lang_db[$lang2]
+                                    )
                                 );
 
                         }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
      *
      * @access      public
      * @return      array language cluster data
-     * @throws      PEAR_Error
+     * @throws      Text_LanguageDetect_Exception
      * @see         languageSimilarity()
-     * @deprecated  this function will eventually be removed and placed into 
+     * @deprecated  this function will eventually be removed and placed into
      *              the model generation class
      */
     function clusterLanguages()
     {
         // todo: set the maximum number of clusters
-
-        // setup check
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         // return cached result, if any
         if (isset($this->_clusters)) {
             return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
 
         foreach ($langs as $lang) {
             if (!isset($this->_lang_db[$lang])) {
-                throw new Exception("missing $lang!\n");
+                throw new Text_LanguageDetect_Exception(
+                    "missing $lang!",
+                    Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
+                );
             }
         }
 
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
             $langs[$lang1] = $lang1;
             unset($langs[$old_key]);
         }
-        
+
+        $result_data = $really_map = array();
+
         $i = 0;
         while (count($langs) > 2 && $i++ < 200) {
             $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
             $highest_key2 = '';
             foreach ($langs as $lang1) {
                 foreach ($langs as $lang2) {
-                    if (    $lang1 != $lang2 
-                            && $arr[$lang1][$lang2] > $highest_score) {
+                    if ($lang1 != $lang2
+                        && $arr[$lang1][$lang2] > $highest_score
+                    ) {
                         $highest_score = $arr[$lang1][$lang2];
                         $highest_key1 = $lang1;
                         $highest_key2 = $lang2;
                     }
                 }
             }
-            
+
             if (!$highest_key1) {
                 // should not ever happen
-                throw new Exception("no highest key? (step: $i)");
+                throw new Text_LanguageDetect_Exception(
+                    "no highest key? (step: $i)",
+                    Text_LanguageDetect_Exception::NO_HIGHEST_KEY
+                );
             }
 
             if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
             $sum1 = array_sum($arr[$highest_key1]);
             $sum2 = array_sum($arr[$highest_key2]);
 
-            // use the score for the one that is most similar to the rest of 
+            // use the score for the one that is most similar to the rest of
             // the field as the score for the group
             // todo: could try averaging or "centroid" method instead
             // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
             $really_lang = $replaceme;
             while (isset($really_map[$really_lang])) {
                 $really_lang = $really_map[$really_lang];
-            } 
+            }
             $really_map[$newkey] = $really_lang;
 
 
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
                         $arr[$key1][$newkey] = $arr[$key1][$key2];
                         unset($arr[$key1][$key2]);
                         // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
-                    } 
-                    
+                    }
+
                     if ($key1 == $replaceme) {
                         $arr[$newkey][$key2] = $arr[$key1][$key2];
                         unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
                     }
                 }
             }
-                        
+
 
             unset($langs[$highest_key1]);
             unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
         }
 
         $return_val = array(
-                'open_forks' => $langs, 
+                'open_forks' => $langs,
                         // the top level of clusters
                         // clusters that are mutually exclusive
                         // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
      * use, and it may disappear or its functionality may change in future
      * releases without notice.
      *
-     * This compares the sample text to top the top level of clusters. If the 
+     * This compares the sample text to the top level of clusters. If the
      * sample is similar to the cluster it will drop down and compare it to the
      * languages in the cluster, and so on until it hits a leaf node.
      *
-     * this should find the language in considerably fewer compares 
+     * this should find the language in considerably fewer compares
      * (the equivalent of a binary search), however clusterLanguages() is costly
      * and the loss of accuracy from this technique is significant.
      *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
      * was very large, however in such cases some method of Bayesian inference
      * might be more helpful.
      *
-     * @see     clusterLanguages()
-     * @access  public
-     * @param   string $str input string
-     * @return  array language scores (only those compared)
-     * @throws  PEAR_Error
+     * @param string $str input string
+     *
+     * @return array language scores (only those compared)
+     * @throws Text_LanguageDetect_Exception
+     * @see    clusterLanguages()
      */
-    function clusteredSearch($str)
+    public function clusteredSearch($str)
     {
-
         // input check
         if (!Text_LanguageDetect_Parser::validateString($str)) {
             return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
         $dendogram_data  = $result['fork_data'];
         $dendogram_alias = $result['name_map'];
 
-        $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($str);
         $sample_obj->prepareTrigram();
         $sample_obj->setPadStart(!$this->_perl_compatible);
         $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
         }
 
         $i = 0; // counts the number of steps
-        
+
         foreach ($dendogram_start as $lang) {
             if (isset($dendogram_alias[$lang])) {
                 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
 
             $scores[$lang] = $this->_normalize_score(
                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
-                $sample_count);
+                $sample_count
+            );
 
             $i++;
         }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
 
                 $scores[$lang] = $this->_normalize_score(
                     $this->_distance($this->_lang_db[$lang_key], $sample_result),
-                    $sample_count);
+                    $sample_count
+                );
 
                 //todo: does not need to do same comparison again
             }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
 
             $diff = $scores[$cur_key] - $scores[$loser_key];
 
-            // $cur_key ({$dendogram_alias[$cur_key]}) wins 
-            // over $loser_key ({$dendogram_alias[$loser_key]}) 
+            // $cur_key ({$dendogram_alias[$cur_key]}) wins
+            // over $loser_key ({$dendogram_alias[$loser_key]})
             // with a difference of $diff
         }
 
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
         // which paths the algorithm decided to take along the tree
 
         // but sometimes the last item is only the second highest
-        if (   ($this->_perl_compatible  && (end($scores) > prev($scores)))
-            || (!$this->_perl_compatible && (end($scores) < prev($scores)))) {
-
+        if (($this->_perl_compatible  && (end($scores) > prev($scores)))
+            || (!$this->_perl_compatible && (end($scores) < prev($scores)))
+        ) {
             $real_last_score = current($scores);
             $real_last_key = key($scores);
 
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
             unset($scores[$real_last_key]);
             $scores[$real_last_key] = $real_last_score;
         }
-            
+
 
         if (!$this->_perl_compatible) {
             $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
      *
      * Returns the numbers of characters (not bytes) in a utf8 string
      *
-     * @static
-     * @access  public
-     * @param   string $str string to get the length of
-     * @return  int         number of chars
+     * @param string $str string to get the length of
+     *
+     * @return int number of chars
      */
-    function utf8strlen($str)
+    public static function utf8strlen($str)
     {
         // utf8_decode() will convert unknown chars to '?', which is actually
         // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
     /**
      * Returns the unicode value of a utf8 char
      *
-     * @access  protected
-     * @param   string $char a utf8 (possibly multi-byte) char
-     * @return  int          unicode value or -1 if malformatted
+     * @param string $char a utf8 (possibly multi-byte) char
+     *
+     * @return int unicode value
+     * @access protected
+     * @link   http://en.wikipedia.org/wiki/UTF-8
      */
-    function _utf8char2unicode($char) {
-
+    function _utf8char2unicode($char)
+    {
         // strlen() here will actually get the binary length of a single char
         switch (strlen($char)) {
-
-            // for a reference, see http://en.wikipedia.org/wiki/UTF-8
-
-            case 1:
-                // normal ASCII-7 byte
-                // 0xxxxxxx -->  0xxxxxxx
-                return ord($char{0});
-
-            case 2:
-                // 2 byte unicode
-                // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
-                $z = (ord($char{0}) & 0x000001F) << 6;
-                $x = (ord($char{1}) & 0x0000003F);
-
-                return ($z | $x);
-
-            case 3:
-                // 3 byte unicode
-                // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx 
-                $z =  (ord($char{0}) & 0x0000000F) << 12;
-                $x1 = (ord($char{1}) & 0x0000003F) << 6;
-                $x2 = (ord($char{2}) & 0x0000003F);
-
-                return ($z | $x1 | $x2);
-
-            case 4:
-                // 4 byte unicode
-                // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
-                // 000zzzzz xxxxxxxx xxxxxxxx
-                $z1 = (ord($char{0}) & 0x00000007) << 18;
-                $z2 = (ord($char{1}) & 0x0000003F) << 12;
-                $x1 = (ord($char{2}) & 0x0000003F) << 6;
-                $x2 = (ord($char{3}) & 0x0000003F);
-
-                return ($z1 | $z2 | $x1 | $x2);
-
-            default:
-                // error: malformatted char?
-                return -1;
+        case 1:
+            // normal ASCII-7 byte
+            // 0xxxxxxx -->  0xxxxxxx
+            return ord($char{0});
+
+        case 2:
+            // 2 byte unicode
+            // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
+            $z = (ord($char{0}) & 0x000001F) << 6;
+            $x = (ord($char{1}) & 0x0000003F);
+            return ($z | $x);
+
+        case 3:
+            // 3 byte unicode
+            // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
+            $z =  (ord($char{0}) & 0x0000000F) << 12;
+            $x1 = (ord($char{1}) & 0x0000003F) << 6;
+            $x2 = (ord($char{2}) & 0x0000003F);
+            return ($z | $x1 | $x2);
+
+        case 4:
+            // 4 byte unicode
+            // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
+            // 000zzzzz xxxxxxxx xxxxxxxx
+            $z1 = (ord($char{0}) & 0x00000007) << 18;
+            $z2 = (ord($char{1}) & 0x0000003F) << 12;
+            $x1 = (ord($char{2}) & 0x0000003F) << 6;
+            $x2 = (ord($char{3}) & 0x0000003F);
+            return ($z1 | $z2 | $x1 | $x2);
         }
     }
 
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
      * utf8-safe fast character iterator
      *
      * Will get the next character starting from $counter, which will then be
-     * incremented. If a multi-byte char the bytes will be concatenated and 
+     * incremented. If a multi-byte char the bytes will be concatenated and
      * $counter will be incremeted by the number of bytes in the char.
      *
-     * @access  private
-     * @param   string  &$str        the string being iterated over
-     * @param   int     &$counter    the iterator, will increment by reference
-     * @param   bool    $special_convert whether to do special conversions
-     * @return  char    the next (possibly multi-byte) char from $counter
+     * @param string $str             the string being iterated over
+     * @param int    &$counter        the iterator, will increment by reference
+     * @param bool   $special_convert whether to do special conversions
+     *
+     * @return char the next (possibly multi-byte) char from $counter
+     * @access private
      */
-    function _next_char(&$str, &$counter, $special_convert = false)
+    static function _next_char($str, &$counter, $special_convert = false)
     {
-
         $char = $str{$counter++};
         $ord = ord($char);
 
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
 
         // normal ascii one byte char
         if ($ord <= 127) {
-
             // special conversions needed for this package
             // (that only apply to regular ascii characters)
             // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
 
             return $char;
 
-        // multi-byte chars
         } elseif ($ord >> 5 == 6) { // two-byte char
+            // multi-byte chars
             $nextchar = $str{$counter++}; // get next byte
 
             // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
                 if ($ord == 195) {
                     $nextord = ord($nextchar);
                     $nextord_adj = $nextord + 64;
-                    // for a reference, see 
+                    // for a reference, see
                     // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
 
                     // &Agrave; - &THORN; but not &times;
-                    if (    $nextord_adj >= 192
-                            && $nextord_adj <= 222 
-                            && $nextord_adj != 215) {
-
-                        $nextchar = chr($nextord + 32); 
+                    if ($nextord_adj >= 192
+                        && $nextord_adj <= 222
+                        && $nextord_adj != 215
+                    ) {
+                        $nextchar = chr($nextord + 32);
                     }
 
-                // lower case cyrillic alphabet
                 } elseif ($ord == 208) {
+                    // lower case cyrillic alphabet
                     $nextord = ord($nextchar);
                     // if A - Pe
                     if ($nextord >= 144 && $nextord <= 159) {
                         // lower case
                         $nextchar = chr($nextord + 32);
 
-                    // if Er - Ya
                     } elseif ($nextord >= 160 && $nextord <= 175) {
+                        // if Er - Ya
                         // lower case
                         $char = chr(209); // == $ord++
                         $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
             }
 
             // tag on next byte
-            return $char . $nextchar; 
-
+            return $char . $nextchar;
         } elseif ($ord >> 4  == 14) { // three-byte char
-            
+
             // tag on next 2 bytes
-            return $char . $str{$counter++} . $str{$counter++}; 
+            return $char . $str{$counter++} . $str{$counter++};
 
         } elseif ($ord >> 3 == 30) { // four-byte char
 
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
         }
     }
 
-}
+    /**
+     * Converts an $language input parameter from the configured mode
+     * to the language name that is used internally.
+     *
+     * Works for strings and arrays.
+     *
+     * @param string|array $lang       A language description ("english"/"en"/"eng")
+     * @param boolean      $convertKey If $lang is an array, setting $key
+     *                                 converts the keys to the language name.
+     *
+     * @return string|array Language name
+     */
+    function _convertFromNameMode($lang, $convertKey = false)
+    {
+        if ($this->_name_mode == 0) {
+            return $lang;
+        }
+
+        if ($this->_name_mode == 2) {
+            $method = 'code2ToName';
+        } else {
+            $method = 'code3ToName';
+        }
+
+        if (is_string($lang)) {
+            return (string)Text_LanguageDetect_ISO639::$method($lang);
+        }
+
+        $newlang = array();
+        foreach ($lang as $key => $val) {
+            if ($convertKey) {
+                $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
+                $newlang[$newkey] = $val;
+            } else {
+                $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
+            }
+        }
+        return $newlang;
+    }
 
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+    /**
+     * Converts an $language output parameter from the language name that is
+     * used internally to the configured mode.
+     *
+     * Works for strings and arrays.
+     *
+     * @param string|array $lang       A language description ("english"/"en"/"eng")
+     * @param boolean      $convertKey If $lang is an array, setting $key
+     *                                 converts the keys to the language name.
+     *
+     * @return string|array Language name
+     */
+    function _convertToNameMode($lang, $convertKey = false)
+    {
+        if ($this->_name_mode == 0) {
+            return $lang;
+        }
+
+        if ($this->_name_mode == 2) {
+            $method = 'nameToCode2';
+        } else {
+            $method = 'nameToCode3';
+        }
+
+        if (is_string($lang)) {
+            return Text_LanguageDetect_ISO639::$method($lang);
+        }
+
+        $newlang = array();
+        foreach ($lang as $key => $val) {
+            if ($convertKey) {
+                $newkey = Text_LanguageDetect_ISO639::$method($key);
+                $newlang[$newkey] = $val;
+            } else {
+                $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
+            }
+        }
+        return $newlang;
+    }
+}
 
-?>
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
\ No newline at end of file
index 2e8991cc3196c91db0bfe67339f1dbd9a7924cba..d0f09d74c3bdbda11d7a669a2a4f31ffbd539e44 100644 (file)
-<?php\r
-/** \r
-* Arc90's Readability ported to PHP for FiveFilters.org\r
-* Based on readability.js version 1.7.1 (without multi-page support)\r
-* Updated to allow HTML5 parsing with html5lib\r
-* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds\r
-* ------------------------------------------------------\r
-* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js\r
-* Arc90's project URL: http://lab.arc90.com/experiments/readability/\r
-* JS Source: http://code.google.com/p/arc90labs-readability\r
-* Ported by: Keyvan Minoukadeh, http://www.keyvan.net\r
-* More information: http://fivefilters.org/content-only/\r
-* License: Apache License, Version 2.0\r
-* Requires: PHP5\r
-* Date: 2012-09-19\r
-* \r
-* Differences between the PHP port and the original\r
-* ------------------------------------------------------\r
-* Arc90's Readability is designed to run in the browser. It works on the DOM \r
-* tree (the parsed HTML) after the page's CSS styles have been applied and \r
-* Javascript code executed. This PHP port does not run inside a browser. \r
-* We use PHP's ability to parse HTML to build our DOM tree, but we cannot \r
-* rely on CSS or Javascript support. As such, the results will not always \r
-* match Arc90's Readability. (For example, if a web page contains CSS style \r
-* rules or Javascript code which hide certain HTML elements from display, \r
-* Arc90's Readability will dismiss those from consideration but our PHP port, \r
-* unable to understand CSS or Javascript, will not know any better.)\r
-* \r
-* Another significant difference is that the aim of Arc90's Readability is \r
-* to re-present the main content block of a given web page so users can \r
-* read it more easily in their browsers. Correct identification, clean up, \r
-* and separation of the content block is only a part of this process. \r
-* This PHP port is only concerned with this part, it does not include code \r
-* that relates to presentation in the browser - Arc90 already do \r
-* that extremely well, and for PDF output there's FiveFilters.org's \r
-* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.\r
-* \r
-* Finally, this class contains methods that might be useful for developers \r
-* working on HTML document fragments. So without deviating too much from \r
-* the original code (which I don't want to do because it makes debugging \r
-* and updating more difficult), I've tried to make it a little more \r
-* developer friendly. You should be able to use the methods here on \r
-* existing DOMElement objects without passing an entire HTML document to \r
-* be parsed.\r
-*/\r
-\r
-// This class allows us to do JavaScript like assignements to innerHTML\r
-require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');\r
-\r
-// Alternative usage (for testing only!)\r
-// uncomment the lines below and call Readability.php in your browser \r
-// passing it the URL of the page you'd like content from, e.g.:\r
-// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php\r
-\r
-/*\r
-if (!isset($_GET['url']) || $_GET['url'] == '') {\r
-       die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');\r
-}\r
-$url = $_GET['url'];\r
-if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;\r
-$html = file_get_contents($url);\r
-$r = new Readability($html, $url);\r
-$r->init();\r
-echo $r->articleContent->innerHTML;\r
-*/\r
-\r
-class Readability\r
-{\r
-       public $version = '1.7.1-without-multi-page';\r
-       public $convertLinksToFootnotes = false;\r
-       public $revertForcedParagraphElements = true;\r
-       public $articleTitle;\r
-       public $articleContent;\r
-       public $dom;\r
-       public $url = null; // optional - URL where HTML was retrieved\r
-       public $debug = false;\r
-       public $lightClean = true; // preserves more content (experimental) added 2012-09-19\r
-       protected $body = null; // \r
-       protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later\r
-       protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.\r
-       protected $success = false; // indicates whether we were able to extract or not\r
-       \r
-       /**\r
-       * All of the regular expressions in use within readability.\r
-       * Defined up here so we don't instantiate them repeatedly in loops.\r
-       **/\r
-       public $regexps = array(\r
-               'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',\r
-               'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',\r
-               'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',\r
-               'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',\r
-               'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',\r
-               'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',\r
-               'replaceFonts' => '/<(\/?)font[^>]*>/i',\r
-               // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()\r
-               'normalize' => '/\s{2,}/',\r
-               'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',\r
-               'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',\r
-               'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'\r
-       );      \r
-       \r
-       /* constants */\r
-       const FLAG_STRIP_UNLIKELYS = 1;\r
-       const FLAG_WEIGHT_CLASSES = 2;\r
-       const FLAG_CLEAN_CONDITIONALLY = 4;\r
-       \r
-       /**\r
-       * Create instance of Readability\r
-       * @param string UTF-8 encoded string\r
-       * @param string (optional) URL associated with HTML (used for footnotes)\r
-       * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')\r
-       */      \r
-       function __construct($html, $url=null, $parser='libxml')\r
-       {\r
-               $this->url = $url;\r
-               /* Turn all double br's into p's */\r
-               $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);\r
-               $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);\r
-               $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");\r
-               if (trim($html) == '') $html = '<html></html>';\r
-               if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {\r
-                       // all good\r
-               } else {\r
-                       $this->dom = new DOMDocument();\r
-                       $this->dom->preserveWhiteSpace = false;\r
-                       @$this->dom->loadHTML($html);\r
-               }\r
-               $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');\r
-       }\r
-\r
-       /**\r
-       * Get article title element\r
-       * @return DOMElement\r
-       */\r
-       public function getTitle() {\r
-               return $this->articleTitle;\r
-       }\r
-       \r
-       /**\r
-       * Get article content element\r
-       * @return DOMElement\r
-       */\r
-       public function getContent() {\r
-               return $this->articleContent;\r
-       }       \r
-       \r
-       /**\r
-       * Runs readability.\r
-       * \r
-       * Workflow:\r
-       *  1. Prep the document by removing script tags, css, etc.\r
-       *  2. Build readability's DOM tree.\r
-       *  3. Grab the article content from the current dom tree.\r
-       *  4. Replace the current DOM tree with the new one.\r
-       *  5. Read peacefully.\r
-       *\r
-       * @return boolean true if we found content, false otherwise\r
-       **/\r
-       public function init()\r
-       {\r
-               if (!isset($this->dom->documentElement)) return false;\r
-               $this->removeScripts($this->dom);\r
-               //die($this->getInnerHTML($this->dom->documentElement));\r
-               \r
-               // Assume successful outcome\r
-               $this->success = true;\r
-\r
-               $bodyElems = $this->dom->getElementsByTagName('body');\r
-               if ($bodyElems->length > 0) {\r
-                       if ($this->bodyCache == null) {\r
-                               $this->bodyCache = $bodyElems->item(0)->innerHTML;\r
-                       }\r
-                       if ($this->body == null) {\r
-                               $this->body = $bodyElems->item(0);\r
-                       }\r
-               }\r
-\r
-               $this->prepDocument();\r
-               \r
-               //die($this->dom->documentElement->parentNode->nodeType);\r
-               //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));\r
-               //die($this->getInnerHTML($this->dom->documentElement));\r
-\r
-               /* Build readability's DOM tree */\r
-               $overlay        = $this->dom->createElement('div');\r
-               $innerDiv       = $this->dom->createElement('div');\r
-               $articleTitle   = $this->getArticleTitle();\r
-               $articleContent = $this->grabArticle();\r
-\r
-               if (!$articleContent) {\r
-                       $this->success = false;\r
-                       $articleContent = $this->dom->createElement('div');\r
-                       $articleContent->setAttribute('id', 'readability-content');\r
-                       $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';            \r
-               }\r
-               \r
-               $overlay->setAttribute('id', 'readOverlay');\r
-               $innerDiv->setAttribute('id', 'readInner');\r
-\r
-               /* Glue the structure of our document together. */\r
-               $innerDiv->appendChild($articleTitle);\r
-               $innerDiv->appendChild($articleContent);\r
-               $overlay->appendChild($innerDiv);\r
-               \r
-               /* Clear the old HTML, insert the new content. */\r
-               $this->body->innerHTML = '';\r
-               $this->body->appendChild($overlay);\r
-               //document.body.insertBefore(overlay, document.body.firstChild);\r
-               $this->body->removeAttribute('style');\r
-\r
-               $this->postProcessContent($articleContent);\r
-               \r
-               // Set title and content instance variables\r
-               $this->articleTitle = $articleTitle;\r
-               $this->articleContent = $articleContent;\r
-               \r
-               return $this->success;\r
-       }\r
-       \r
-       /**\r
-       * Debug\r
-       */\r
-       protected function dbg($msg) {\r
-               if ($this->debug) echo '* ',$msg, "\n";\r
-       }\r
-       \r
-       /**\r
-       * Run any post-process modifications to article content as necessary.\r
-       *\r
-       * @param DOMElement\r
-       * @return void\r
-       */\r
-       public function postProcessContent($articleContent) {\r
-               if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { \r
-                       $this->addFootnotes($articleContent);\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Get the article title as an H1.\r
-       *\r
-       * @return DOMElement\r
-       */\r
-       protected function getArticleTitle() {\r
-               $curTitle = '';\r
-               $origTitle = '';\r
-\r
-               try {\r
-                       $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));\r
-               } catch(Exception $e) {}\r
-               \r
-               if (preg_match('/ [\|\-] /', $curTitle))\r
-               {\r
-                       $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);\r
-                       \r
-                       if (count(explode(' ', $curTitle)) < 3) {\r
-                               $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);\r
-                       }\r
-               }\r
-               else if (strpos($curTitle, ': ') !== false)\r
-               {\r
-                       $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);\r
-\r
-                       if (count(explode(' ', $curTitle)) < 3) {\r
-                               $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);\r
-                       }\r
-               }\r
-               else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)\r
-               {\r
-                       $hOnes = $this->dom->getElementsByTagName('h1');\r
-                       if($hOnes->length == 1)\r
-                       {\r
-                               $curTitle = $this->getInnerText($hOnes->item(0));\r
-                       }\r
-               }\r
-\r
-               $curTitle = trim($curTitle);\r
-\r
-               if (count(explode(' ', $curTitle)) <= 4) {\r
-                       $curTitle = $origTitle;\r
-               }\r
-               \r
-               $articleTitle = $this->dom->createElement('h1');\r
-               $articleTitle->innerHTML = $curTitle;\r
-               \r
-               return $articleTitle;\r
-       }\r
-       \r
-       /**\r
-       * Prepare the HTML document for readability to scrape it.\r
-       * This includes things like stripping javascript, CSS, and handling terrible markup.\r
-       * \r
-       * @return void\r
-       **/\r
-       protected function prepDocument() {\r
-               /**\r
-               * In some cases a body element can't be found (if the HTML is totally hosed for example)\r
-               * so we create a new body node and append it to the document.\r
-               */\r
-               if ($this->body == null)\r
-               {\r
-                       $this->body = $this->dom->createElement('body');\r
-                       $this->dom->documentElement->appendChild($this->body);\r
-               }\r
-               $this->body->setAttribute('id', 'readabilityBody');\r
-\r
-               /* Remove all style tags in head */\r
-               $styleTags = $this->dom->getElementsByTagName('style');\r
-               for ($i = $styleTags->length-1; $i >= 0; $i--)\r
-               {\r
-                       $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));\r
-               }\r
-\r
-               /* Turn all double br's into p's */\r
-               /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */\r
-               //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');\r
-               // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree.\r
-               // Manipulating innerHTML as it's done in JS is not possible in PHP.\r
-       }\r
-\r
-       /**\r
-       * For easier reading, convert this document to have footnotes at the bottom rather than inline links.\r
-       * @see http://www.roughtype.com/archives/2010/05/experiments_in.php\r
-       *\r
-       * @return void\r
-       **/\r
-       public function addFootnotes($articleContent) {\r
-               $footnotesWrapper = $this->dom->createElement('div');\r
-               $footnotesWrapper->setAttribute('id', 'readability-footnotes');\r
-               $footnotesWrapper->innerHTML = '<h3>References</h3>';\r
-               \r
-               $articleFootnotes = $this->dom->createElement('ol');\r
-               $articleFootnotes->setAttribute('id', 'readability-footnotes-list');\r
-               $footnotesWrapper->appendChild($articleFootnotes);\r
-               \r
-               $articleLinks = $articleContent->getElementsByTagName('a');\r
-               \r
-               $linkCount = 0;\r
-               for ($i = 0; $i < $articleLinks->length; $i++)\r
-               {\r
-                       $articleLink  = $articleLinks->item($i);\r
-                       $footnoteLink = $articleLink->cloneNode(true);\r
-                       $refLink      = $this->dom->createElement('a');\r
-                       $footnote     = $this->dom->createElement('li');\r
-                       $linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);\r
-                       if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);\r
-                       //linkDomain   = footnoteLink.host ? footnoteLink.host : document.location.host,\r
-                       $linkText     = $this->getInnerText($articleLink);\r
-                       \r
-                       if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {\r
-                               continue;\r
-                       }\r
-                       \r
-                       $linkCount++;\r
-\r
-                       /** Add a superscript reference after the article link */\r
-                       $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);\r
-                       $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';\r
-                       $refLink->setAttribute('class', 'readability-DoNotFootnote');\r
-                       $refLink->setAttribute('style', 'color: inherit;');\r
-                       \r
-                       //TODO: does this work or should we use DOMNode.isSameNode()?\r
-                       if ($articleLink->parentNode->lastChild == $articleLink) {\r
-                               $articleLink->parentNode->appendChild($refLink);\r
-                       } else {\r
-                               $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);\r
-                       }\r
-\r
-                       $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');\r
-                       $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);\r
-\r
-                       $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';\r
-\r
-                       $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);\r
-                       $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);\r
-                       \r
-                       $footnote->appendChild($footnoteLink);\r
-                       if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';\r
-                       \r
-                       $articleFootnotes->appendChild($footnote);\r
-               }\r
-\r
-               if ($linkCount > 0) {\r
-                       $articleContent->appendChild($footnotesWrapper);           \r
-               }\r
-       }\r
-\r
-       /**\r
-       * Reverts P elements with class 'readability-styled'\r
-       * to text nodes - which is what they were before.\r
-       *\r
-       * @param DOMElement\r
-       * @return void\r
-       */\r
-       function revertReadabilityStyledElements($articleContent) {\r
-               $xpath = new DOMXPath($articleContent->ownerDocument);\r
-               $elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent);\r
-               //$elems = $articleContent->getElementsByTagName('p');\r
-               for ($i = $elems->length-1; $i >= 0; $i--) {\r
-                       $e = $elems->item($i);\r
-                       $e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);\r
-                       //if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') {\r
-                       //      $e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e);\r
-                       //}\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Prepare the article node for display. Clean out any inline styles,\r
-       * iframes, forms, strip extraneous <p> tags, etc.\r
-       *\r
-       * @param DOMElement\r
-       * @return void\r
-       */\r
-       function prepArticle($articleContent) {\r
-               $this->cleanStyles($articleContent);\r
-               $this->killBreaks($articleContent);\r
-               if ($this->revertForcedParagraphElements) {\r
-                       $this->revertReadabilityStyledElements($articleContent);\r
-               }\r
-\r
-               /* Clean out junk from the article content */\r
-               $this->cleanConditionally($articleContent, 'form');\r
-               $this->clean($articleContent, 'object');\r
-               $this->clean($articleContent, 'h1');\r
-\r
-               /**\r
-               * If there is only one h2, they are probably using it\r
-               * as a header and not a subheader, so remove it since we already have a header.\r
-               ***/\r
-               if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {\r
-                       $this->clean($articleContent, 'h2'); \r
-               }\r
-               $this->clean($articleContent, 'iframe');\r
-\r
-               $this->cleanHeaders($articleContent);\r
-\r
-               /* Do these last as the previous stuff may have removed junk that will affect these */\r
-               $this->cleanConditionally($articleContent, 'table');\r
-               $this->cleanConditionally($articleContent, 'ul');\r
-               $this->cleanConditionally($articleContent, 'div');\r
-\r
-               /* Remove extra paragraphs */\r
-               $articleParagraphs = $articleContent->getElementsByTagName('p');\r
-               for ($i = $articleParagraphs->length-1; $i >= 0; $i--)\r
-               {\r
-                       $imgCount    = $articleParagraphs->item($i)->getElementsByTagName('img')->length;\r
-                       $embedCount  = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;\r
-                       $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;\r
-                       $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;\r
-                       \r
-                       if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')\r
-                       {\r
-                               $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));\r
-                       }\r
-               }\r
-\r
-               try {\r
-                       $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);\r
-                       //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');      \r
-               }\r
-               catch (Exception $e) {\r
-                       $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Give $node a "readability" attribute holding its starting content score.\r
-       * The score begins with a per-tag base value and is then adjusted by the\r
-       * class/id weight returned by getClassWeight().\r
-       *\r
-       * @param DOMElement $node element to initialise\r
-       * @return void\r
-       **/\r
-       protected function initializeNode($node) {\r
-               // Base score by upper-cased tag name; any tag not listed starts at 0.\r
-               $baseScores = array(\r
-                       'DIV'        => 5,\r
-                       'PRE'        => 3,\r
-                       'TD'         => 3,\r
-                       'BLOCKQUOTE' => 3,\r
-                       'ADDRESS'    => -3,\r
-                       'OL'         => -3,\r
-                       'UL'         => -3,\r
-                       'DL'         => -3,\r
-                       'DD'         => -3,\r
-                       'DT'         => -3,\r
-                       'LI'         => -3,\r
-                       'FORM'       => -3,\r
-                       'H1'         => -5,\r
-                       'H2'         => -5,\r
-                       'H3'         => -5,\r
-                       'H4'         => -5,\r
-                       'H5'         => -5,\r
-                       'H6'         => -5,\r
-                       'TH'         => -5,\r
-               );\r
-               $upperTag = strtoupper($node->tagName);\r
-\r
-               $scoreAttr = $this->dom->createAttribute('readability');\r
-               $scoreAttr->value = isset($baseScores[$upperTag]) ? $baseScores[$upperTag] : 0;\r
-               $node->setAttributeNode($scoreAttr);\r
-\r
-               // Class and id names can push the score further up or down.\r
-               $scoreAttr->value += $this->getClassWeight($node);\r
-       }\r
-       \r
-       /***\r
-       * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is\r
-       *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.\r
-       *\r
-       * Steps, in order:\r
-       *   1. Walk every element: optionally drop "unlikely" nodes, collect P/TD/PRE nodes for scoring, and turn\r
-       *      divs without block-level children into p's (other divs get their text nodes wrapped in p's).\r
-       *   2. Score each collected node and credit the score to its parent (in full) and grandparent (half).\r
-       *   3. Scale every candidate by its link density and keep the highest-scoring one; fall back to a new div\r
-       *      holding the page content when there is no candidate or the candidate is BODY.\r
-       *   4. Move the top candidate and related siblings into a new div with id "readability-content" and\r
-       *      hand it to prepArticle().\r
-       *   5. If the result's text is shorter than 250 bytes (strlen), restore the body from $this->bodyCache and\r
-       *      call grabArticle() again with one more flag removed.\r
-       *\r
-       * @param DOMDocument|DOMElement|null $page node to search; $this->dom is used when this is empty\r
-       * @return DOMElement|false the div holding the article, or false when the text is still too short and no flag is left to remove\r
-       **/\r
-       protected function grabArticle($page=null) {\r
-               $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);\r
-               if (!$page) $page = $this->dom;\r
-               $allElements = $page->getElementsByTagName('*');\r
-               /**\r
-               * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs\r
-               * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)\r
-               *\r
-               * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5\r
-               * TODO: Shouldn't this be a reverse traversal?\r
-               *\r
-               * The loop ends when item() returns nothing for the current index. After a node is removed or replaced,\r
-               * $nodeIndex is decremented so that the next iteration reads the same position again.\r
-               **/\r
-               $node = null;\r
-               $nodesToScore = array();\r
-               for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {\r
-               //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {\r
-                       //$node = $targetList->item($nodeIndex);\r
-                       $tagName = strtoupper($node->tagName);\r
-                       /* Remove unlikely candidates: class+id matches 'unlikelyCandidates', does not match 'okMaybeItsACandidate', and the tag is not BODY */\r
-                       if ($stripUnlikelyCandidates) {\r
-                               $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');\r
-                               if (\r
-                                       preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&\r
-                                       !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&\r
-                                       $tagName != 'BODY'\r
-                               )\r
-                               {\r
-                                       $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);\r
-                                       //$nodesToRemove[] = $node;\r
-                                       $node->parentNode->removeChild($node);\r
-                                       $nodeIndex--;\r
-                                       continue;\r
-                               }               \r
-                       }\r
-\r
-                       if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {\r
-                               $nodesToScore[] = $node;\r
-                       }\r
-\r
-                       /* Turn all divs that don't have children block level elements into p's ('divToPElements' is matched against the div's innerHTML) */\r
-                       if ($tagName == 'DIV') {\r
-                               if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {\r
-                                       //$this->dbg('Altering div to p');\r
-                                       $newNode = $this->dom->createElement('p');\r
-                                       try {\r
-                                               $newNode->innerHTML = $node->innerHTML;\r
-                                               //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);\r
-                                               $node->parentNode->replaceChild($newNode, $node);\r
-                                               $nodeIndex--;\r
-                                               $nodesToScore[] = $node; // or $newNode? NOTE(review): $node has just been replaced in the tree - confirm whether it still has a parentNode when it is scored below\r
-                                       }\r
-                                       catch(Exception $e) {\r
-                                               $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);\r
-                                       }\r
-                               }\r
-                               else\r
-                               {\r
-                                       /* EXPERIMENTAL: wrap each direct text-node child of the div in an inline-styled p with class "readability-styled" */\r
-                                       // TODO: change these p elements back to text nodes after processing\r
-                                       for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {\r
-                                               $childNode = $node->childNodes->item($i);\r
-                                               if ($childNode->nodeType == 3) { // XML_TEXT_NODE\r
-                                                       //$this->dbg('replacing text node with a p tag with the same content.');\r
-                                                       $p = $this->dom->createElement('p');\r
-                                                       $p->innerHTML = $childNode->nodeValue;\r
-                                                       $p->setAttribute('style', 'display: inline;');\r
-                                                       $p->setAttribute('class', 'readability-styled');\r
-                                                       $childNode->parentNode->replaceChild($p, $childNode);\r
-                                               }\r
-                                       }\r
-                               }\r
-                       }\r
-               }\r
-               \r
-               /**\r
-               * Loop through all paragraphs, and assign a score to them based on how content-y they look.\r
-               * Then add their score to their parent node.\r
-               *\r
-               * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.\r
-               *\r
-               * Nodes without an element parent, or with less than 25 bytes of text, are skipped. Parents and\r
-               * grandparents are initialised via initializeNode() the first time they are seen and are recorded\r
-               * in $candidates.\r
-               **/\r
-               $candidates = array();\r
-               for ($pt=0; $pt < count($nodesToScore); $pt++) {\r
-                       $parentNode      = $nodesToScore[$pt]->parentNode;\r
-                       // $grandParentNode = $parentNode ? $parentNode->parentNode : null;\r
-                       $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);\r
-                       $innerText       = $this->getInnerText($nodesToScore[$pt]);\r
-\r
-                       if (!$parentNode || !isset($parentNode->tagName)) {\r
-                               continue;\r
-                       }\r
-\r
-                       /* If this paragraph is less than 25 characters, don't even count it. (strlen, so the unit is bytes) */\r
-                       if(strlen($innerText) < 25) {\r
-                               continue;\r
-                       }\r
-\r
-                       /* Initialize readability data for the parent. */\r
-                       if (!$parentNode->hasAttribute('readability')) \r
-                       {\r
-                               $this->initializeNode($parentNode);\r
-                               $candidates[] = $parentNode;\r
-                       }\r
-\r
-                       /* Initialize readability data for the grandparent. */\r
-                       if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))\r
-                       {\r
-                               $this->initializeNode($grandParentNode);\r
-                               $candidates[] = $grandParentNode;\r
-                       }\r
-\r
-                       $contentScore = 0;\r
-\r
-                       /* Add a point for the paragraph itself as a base. */\r
-                       $contentScore++;\r
-\r
-                       /* Add points for any commas within this paragraph (count(explode()) gives the number of commas plus one) */\r
-                       $contentScore += count(explode(',', $innerText));\r
-                       \r
-                       /* For every 100 characters in this paragraph, add another point. Up to 3 points. */\r
-                       $contentScore += min(floor(strlen($innerText) / 100), 3);\r
-                       \r
-                       /* Add the score to the parent. The grandparent gets half. */\r
-                       $parentNode->getAttributeNode('readability')->value += $contentScore;\r
-\r
-                       if ($grandParentNode) {\r
-                               $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;             \r
-                       }\r
-               }\r
-\r
-               /**\r
-               * After we've calculated scores, loop through all of the possible candidate nodes we found\r
-               * and find the one with the highest score.\r
-               **/\r
-               $topCandidate = null;\r
-               for ($c=0, $cl=count($candidates); $c < $cl; $c++)\r
-               {\r
-                       /**\r
-                       * Scale the final candidates score based on link density. Good content should have a\r
-                       * relatively small link density (5% or less) and be mostly unaffected by this operation.\r
-                       *\r
-                       * The scaled score is written back to the attribute. When comparing, the current top\r
-                       * candidate's score is cast to int, so any fractional part of it is ignored.\r
-                       **/\r
-                       $readability = $candidates[$c]->getAttributeNode('readability');\r
-                       $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));\r
-\r
-                       $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);\r
-\r
-                       if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {\r
-                               $topCandidate = $candidates[$c];\r
-                       }\r
-               }\r
-\r
-               /**\r
-               * If we still have no top candidate, just use the body as a last resort.\r
-               * We also have to copy the body node so it is something we can modify.\r
-               *\r
-               * The page's markup is moved into a newly created div, which becomes the page's only child and the\r
-               * top candidate. For a DOMDocument the documentElement is used; if it is missing the div stays empty\r
-               * and unattached.\r
-               **/\r
-               if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')\r
-               {\r
-                       $topCandidate = $this->dom->createElement('div');\r
-                       if ($page instanceof DOMDocument) {\r
-                               if (!isset($page->documentElement)) {\r
-                                       // we don't have a body either? what a mess! :)\r
-                               } else {\r
-                                       $topCandidate->innerHTML = $page->documentElement->innerHTML;\r
-                                       $page->documentElement->innerHTML = '';\r
-                                       $page->documentElement->appendChild($topCandidate);\r
-                               }\r
-                       } else {\r
-                               $topCandidate->innerHTML = $page->innerHTML;\r
-                               $page->innerHTML = '';\r
-                               $page->appendChild($topCandidate);\r
-                       }\r
-                       $this->initializeNode($topCandidate);\r
-               }\r
-\r
-               /**\r
-               * Now that we have the top candidate, look through its siblings for content that might also be related.\r
-               * Things like preambles, content split by ads that we removed, etc.\r
-               *\r
-               * A sibling is appended when it is the top candidate itself, when its score (plus a bonus for sharing\r
-               * the top candidate's non-empty class) reaches the threshold of max(10, 20% of the top score), or when\r
-               * it is a P that passes the length / link-density checks below.\r
-               * If the sibling list is not available, a stand-in object with length 0 is used so the loop is skipped.\r
-               **/\r
-               $articleContent        = $this->dom->createElement('div');\r
-               $articleContent->setAttribute('id', 'readability-content');\r
-               $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);\r
-               $siblingNodes          = $topCandidate->parentNode->childNodes;\r
-               if (!isset($siblingNodes)) {\r
-                       $siblingNodes = new stdClass;\r
-                       $siblingNodes->length = 0;\r
-               }\r
-\r
-               for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)\r
-               {\r
-                       $siblingNode = $siblingNodes->item($s);\r
-                       $append      = false;\r
-\r
-                       $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));\r
-\r
-                       //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));\r
-\r
-                       if ($siblingNode === $topCandidate)\r
-                       // or if ($siblingNode->isSameNode($topCandidate))\r
-                       {\r
-                               $append = true;\r
-                       }\r
-\r
-                       $contentBonus = 0;\r
-                       /* Give a bonus if sibling nodes and top candidates have the exact same classname (the bonus is 20% of the top candidate's score) */\r
-                       if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {\r
-                               $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;\r
-                       }\r
-\r
-                       if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)\r
-                       {\r
-                               $append = true;\r
-                       }\r
-                       \r
-                       if (strtoupper($siblingNode->nodeName) == 'P') {\r
-                               $linkDensity = $this->getLinkDensity($siblingNode);\r
-                               $nodeContent = $this->getInnerText($siblingNode);\r
-                               $nodeLength  = strlen($nodeContent);\r
-                               \r
-                               if ($nodeLength > 80 && $linkDensity < 0.25)\r
-                               {\r
-                                       $append = true;\r
-                               }\r
-                               else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))\r
-                               {\r
-                                       $append = true;\r
-                               }\r
-                       }\r
-\r
-                       if ($append)\r
-                       {\r
-                               $this->dbg('Appending node: ' . $siblingNode->nodeName);\r
-\r
-                               $nodeToAppend = null;\r
-                               $sibNodeName = strtoupper($siblingNode->nodeName);\r
-                               if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {\r
-                                       /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */\r
-                                       \r
-                                       $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');\r
-                                       $nodeToAppend = $this->dom->createElement('div');\r
-                                       try {\r
-                                               $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));\r
-                                               $nodeToAppend->innerHTML = $siblingNode->innerHTML;\r
-                                       }\r
-                                       catch(Exception $e)\r
-                                       {\r
-                                               $this->dbg('Could not alter siblingNode to div, reverting back to original.');\r
-                                               $nodeToAppend = $siblingNode;\r
-                                               $s--;\r
-                                               $sl--;\r
-                                       }\r
-                               } else {\r
-                                       $nodeToAppend = $siblingNode;\r
-                                       $s--;\r
-                                       $sl--;\r
-                               }\r
-                               \r
-                               /* To ensure a node does not interfere with readability styles, remove its classnames */\r
-                               $nodeToAppend->removeAttribute('class');\r
-\r
-                               /* Append sibling and subtract from our list because it removes the node when you append to another node (hence the $s-- / $sl-- above whenever the sibling itself is moved) */\r
-                               $articleContent->appendChild($nodeToAppend);\r
-                       }\r
-               }\r
-\r
-               /**\r
-               * So we have all of the content that we need. Now we clean it up for presentation.\r
-               **/\r
-               $this->prepArticle($articleContent);\r
-\r
-               /**\r
-               * Now that we've gone through the full algorithm, check to see if we got any meaningful content.\r
-               * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher\r
-               * likelihood of finding the content, and the sieve approach gives us a higher likelihood of\r
-               * finding the -right- content.\r
-               *\r
-               * Flags are removed one per retry, in this order: FLAG_STRIP_UNLIKELYS, FLAG_WEIGHT_CLASSES,\r
-               * FLAG_CLEAN_CONDITIONALLY. Each retry runs on $this->body after its markup has been restored from\r
-               * $this->bodyCache. When none of the three flags is active any more, false is returned.\r
-               **/\r
-               if (strlen($this->getInnerText($articleContent, false)) < 250)\r
-               {\r
-                       // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7\r
-                       // in the meantime, we check and create an empty element if it's not there.\r
-                       if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');\r
-                       $this->body->innerHTML = $this->bodyCache;\r
-                       \r
-                       if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {\r
-                               $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);\r
-                               return $this->grabArticle($this->body);\r
-                       }\r
-                       else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {\r
-                               $this->removeFlag(self::FLAG_WEIGHT_CLASSES);\r
-                               return $this->grabArticle($this->body);              \r
-                       }\r
-                       else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {\r
-                               $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);\r
-                               return $this->grabArticle($this->body);\r
-                       }\r
-                       else {\r
-                               return false;\r
-                       }\r
-               }\r
-               return $articleContent;\r
-       }\r
-       \r
-       /**\r
-       * Strip every <script> element found under $doc.\r
-       *\r
-       * The list is walked from its last entry to its first, so removing an\r
-       * entry never shifts the positions that are still to be visited.\r
-       *\r
-       * @param DOMElement|DOMDocument $doc node whose script descendants are removed\r
-       * @return void\r
-       */\r
-       public function removeScripts($doc) {\r
-               $scriptNodes = $doc->getElementsByTagName('script');\r
-               $position    = $scriptNodes->length;\r
-               while ($position-- > 0) {\r
-                       $script = $scriptNodes->item($position);\r
-                       $script->parentNode->removeChild($script);\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Return the trimmed text content of a node.\r
-       * An empty string is returned when the node has no textContent or it is empty.\r
-       *\r
-       * @param DOMElement $e node to read\r
-       * @param boolean $normalizeSpaces when true (default), every match of the 'normalize' regexp is replaced by a single space\r
-       * @return string\r
-       **/\r
-       public function getInnerText($e, $normalizeSpaces=true) {\r
-               if (!isset($e->textContent) || $e->textContent == '') {\r
-                       return '';\r
-               }\r
-\r
-               $trimmed = trim($e->textContent);\r
-\r
-               return $normalizeSpaces\r
-                       ? preg_replace($this->regexps['normalize'], ' ', $trimmed)\r
-                       : $trimmed;\r
-       }\r
-\r
-       /**\r
-       * Count how often the string $s occurs in the inner text of $e.\r
-       *\r
-       * @param DOMElement $e node whose text (as returned by getInnerText()) is searched\r
-       * @param string $s what to count. Default is ","\r
-       * @return number (integer)\r
-       **/\r
-       public function getCharCount($e, $s=',') {\r
-               $text = $this->getInnerText($e);\r
-\r
-               return substr_count($text, $s);\r
-       }\r
-\r
-       /**\r
-       * Remove the style attribute from $e itself and from every element under it.\r
-       *\r
-       * Anything that is not an object is ignored. The root's own attribute is only\r
-       * removed when $e is a DOMElement; other objects (e.g. a DOMDocument) are just\r
-       * searched for descendants.\r
-       *\r
-       * @param DOMElement|DOMDocument $e\r
-       * @return void\r
-       */\r
-       public function cleanStyles($e) {\r
-               if (!is_object($e)) return;\r
-\r
-               // getElementsByTagName('*') yields descendants only, so the root is handled separately.\r
-               if ($e instanceof DOMElement) {\r
-                       $e->removeAttribute('style');\r
-               }\r
-\r
-               foreach ($e->getElementsByTagName('*') as $elem) {\r
-                       $elem->removeAttribute('style');\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Work out which share of a node's text sits inside links:\r
-       * the summed text length of all <a> descendants divided by the text length of the node.\r
-       * Returns 0 when the node has no text.\r
-       *\r
-       * @param DOMElement $e\r
-       * @return number (float)\r
-       */\r
-       public function getLinkDensity($e) {\r
-               $totalLength = strlen($this->getInnerText($e));\r
-               if ($totalLength <= 0) {\r
-                       // no text at all, so avoid dividing by zero\r
-                       return 0;\r
-               }\r
-\r
-               $anchorLength = 0;\r
-               foreach ($e->getElementsByTagName('a') as $anchor) {\r
-                       $anchorLength += strlen($this->getInnerText($anchor));\r
-               }\r
-\r
-               return $anchorLength / $totalLength;\r
-       }\r
-       \r
-       /**\r
-       * Rate an element by its class and id names.\r
-       * Each of the two attributes, when present and non-empty, costs 25 points if it\r
-       * matches the 'negative' regexp and earns 25 points if it matches 'positive'.\r
-       * Always 0 while FLAG_WEIGHT_CLASSES is not active.\r
-       *\r
-       * @param DOMElement $e\r
-       * @return number (Integer)\r
-       */\r
-       public function getClassWeight($e) {\r
-               if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {\r
-                       return 0;\r
-               }\r
-\r
-               $score = 0;\r
-\r
-               /* The class name and the ID are judged by exactly the same rules */\r
-               foreach (array('class', 'id') as $attrName) {\r
-                       if (!$e->hasAttribute($attrName)) {\r
-                               continue;\r
-                       }\r
-                       $attrValue = $e->getAttribute($attrName);\r
-                       if ($attrValue == '') {\r
-                               continue;\r
-                       }\r
-\r
-                       if (preg_match($this->regexps['negative'], $attrValue)) {\r
-                               $score -= 25;\r
-                       }\r
-                       if (preg_match($this->regexps['positive'], $attrValue)) {\r
-                               $score += 25;\r
-                       }\r
-               }\r
-\r
-               return $score;\r
-       }\r
-\r
-       /**\r
-       * Collapse extraneous break tags inside a node: every match of the\r
-       * 'killBreaks' regexp in the node's innerHTML is replaced by one '<br />'.\r
-       *\r
-       * @param DOMElement $node\r
-       * @return void\r
-       */\r
-       public function killBreaks($node) {\r
-               $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);\r
-       }\r
-\r
-       /**\r
-       * Remove all elements of type "tag" from a node.\r
-       * For iframe/object/embed, an element is kept when one of its attribute values\r
-       * or its inner HTML matches the 'video' regexp (youtube/vimeo and the like).\r
-       *\r
-       * @param DOMElement $e\r
-       * @param string $tag\r
-       * @return void\r
-       */\r
-       public function clean($e, $tag) {\r
-               $matches    = $e->getElementsByTagName($tag);\r
-               $keepVideos = in_array($tag, array('iframe', 'object', 'embed'));\r
-\r
-               /* Walk backwards so that removals do not disturb the entries still to be visited */\r
-               for ($pos = $matches->length - 1; $pos >= 0; $pos--) {\r
-                       $candidate = $matches->item($pos);\r
-\r
-                       if ($keepVideos) {\r
-                               /* Join all attribute values so that one regexp test covers them */\r
-                               $joinedValues = '';\r
-                               foreach ($candidate->attributes as $attribute) {\r
-                                       $joinedValues .= $attribute->value . '|';\r
-                               }\r
-\r
-                               /* Attributes are checked first, then the markup inside the element */\r
-                               if (preg_match($this->regexps['video'], $joinedValues)\r
-                                       || preg_match($this->regexps['video'], $candidate->innerHTML)) {\r
-                                       continue;\r
-                               }\r
-                       }\r
-\r
-                       $candidate->parentNode->removeChild($candidate);\r
-               }\r
-       }\r
-       \r
-       /**\r
-       * Clean an element of all tags of type "tag" if they look fishy.\r
-       * "Fishy" is an algorithm based on content length, classnames, \r
-       * link density, number of images & embeds, etc.\r
-       *\r
-       * @param DOMElement $e\r
-       * @param string $tag\r
-       * @return void\r
-       */\r
-       public function cleanConditionally($e, $tag) {\r
-               if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {\r
-                       return;\r
-               }\r
-\r
-               $tagsList = $e->getElementsByTagName($tag);\r
-               $curTagsLength = $tagsList->length;\r
-\r
-               /**\r
-               * Gather counts for other typical elements embedded within.\r
-               * Traverse backwards so we can remove nodes at the same time without effecting the traversal.\r
-               *\r
-               * TODO: Consider taking into account original contentScore here.\r
-               */\r
-               for ($i=$curTagsLength-1; $i >= 0; $i--) {\r
-                       $weight = $this->getClassWeight($tagsList->item($i));\r
-                       $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;\r
-                       \r
-                       $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));\r
-\r
-                       if ($weight + $contentScore < 0) {\r
-                               $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));\r
-                       }\r
-                       else if ( $this->getCharCount($tagsList->item($i), ',') < 10) {\r
-                               /**\r
-                               * If there are not very many commas, and the number of\r
-                               * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.\r
-                               **/\r
-                               $p      = $tagsList->item($i)->getElementsByTagName('p')->length;\r
-                               $img    = $tagsList->item($i)->getElementsByTagName('img')->length;\r
-                               $li     = $tagsList->item($i)->getElementsByTagName('li')->length-100;\r
-                               $input  = $tagsList->item($i)->getElementsByTagName('input')->length;\r
-                               $a              = $tagsList->item($i)->getElementsByTagName('a')->length;\r
-\r
-                               $embedCount = 0;\r
-                               $embeds = $tagsList->item($i)->getElementsByTagName('embed');\r
-                               for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {\r
-                                       if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {\r
-                                               $embedCount++; \r
-                                       }\r
-                               }\r
-                               $embeds = $tagsList->item($i)->getElementsByTagName('iframe');\r
-                               for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {\r
-                                       if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {\r
-                                               $embedCount++; \r
-                                       }\r
-                               }\r
-\r
-                               $linkDensity   = $this->getLinkDensity($tagsList->item($i));\r
-                               $contentLength = strlen($this->getInnerText($tagsList->item($i)));\r
-                               $toRemove      = false;\r
-\r
-                               if ($this->lightClean) {\r
-                                       $this->dbg('Light clean...');\r
-                                       if ( ($img > $p) && ($img > 4) ) {\r
-                                               $this->dbg(' more than 4 images and more image elements than paragraph elements');\r
-                                               $toRemove = true;\r
-                                       } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {\r
-                                               $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');\r
-                                               $toRemove = true;\r
-                                       } else if ( $input > floor($p/3) ) {\r
-                                               $this->dbg(' too many <input> elements');\r
-                                               $toRemove = true; \r
-                                       } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {\r
-                                               $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');\r
-                                               $toRemove = true;\r
-                                       } else if($weight < 25 && $linkDensity > 0.2) {\r
-                                               $this->dbg(' weight smaller than 25 and link density above 0.2');\r
-                                               $toRemove = true;\r
-                                       } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {\r
-                                               $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');\r
-                                               $toRemove = true;\r
-                                       } else if($embedCount > 3) {\r
-                                               $this->dbg(' more than 3 embeds');\r
-                                               $toRemove = true;\r
-                                       }\r
-                               } else {\r
-                                       $this->dbg('Standard clean...');\r
-                                       if ( $img > $p ) {\r
-                                               $this->dbg(' more image elements than paragraph elements');\r
-                                               $toRemove = true;\r
-                                       } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {\r
-                                               $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');\r
-                                               $toRemove = true;\r
-                                       } else if ( $input > floor($p/3) ) {\r
-                                               $this->dbg(' too many <input> elements');\r
-                                               $toRemove = true; \r
-                                       } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {\r
-                                               $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');\r
-                                               $toRemove = true;\r
-                                       } else if($weight < 25 && $linkDensity > 0.2) {\r
-                                               $this->dbg(' weight smaller than 25 and link density above 0.2');\r
-                                               $toRemove = true;\r
-                                       } else if($weight >= 25 && $linkDensity > 0.5) {\r
-                                               $this->dbg(' weight above 25 but link density greater than 0.5');\r
-                                               $toRemove = true;\r
-                                       } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {\r
-                                               $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');\r
-                                               $toRemove = true;\r
-                                       }\r
-                               }\r
-\r
-                               if ($toRemove) {\r
-                                       //$this->dbg('Removing: '.$tagsList->item($i)->innerHTML);\r
-                                       $tagsList->item($i)->parentNode->removeChild($tagsList->item($i));\r
-                               }\r
-                       }\r
-               }\r
-       }\r
-\r
-       /**\r
-       * Clean out spurious headers from an Element. Checks things like classnames and link density.\r
-       *\r
-       * @param DOMElement $e\r
-       * @return void\r
-       */\r
-       public function cleanHeaders($e) {\r
-               for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {\r
-                       $headers = $e->getElementsByTagName('h' . $headerIndex);\r
-                       for ($i=$headers->length-1; $i >=0; $i--) {\r
-                               if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {\r
-                                       $headers->item($i)->parentNode->removeChild($headers->item($i));\r
-                               }\r
-                       }\r
-               }\r
-       }\r
-\r
-       public function flagIsActive($flag) {\r
-               return ($this->flags & $flag) > 0;\r
-       }\r
-       \r
-       public function addFlag($flag) {\r
-               $this->flags = $this->flags | $flag;\r
-       }\r
-       \r
-       public function removeFlag($flag) {\r
-               $this->flags = $this->flags & ~$flag;\r
-       }\r
-}\r
+<?php
+/** 
+* Arc90's Readability ported to PHP for FiveFilters.org
+* Based on readability.js version 1.7.1 (without multi-page support)
+* Updated to allow HTML5 parsing with html5lib
+* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
+* ------------------------------------------------------
+* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
+* Arc90's project URL: http://lab.arc90.com/experiments/readability/
+* JS Source: http://code.google.com/p/arc90labs-readability
+* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
+* More information: http://fivefilters.org/content-only/
+* License: Apache License, Version 2.0
+* Requires: PHP5
+* Date: 2012-09-19
+* 
+* Differences between the PHP port and the original
+* ------------------------------------------------------
+* Arc90's Readability is designed to run in the browser. It works on the DOM 
+* tree (the parsed HTML) after the page's CSS styles have been applied and 
+* Javascript code executed. This PHP port does not run inside a browser. 
+* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 
+* rely on CSS or Javascript support. As such, the results will not always 
+* match Arc90's Readability. (For example, if a web page contains CSS style 
+* rules or Javascript code which hide certain HTML elements from display, 
+* Arc90's Readability will dismiss those from consideration but our PHP port, 
+* unable to understand CSS or Javascript, will not know any better.)
+* 
+* Another significant difference is that the aim of Arc90's Readability is 
+* to re-present the main content block of a given web page so users can 
+* read it more easily in their browsers. Correct identification, clean up, 
+* and separation of the content block is only a part of this process. 
+* This PHP port is only concerned with this part, it does not include code 
+* that relates to presentation in the browser - Arc90 already do 
+* that extremely well, and for PDF output there's FiveFilters.org's 
+* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
+* 
+* Finally, this class contains methods that might be useful for developers 
+* working on HTML document fragments. So without deviating too much from 
+* the original code (which I don't want to do because it makes debugging 
+* and updating more difficult), I've tried to make it a little more 
+* developer friendly. You should be able to use the methods here on 
+* existing DOMElement objects without passing an entire HTML document to 
+* be parsed.
+*/
+
+// This class allows us to do JavaScript-like assignments to innerHTML
+require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
+
+// Alternative usage (for testing only!)
+// uncomment the lines below and call Readability.php in your browser 
+// passing it the URL of the page you'd like content from, e.g.:
+// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
+
+/*
+if (!isset($_GET['url']) || $_GET['url'] == '') {
+       die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
+}
+$url = $_GET['url'];
+if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
+$html = file_get_contents($url);
+$r = new Readability($html, $url);
+$r->init();
+echo $r->articleContent->innerHTML;
+*/
+
+class Readability
+{
+       public $version = '1.7.1-without-multi-page';
+       public $convertLinksToFootnotes = false;
+       public $revertForcedParagraphElements = true;
+       public $articleTitle;
+       public $articleContent;
+       public $dom;
+       public $url = null; // optional - URL where HTML was retrieved
+       public $debug = false;
+       public $lightClean = true; // preserves more content (experimental) added 2012-09-19
+       protected $body = null; // 
+       protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
+       protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
+       protected $success = false; // indicates whether we were able to extract or not
+       
+       /**
+       * All of the regular expressions in use within readability.
+       * Defined up here so we don't instantiate them repeatedly in loops.
+       **/
+       public $regexps = array(
+               'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
+               'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+               'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+               'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+               'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
+               'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
+               'replaceFonts' => '/<(\/?)font[^>]*>/i',
+               // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
+               'normalize' => '/\s{2,}/',
+               'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
+               'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
+               'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+       );      
+       
+       /* constants */
+       const FLAG_STRIP_UNLIKELYS = 1;
+       const FLAG_WEIGHT_CLASSES = 2;
+       const FLAG_CLEAN_CONDITIONALLY = 4;
+       
+       /**
+       * Create instance of Readability
+       * @param string UTF-8 encoded string
+       * @param string (optional) URL associated with HTML (used for footnotes)
+       * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
+       */      
+       public function __construct($html, $url=null, $parser='libxml')
+       {
+               $this->url = $url;
+               /*
+               * Turn all double br's into p's and font tags into spans. This happens on the raw
+               * markup because it has not been parsed into a DOM tree yet.
+               * preg_replace() returns null when PCRE fails; in that case keep the markup as it
+               * was instead of silently continuing with an empty document.
+               */
+               foreach (array('replaceBrs' => '</p><p>', 'replaceFonts' => '<$1span>') as $regexKey => $replacement) {
+                       $rewritten = preg_replace($this->regexps[$regexKey], $replacement, $html);
+                       if ($rewritten !== null) {
+                               $html = $rewritten;
+                       }
+               }
+               // Input is expected to be UTF-8 (see the constructor docblock); non-ASCII characters become entities.
+               $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
+               if (trim($html) == '') $html = '<html></html>';
+               // Use html5lib only when requested and when it yields a document; otherwise fall back to libxml.
+               if ($parser != 'html5lib' || !($this->dom = HTML5_Parser::parse($html))) {
+                       $this->dom = new DOMDocument();
+                       $this->dom->preserveWhiteSpace = false;
+                       // @ keeps libxml's warnings about malformed markup out of the output.
+                       @$this->dom->loadHTML($html);
+               }
+               // Elements created/returned from now on are JSLikeHTMLElement instances (innerHTML support).
+               $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+       }
+
+       /**
+       * Get article title element
+       * @return DOMElement
+       */
+       public function getTitle()
+       {
+               // Assigned by init(); nothing is stored here until init() has run.
+               return $this->articleTitle;
+       }
+       
+       /**
+       * Get article content element
+       * @return DOMElement
+       */
+       public function getContent()
+       {
+               // Assigned by init(); nothing is stored here until init() has run.
+               return $this->articleContent;
+       }
+       
+       /**
+       * Runs readability.
+       * 
+       * Workflow:
+       *  1. Prep the document by removing script tags, css, etc.
+       *  2. Build readability's DOM tree.
+       *  3. Grab the article content from the current dom tree.
+       *  4. Replace the current DOM tree with the new one.
+       *  5. Read peacefully.
+       *
+       * @return boolean true if we found content, false otherwise
+       **/
+       public function init()
+       {
+               // Nothing to work with if parsing produced no root element.
+               if (!isset($this->dom->documentElement)) {
+                       return false;
+               }
+               $this->removeScripts($this->dom);
+
+               // Start optimistic; flipped to false below when no article is found.
+               $this->success = true;
+
+               // Remember the first <body> and its original markup, unless already recorded.
+               $bodyList = $this->dom->getElementsByTagName('body');
+               if ($bodyList->length > 0) {
+                       $firstBody = $bodyList->item(0);
+                       if ($this->bodyCache == null) {
+                               $this->bodyCache = $firstBody->innerHTML;
+                       }
+                       if ($this->body == null) {
+                               $this->body = $firstBody;
+                       }
+               }
+
+               // Creates a <body> if none was found and strips <style> elements.
+               $this->prepDocument();
+
+               // Extract the title first, then the content.
+               $title   = $this->getArticleTitle();
+               $content = $this->grabArticle();
+               if (!$content) {
+                       // Extraction failed: report it and substitute a placeholder message.
+                       $this->success = false;
+                       $content = $this->dom->createElement('div');
+                       $content->setAttribute('id', 'readability-content');
+                       $content->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
+               }
+
+               // Build readability's wrapper: div#readOverlay > div#readInner > (title, content).
+               $inner = $this->dom->createElement('div');
+               $inner->setAttribute('id', 'readInner');
+               $inner->appendChild($title);
+               $inner->appendChild($content);
+
+               $wrapper = $this->dom->createElement('div');
+               $wrapper->setAttribute('id', 'readOverlay');
+               $wrapper->appendChild($inner);
+
+               // Clear the old HTML, insert the new content.
+               $this->body->innerHTML = '';
+               $this->body->appendChild($wrapper);
+               $this->body->removeAttribute('style');
+
+               $this->postProcessContent($content);
+
+               // Expose the results through getTitle() / getContent().
+               $this->articleTitle = $title;
+               $this->articleContent = $content;
+
+               return $this->success;
+       }
+       
+       /**
+       * Debug
+       */
+       protected function dbg($msg)
+       {
+               // Silent unless the public $debug switch is on.
+               if (!$this->debug) {
+                       return;
+               }
+               echo '* ', $msg, "\n";
+       }
+       
+       /**
+       * Run any post-process modifications to article content as necessary.
+       *
+       * @param DOMElement
+       * @return void
+       */
+       public function postProcessContent($articleContent) {
+               // Footnote conversion is opt-in via $convertLinksToFootnotes.
+               if (!$this->convertLinksToFootnotes) {
+                       return;
+               }
+               // $url is optional and may be null. Cast explicitly so preg_match() always receives a
+               // string, instead of masking any resulting diagnostics with the @ operator.
+               if (preg_match('/wikipedia\.org/', (string) $this->url)) {
+                       // Pages whose URL mentions wikipedia.org are left without footnotes.
+                       return;
+               }
+               $this->addFootnotes($articleContent);
+       }
+       
+       /**
+       * Get the article title as an H1.
+       *
+       * @return DOMElement
+       */
+       protected function getArticleTitle() {
+               $curTitle = '';
+               $origTitle = '';
+
+               // Best effort: start from the text of the first <title> element.
+               try {
+                       $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+               } catch(Exception $e) {
+                       // Deliberately ignored: both titles stay empty and the fallbacks below apply.
+               }
+
+               if (preg_match('/ [\|\-] /', $curTitle))
+               {
+                       // "Headline | Site" or "Headline - Site": keep what precedes the last separator...
+                       $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
+
+                       // ...unless that leaves fewer than 3 words; then take what follows the first separator.
+                       if (count(explode(' ', $curTitle)) < 3) {
+                               $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+                       }
+               }
+               else if (strpos($curTitle, ': ') !== false)
+               {
+                       // "Site: Headline": keep what follows the last colon...
+                       $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
+
+                       // ...unless that leaves fewer than 3 words; then take everything after the first colon.
+                       if (count(explode(' ', $curTitle)) < 3) {
+                               $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
+                       }
+               }
+               else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
+               {
+                       // Suspiciously long or short title: use the page's <h1> when there is exactly one.
+                       $hOnes = $this->dom->getElementsByTagName('h1');
+                       if($hOnes->length == 1)
+                       {
+                               $curTitle = $this->getInnerText($hOnes->item(0));
+                       }
+               }
+
+               $curTitle = trim($curTitle);
+
+               // A result of 4 words or fewer is treated as over-trimmed: revert to the original title.
+               if (count(explode(' ', $curTitle)) <= 4) {
+                       $curTitle = $origTitle;
+               }
+
+               $articleTitle = $this->dom->createElement('h1');
+               // Attach the title as a text node. Assigning it to innerHTML would re-parse it as markup,
+               // which can mangle titles containing '<' or '&'.
+               // NOTE(review): assumes getInnerText() returns plain (entity-decoded) text - confirm.
+               if ($curTitle != '') {
+                       $articleTitle->appendChild($this->dom->createTextNode($curTitle));
+               }
+
+               return $articleTitle;
+       }
+       
+       /**
+       * Prepare the HTML document for readability to scrape it.
+       * This includes things like stripping javascript, CSS, and handling terrible markup.
+       * 
+       * @return void
+       **/
+       protected function prepDocument()
+       {
+               // A body element may be missing (e.g. totally hosed HTML); create one and
+               // hang it off the document element so later steps always have a body.
+               if ($this->body == null) {
+                       $freshBody = $this->dom->createElement('body');
+                       $this->body = $freshBody;
+                       $this->dom->documentElement->appendChild($freshBody);
+               }
+               $this->body->setAttribute('id', 'readabilityBody');
+
+               // Drop every <style> element in the document, walking from the end of the list.
+               $styles = $this->dom->getElementsByTagName('style');
+               $pos = $styles->length;
+               while ($pos-- > 0) {
+                       $styleNode = $styles->item($pos);
+                       $styleNode->parentNode->removeChild($styleNode);
+               }
+
+               // The JS original also rewrites double <br>s and <font> tags at this point by editing
+               // body.innerHTML. In this port that rewrite is applied to the raw HTML in the
+               // constructor, before it is parsed into a DOM tree.
+       }
+
+       /**
+       * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
+       * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
+       *
+       * @return void
+       **/
+       public function addFootnotes($articleContent) {
+               // $articleContent: element whose links are turned into numbered footnotes.
+               // The wrapper below is appended to it only if at least one footnote is produced.
+               $footnotesWrapper = $this->dom->createElement('div');
+               $footnotesWrapper->setAttribute('id', 'readability-footnotes');
+               $footnotesWrapper->innerHTML = '<h3>References</h3>';
+
+               $articleFootnotes = $this->dom->createElement('ol');
+               $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
+               $footnotesWrapper->appendChild($articleFootnotes);
+
+               $articleLinks = $articleContent->getElementsByTagName('a');
+
+               $linkCount = 0;
+               // The list length is re-read on every pass, so the reference links inserted below are
+               // visited as well; they are skipped through their readability-DoNotFootnote class.
+               for ($i = 0; $i < $articleLinks->length; $i++)
+               {
+                       $articleLink = $articleLinks->item($i);
+                       $linkText    = $this->getInnerText($articleLink);
+
+                       // Decide whether to skip before building any nodes for this link.
+                       if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+                               continue;
+                       }
+
+                       $linkCount++;
+
+                       // Clone before the article link's attributes are changed below.
+                       $footnoteLink = $articleLink->cloneNode(true);
+                       $refLink      = $this->dom->createElement('a');
+                       $footnote     = $this->dom->createElement('li');
+                       // Host shown next to the footnote; falls back to the page's own host.
+                       $linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
+                       if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
+
+                       /** Add a superscript reference after the article link */
+                       $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
+                       $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
+                       $refLink->setAttribute('class', 'readability-DoNotFootnote');
+                       $refLink->setAttribute('style', 'color: inherit;');
+
+                       // insertBefore() appends when the reference node is null, so this one call covers
+                       // both "link is the last child" and "link has a following sibling" without
+                       // comparing DOM node objects with ==.
+                       $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
+
+                       $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
+                       $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
+
+                       // Footnote entry: back-reference, then the link labelled by its title (or its text).
+                       $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
+
+                       $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
+                       $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
+
+                       $footnote->appendChild($footnoteLink);
+                       if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
+
+                       $articleFootnotes->appendChild($footnote);
+               }
+
+               if ($linkCount > 0) {
+                       $articleContent->appendChild($footnotesWrapper);
+               }
+       }
+
+       /**
+       * Reverts P elements with class 'readability-styled'
+       * to text nodes - which is what they were before.
+       *
+       * @param DOMElement
+       * @return void
+       */
+       function revertReadabilityStyledElements($articleContent) {
+               $doc    = $articleContent->ownerDocument;
+               $finder = new DOMXPath($doc);
+               // Only <p> elements whose class attribute is exactly "readability-styled".
+               $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);
+
+               // Replace each match, last to first, with a text node holding its text content.
+               $pos = $styled->length;
+               while ($pos-- > 0) {
+                       $para = $styled->item($pos);
+                       $text = $doc->createTextNode($para->textContent);
+                       $para->parentNode->replaceChild($text, $para);
+               }
+       }
+       
+       /**
+       * Prepare the article node for display. Clean out any inline styles,
+       * iframes, forms, strip extraneous <p> tags, etc.
+       *
+       * @param DOMElement
+       * @return void
+       */
+       function prepArticle($articleContent) {
+               $this->cleanStyles($articleContent);
+               $this->killBreaks($articleContent);
+               if ($this->revertForcedParagraphElements) {
+                       $this->revertReadabilityStyledElements($articleContent);
+               }
+
+               // Junk removal, first pass.
+               $this->cleanConditionally($articleContent, 'form');
+               foreach (array('object', 'h1') as $junkTag) {
+                       $this->clean($articleContent, $junkTag);
+               }
+
+               // A lone h2 is probably acting as the page header rather than a subheader; since a
+               // header already exists it is removed - but never in lightClean mode.
+               if (!$this->lightClean) {
+                       if ($articleContent->getElementsByTagName('h2')->length == 1) {
+                               $this->clean($articleContent, 'h2');
+                       }
+               }
+               $this->clean($articleContent, 'iframe');
+
+               $this->cleanHeaders($articleContent);
+
+               // These run last because the removals above may change what they see.
+               foreach (array('table', 'ul', 'div') as $containerTag) {
+                       $this->cleanConditionally($articleContent, $containerTag);
+               }
+
+               // Remove paragraphs that hold neither text nor any img/embed/object/iframe.
+               $paragraphs = $articleContent->getElementsByTagName('p');
+               $pos = $paragraphs->length;
+               while ($pos-- > 0) {
+                       $para = $paragraphs->item($pos);
+
+                       $mediaCount = 0;
+                       foreach (array('img', 'embed', 'object', 'iframe') as $mediaTag) {
+                               $mediaCount += $para->getElementsByTagName($mediaTag)->length;
+                       }
+
+                       if ($mediaCount === 0 && $this->getInnerText($para, false) == '') {
+                               $para->parentNode->removeChild($para);
+                       }
+               }
+
+               // Drop <br> tags that sit directly in front of an opening <p.
+               try {
+                       $markup = $articleContent->innerHTML;
+                       $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $markup);
+               }
+               catch (Exception $e) {
+                       $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
+               }
+       }
+       
+       /**
+       * Initialize a node with the readability object. Also checks the
+       * className/id for special names to add to its score.
+       *
+       * @param Element
+       * @return void
+       **/
+       protected function initializeNode($node) {
+               // Score adjustment per tag name; tags not listed here contribute 0.
+               $tagBonus = array(
+                       'DIV'        => 5,
+                       'PRE'        => 3,
+                       'TD'         => 3,
+                       'BLOCKQUOTE' => 3,
+                       'ADDRESS'    => -3,
+                       'OL'         => -3,
+                       'UL'         => -3,
+                       'DL'         => -3,
+                       'DD'         => -3,
+                       'DT'         => -3,
+                       'LI'         => -3,
+                       'FORM'       => -3,
+                       'H1'         => -5,
+                       'H2'         => -5,
+                       'H3'         => -5,
+                       'H4'         => -5,
+                       'H5'         => -5,
+                       'H6'         => -5,
+                       'TH'         => -5
+               );
+
+               // The content score lives in a 'readability' attribute on the node itself.
+               $scoreAttr = $this->dom->createAttribute('readability');
+               $scoreAttr->value = 0;
+               $node->setAttributeNode($scoreAttr);
+
+               // Upper-case the tag name so the lookup does not depend on its case.
+               $tag = strtoupper($node->tagName);
+               $scoreAttr->value += (isset($tagBonus[$tag]) ? $tagBonus[$tag] : 0);
+
+               // Finally add the weight derived by getClassWeight().
+               $scoreAttr->value += $this->getClassWeight($node);
+       }
+       
+       /***
+       * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
+       *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+       *
+       * @return DOMElement
+       **/
+       protected function grabArticle($page=null) {
+               $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
+               if (!$page) $page = $this->dom;
+               $allElements = $page->getElementsByTagName('*');
+               /**
+               * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
+               * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
+               *
+               * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
+               * TODO: Shouldn't this be a reverse traversal?
+               **/
+               $node = null;
+               $nodesToScore = array();
+               for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
+               //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
+                       //$node = $targetList->item($nodeIndex);
+                       $tagName = strtoupper($node->tagName);
+                       /* Remove unlikely candidates */
+                       if ($stripUnlikelyCandidates) {
+                               $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
+                               if (
+                                       preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+                                       !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
+                                       $tagName != 'BODY'
+                               )
+                               {
+                                       $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
+                                       //$nodesToRemove[] = $node;
+                                       $node->parentNode->removeChild($node);
+                                       $nodeIndex--;
+                                       continue;
+                               }               
+                       }
+
+                       if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
+                               $nodesToScore[] = $node;
+                       }
+
+                       /* Turn all divs that don't have children block level elements into p's */
+                       if ($tagName == 'DIV') {
+                               if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+                                       //$this->dbg('Altering div to p');
+                                       $newNode = $this->dom->createElement('p');
+                                       try {
+                                               $newNode->innerHTML = $node->innerHTML;
+                                               //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
+                                               $node->parentNode->replaceChild($newNode, $node);
+                                               $nodeIndex--;
+                                               $nodesToScore[] = $node; // or $newNode?
+                                       }
+                                       catch(Exception $e) {
+                                               $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
+                                       }
+                               }
+                               else
+                               {
+                                       /* EXPERIMENTAL */
+                                       // TODO: change these p elements back to text nodes after processing
+                                       for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
+                                               $childNode = $node->childNodes->item($i);
+                                               if ($childNode->nodeType == 3) { // XML_TEXT_NODE
+                                                       //$this->dbg('replacing text node with a p tag with the same content.');
+                                                       $p = $this->dom->createElement('p');
+                                                       $p->innerHTML = $childNode->nodeValue;
+                                                       $p->setAttribute('style', 'display: inline;');
+                                                       $p->setAttribute('class', 'readability-styled');
+                                                       $childNode->parentNode->replaceChild($p, $childNode);
+                                               }
+                                       }
+                               }
+                       }
+               }
+               
+               /**
+               * Loop through all paragraphs, and assign a score to them based on how content-y they look.
+               * Then add their score to their parent node.
+               *
+               * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+               **/
+               $candidates = array();
+               for ($pt=0; $pt < count($nodesToScore); $pt++) {
+                       $parentNode      = $nodesToScore[$pt]->parentNode;
+                       // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
+                       $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
+                       $innerText       = $this->getInnerText($nodesToScore[$pt]);
+
+                       if (!$parentNode || !isset($parentNode->tagName)) {
+                               continue;
+                       }
+
+                       /* If this paragraph is less than 25 characters, don't even count it. */
+                       if(strlen($innerText) < 25) {
+                               continue;
+                       }
+
+                       /* Initialize readability data for the parent. */
+                       if (!$parentNode->hasAttribute('readability')) 
+                       {
+                               $this->initializeNode($parentNode);
+                               $candidates[] = $parentNode;
+                       }
+
+                       /* Initialize readability data for the grandparent. */
+                       if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
+                       {
+                               $this->initializeNode($grandParentNode);
+                               $candidates[] = $grandParentNode;
+                       }
+
+                       $contentScore = 0;
+
+                       /* Add a point for the paragraph itself as a base. */
+                       $contentScore++;
+
+                       /* Add points for any commas within this paragraph */
+                       $contentScore += count(explode(',', $innerText));
+                       
+                       /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
+                       $contentScore += min(floor(strlen($innerText) / 100), 3);
+                       
+                       /* Add the score to the parent. The grandparent gets half. */
+                       $parentNode->getAttributeNode('readability')->value += $contentScore;
+
+                       if ($grandParentNode) {
+                               $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;             
+                       }
+               }
+
+               /**
+               * After we've calculated scores, loop through all of the possible candidate nodes we found
+               * and find the one with the highest score.
+               **/
+               $topCandidate = null;
+               for ($c=0, $cl=count($candidates); $c < $cl; $c++)
+               {
+                       /**
+                       * Scale the final candidates score based on link density. Good content should have a
+                       * relatively small link density (5% or less) and be mostly unaffected by this operation.
+                       **/
+                       $readability = $candidates[$c]->getAttributeNode('readability');
+                       $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
+
+                       $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
+
+                       if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
+                               $topCandidate = $candidates[$c];
+                       }
+               }
+
+               /**
+               * If we still have no top candidate, just use the body as a last resort.
+               * We also have to copy the body node so it is something we can modify.
+               **/
+               if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
+               {
+                       $topCandidate = $this->dom->createElement('div');
+                       if ($page instanceof DOMDocument) {
+                               if (!isset($page->documentElement)) {
+                                       // we don't have a body either? what a mess! :)
+                               } else {
+                                       $topCandidate->innerHTML = $page->documentElement->innerHTML;
+                                       $page->documentElement->innerHTML = '';
+                                       $page->documentElement->appendChild($topCandidate);
+                               }
+                       } else {
+                               $topCandidate->innerHTML = $page->innerHTML;
+                               $page->innerHTML = '';
+                               $page->appendChild($topCandidate);
+                       }
+                       $this->initializeNode($topCandidate);
+               }
+
+               /**
+               * Now that we have the top candidate, look through its siblings for content that might also be related.
+               * Things like preambles, content split by ads that we removed, etc.
+               **/
+               $articleContent        = $this->dom->createElement('div');
+               $articleContent->setAttribute('id', 'readability-content');
+               $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
+               $siblingNodes          = $topCandidate->parentNode->childNodes;
+               if (!isset($siblingNodes)) {
+                       $siblingNodes = new stdClass;
+                       $siblingNodes->length = 0;
+               }
+
+               for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
+               {
+                       $siblingNode = $siblingNodes->item($s);
+                       $append      = false;
+
+                       $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+
+                       //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
+
+                       if ($siblingNode === $topCandidate)
+                       // or if ($siblingNode->isSameNode($topCandidate))
+                       {
+                               $append = true;
+                       }
+
+                       $contentBonus = 0;
+                       /* Give a bonus if sibling nodes and top candidates have the example same classname */
+                       if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
+                               $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
+                       }
+
+                       if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
+                       {
+                               $append = true;
+                       }
+                       
+                       if (strtoupper($siblingNode->nodeName) == 'P') {
+                               $linkDensity = $this->getLinkDensity($siblingNode);
+                               $nodeContent = $this->getInnerText($siblingNode);
+                               $nodeLength  = strlen($nodeContent);
+                               
+                               if ($nodeLength > 80 && $linkDensity < 0.25)
+                               {
+                                       $append = true;
+                               }
+                               else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
+                               {
+                                       $append = true;
+                               }
+                       }
+
+                       if ($append)
+                       {
+                               $this->dbg('Appending node: ' . $siblingNode->nodeName);
+
+                               $nodeToAppend = null;
+                               $sibNodeName = strtoupper($siblingNode->nodeName);
+                               if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
+                                       /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
+                                       
+                                       $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
+                                       $nodeToAppend = $this->dom->createElement('div');
+                                       try {
+                                               $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
+                                               $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+                                       }
+                                       catch(Exception $e)
+                                       {
+                                               $this->dbg('Could not alter siblingNode to div, reverting back to original.');
+                                               $nodeToAppend = $siblingNode;
+                                               $s--;
+                                               $sl--;
+                                       }
+                               } else {
+                                       $nodeToAppend = $siblingNode;
+                                       $s--;
+                                       $sl--;
+                               }
+                               
+                               /* To ensure a node does not interfere with readability styles, remove its classnames */
+                               $nodeToAppend->removeAttribute('class');
+
+                               /* Append sibling and subtract from our list because it removes the node when you append to another node */
+                               $articleContent->appendChild($nodeToAppend);
+                       }
+               }
+
+               /**
+               * So we have all of the content that we need. Now we clean it up for presentation.
+               **/
+               $this->prepArticle($articleContent);
+
+               /**
+               * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+               * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+               * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+               * finding the -right- content.
+               **/
+               if (strlen($this->getInnerText($articleContent, false)) < 250)
+               {
+                       // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+                       // in the meantime, we check and create an empty element if it's not there.
+                       if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
+                       $this->body->innerHTML = $this->bodyCache;
+                       
+                       if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+                               $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+                               return $this->grabArticle($this->body);
+                       }
+                       else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+                               $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
+                               return $this->grabArticle($this->body);              
+                       }
+                       else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+                               $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+                               return $this->grabArticle($this->body);
+                       }
+                       else {
+                               return false;
+                       }
+               }
+               return $articleContent;
+       }
+       
+       /**
+       * Strip every <script> element found under the given document or element.
+       *
+       * The node list is walked from the last entry to the first, so removing
+       * an entry never shifts the position of one that is still to be visited.
+       *
+       * @param DOMDocument|DOMElement $doc node whose <script> descendants are removed
+       * @return void
+       */
+       public function removeScripts($doc) {
+               $scriptNodes = $doc->getElementsByTagName('script');
+               $index = $scriptNodes->length;
+               while (--$index >= 0) {
+                       $script = $scriptNodes->item($index);
+                       $script->parentNode->removeChild($script);
+               }
+       }
+       
+       /**
+       * Return the trimmed text content of a node.
+       *
+       * An empty string is returned when the node exposes no textContent or the
+       * textContent is empty. When $normalizeSpaces is true, every match of the
+       * 'normalize' pattern in the trimmed text is replaced with a single space.
+       *
+       * @param DOMNode $e node to read the text from
+       * @param boolean $normalizeSpaces (default: true)
+       * @return string
+       **/
+       public function getInnerText($e, $normalizeSpaces=true) {
+               if (!isset($e->textContent) || $e->textContent == '') {
+                       return '';
+               }
+
+               $trimmed = trim($e->textContent);
+               if (!$normalizeSpaces) {
+                       return $trimmed;
+               }
+
+               return preg_replace($this->regexps['normalize'], ' ', $trimmed);
+       }
+
+       /**
+       * Count how often the string $s occurs in the inner text of node $e.
+       *
+       * The text is obtained through getInnerText() with its default
+       * whitespace normalisation.
+       *
+       * @param DOMElement $e
+       * @param string $s what to count. Default is ","
+       * @return number (integer)
+       **/
+       public function getCharCount($e, $s=',') {
+               $text = $this->getInnerText($e);
+               return substr_count($text, $s);
+       }
+
+       /**
+       * Drop the style attribute from every element below $e.
+       *
+       * Only descendants are visited (getElementsByTagName('*')); $e itself is
+       * not modified. Non-object input is silently ignored.
+       *
+       * @param DOMElement $e
+       * @return void
+       */
+       public function cleanStyles($e) {
+               if (is_object($e)) {
+                       foreach ($e->getElementsByTagName('*') as $descendant) {
+                               $descendant->removeAttribute('style');
+                       }
+               }
+       }
+       
+       /**
+       * Compute the share of a node's text that sits inside <a> elements.
+       *
+       * The result is the summed inner-text length of all <a> descendants divided
+       * by the inner-text length of the node itself. A node without any text
+       * yields 0.
+       *
+       * @param DOMElement $e
+       * @return number (float, or integer 0 when the node has no text)
+       */
+       public function getLinkDensity($e) {
+               $textLength = strlen($this->getInnerText($e));
+
+               $linkLength = 0;
+               foreach ($e->getElementsByTagName('a') as $anchor) {
+                       $linkLength += strlen($this->getInnerText($anchor));
+               }
+
+               return ($textLength > 0) ? $linkLength / $textLength : 0;
+       }
+       
+       /**
+       * Score an element by its class and id attributes.
+       *
+       * Each non-empty attribute is tested against the 'negative' and 'positive'
+       * patterns; a negative match subtracts 25 and a positive match adds 25
+       * (one attribute may trigger both). When FLAG_WEIGHT_CLASSES is not
+       * active the weight is always 0.
+       *
+       * @param DOMElement $e
+       * @return number (Integer)
+       */
+       public function getClassWeight($e) {
+               if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+                       return 0;
+               }
+
+               $weight = 0;
+
+               /* The class name is inspected first, then the ID; both follow the same rules. */
+               foreach (array('class', 'id') as $attrName) {
+                       if (!$e->hasAttribute($attrName)) {
+                               continue;
+                       }
+                       $attrValue = $e->getAttribute($attrName);
+                       if ($attrValue == '') {
+                               continue;
+                       }
+                       if (preg_match($this->regexps['negative'], $attrValue)) {
+                               $weight -= 25;
+                       }
+                       if (preg_match($this->regexps['positive'], $attrValue)) {
+                               $weight += 25;
+                       }
+               }
+
+               return $weight;
+       }
+
+       /**
+       * Collapse redundant line breaks inside a node.
+       *
+       * Every match of the 'killBreaks' pattern in the node's innerHTML is
+       * replaced with a single '<br />', and the result is written back.
+       *
+       * @param DOMElement $node
+       * @return void
+       */
+       public function killBreaks($node) {
+               $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
+       }
+
+       /**
+       * Remove every descendant of $e whose tag name is $tag.
+       *
+       * For 'iframe', 'object' and 'embed' an element is kept when either its
+       * attribute values or its inner HTML match the 'video' pattern (per the
+       * original notes: youtube/vimeo players, which people usually want to see).
+       *
+       * Updated 2012-09-18 to preserve youtube/vimeo iframes
+       *
+       * @param DOMElement $e
+       * @param string $tag
+       * @return void
+       */
+       public function clean($e, $tag) {
+               $matches = $e->getElementsByTagName($tag);
+               $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
+
+               /* Walk backwards so that removing a node does not disturb the entries still to visit. */
+               for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
+                       $candidate = $matches->item($pos);
+
+                       if ($isEmbed) {
+                               /* Join all attribute values so a single regex test covers them. */
+                               $attributeValues = '';
+                               foreach ($candidate->attributes as $attribute) {
+                                       $attributeValues .= $attribute->value . '|';
+                               }
+
+                               /* Check the attributes first, then the markup nested inside the element. */
+                               if (preg_match($this->regexps['video'], $attributeValues)
+                                       || preg_match($this->regexps['video'], $candidate->innerHTML)) {
+                                       continue;
+                               }
+                       }
+
+                       $candidate->parentNode->removeChild($candidate);
+               }
+       }
+       
+       /**
+       * Remove descendants of $e with tag name $tag when they look like non-content.
+       *
+       * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. An element is removed
+       * outright when its class weight plus its stored 'readability' score is negative.
+       * Otherwise, if it holds fewer than 10 commas, it is checked against a list of
+       * heuristics (image/paragraph ratio, list items, inputs, text length, link
+       * density, video embeds); the first heuristic that applies is logged through
+       * dbg() and the element is removed. $this->lightClean selects the more lenient
+       * rule set.
+       *
+       * @param DOMElement $e
+       * @param string $tag
+       * @return void
+       */
+       public function cleanConditionally($e, $tag) {
+               if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+                       return;
+               }
+
+               $tagsList = $e->getElementsByTagName($tag);
+
+               /**
+               * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+               *
+               * TODO: Consider taking into account original contentScore here.
+               */
+               for ($i = $tagsList->length - 1; $i >= 0; $i--) {
+                       $node         = $tagsList->item($i);
+                       $weight       = $this->getClassWeight($node);
+                       $hasScore     = $node->hasAttribute('readability');
+                       $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
+                       $scoreNote    = $hasScore ? (' with score ' . $node->getAttribute('readability')) : '';
+
+                       $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . $scoreNote);
+
+                       if ($weight + $contentScore < 0) {
+                               $node->parentNode->removeChild($node);
+                               continue;
+                       }
+
+                       /* Elements with 10 or more commas are left alone. */
+                       if ($this->getCharCount($node, ',') >= 10) {
+                               continue;
+                       }
+
+                       /**
+                       * Not very many commas: gather counts for typical embedded elements.
+                       * The <li> count is offset by -100, so the list-item rule below only
+                       * fires when there are over 100 more <li> than <p> elements.
+                       **/
+                       $p     = $node->getElementsByTagName('p')->length;
+                       $img   = $node->getElementsByTagName('img')->length;
+                       $li    = $node->getElementsByTagName('li')->length - 100;
+                       $input = $node->getElementsByTagName('input')->length;
+                       $a     = $node->getElementsByTagName('a')->length;
+
+                       /* Count <embed> and <iframe> elements whose src matches the video pattern. */
+                       $embedCount = 0;
+                       foreach (array('embed', 'iframe') as $embedTag) {
+                               foreach ($node->getElementsByTagName($embedTag) as $embed) {
+                                       if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
+                                               $embedCount++;
+                                       }
+                               }
+                       }
+
+                       $linkDensity   = $this->getLinkDensity($node);
+                       $contentLength = strlen($this->getInnerText($node));
+
+                       /* $reason stays null when no heuristic applies; otherwise it is the debug message. */
+                       $reason = null;
+
+                       if ($this->lightClean) {
+                               $this->dbg('Light clean...');
+                               if ( ($img > $p) && ($img > 4) ) {
+                                       $reason = ' more than 4 images and more image elements than paragraph elements';
+                               } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                                       $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
+                               } else if ( $input > floor($p/3) ) {
+                                       $reason = ' too many <input> elements';
+                               } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+                                       $reason = ' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images';
+                               } else if ($weight < 25 && $linkDensity > 0.2) {
+                                       $reason = ' weight smaller than 25 and link density above 0.2';
+                               } else if ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+                                       $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
+                               } else if ($embedCount > 3) {
+                                       $reason = ' more than 3 embeds';
+                               }
+                       } else {
+                               $this->dbg('Standard clean...');
+                               if ( $img > $p ) {
+                                       $reason = ' more image elements than paragraph elements';
+                               } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                                       $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
+                               } else if ( $input > floor($p/3) ) {
+                                       $reason = ' too many <input> elements';
+                               } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
+                                       $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
+                               } else if ($weight < 25 && $linkDensity > 0.2) {
+                                       $reason = ' weight smaller than 25 and link density above 0.2';
+                               } else if ($weight >= 25 && $linkDensity > 0.5) {
+                                       $reason = ' weight above 25 but link density greater than 0.5';
+                               } else if (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+                                       $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
+                               }
+                       }
+
+                       if ($reason !== null) {
+                               $this->dbg($reason);
+                               $node->parentNode->removeChild($node);
+                       }
+               }
+       }
+
+       /**
+       * Remove spurious <h1> and <h2> headers from an element.
+       *
+       * A header is dropped when its class weight is negative or its link
+       * density exceeds 0.33. Other heading levels are not examined.
+       *
+       * @param DOMElement $e
+       * @return void
+       */
+       public function cleanHeaders($e) {
+               foreach (array('h1', 'h2') as $headerTag) {
+                       $headers = $e->getElementsByTagName($headerTag);
+                       /* Backwards, so removals do not shift the entries still to be checked. */
+                       for ($i = $headers->length - 1; $i >= 0; $i--) {
+                               $header     = $headers->item($i);
+                               $isSpurious = $this->getClassWeight($header) < 0 || $this->getLinkDensity($header) > 0.33;
+                               if ($isSpurious) {
+                                       $header->parentNode->removeChild($header);
+                               }
+                       }
+               }
+       }
+
+       /**
+       * Tell whether any bit of $flag is currently set in $this->flags.
+       *
+       * @param integer $flag bit mask to test
+       * @return boolean
+       */
+       public function flagIsActive($flag) {
+               $matchingBits = $this->flags & $flag;
+               return $matchingBits > 0;
+       }
+       
+       /**
+       * Switch on the bits of $flag in $this->flags.
+       *
+       * @param integer $flag bit mask to set
+       * @return void
+       */
+       public function addFlag($flag) {
+               $this->flags |= $flag;
+       }
+       
+       /**
+       * Switch off the bits of $flag in $this->flags.
+       *
+       * @param integer $flag bit mask to clear
+       * @return void
+       */
+       public function removeFlag($flag) {
+               $this->flags &= ~$flag;
+       }
+}
 ?>
\ No newline at end of file
index 4faad6d941904c5526bd4436d1793c4a52bb7b53..7a56be8c7c967cec629d0129feb3bb72d4ee8249 100755 (executable)
@@ -3,8 +3,8 @@
 // Author: Keyvan Minoukadeh\r
 // Copyright (c) 2013 Keyvan Minoukadeh\r
 // License: AGPLv3\r
-// Version: 3.1\r
-// Date: 2013-03-05\r
+// Version: 3.2\r
+// Date: 2013-05-13\r
 // More info: http://fivefilters.org/content-only/\r
 // Help: http://help.fivefilters.org\r
 \r
@@ -25,12 +25,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 \r
 // Usage\r
 // -----\r
-// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org\r
-// The following options can be passed in the querystring:\r
-// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))\r
-// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)\r
-// * API key: key=[api key] (optional, refer to config.php)\r
-// * Max entries to process: max=[max number of items] (optional)\r
+// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article\r
+// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage\r
 \r
 error_reporting(E_ALL ^ E_NOTICE);\r
 ini_set("display_errors", 1);\r
@@ -76,8 +72,8 @@ header('X-Robots-Tag: noindex, nofollow');
 ////////////////////////////////\r
 // Check if service is enabled\r
 ////////////////////////////////\r
-if (!$options->enabled) { \r
-       die('The full-text RSS service is currently disabled'); \r
+if (!$options->enabled) {\r
+       die('The full-text RSS service is currently disabled');\r
 }\r
 \r
 ////////////////////////////////\r
@@ -121,8 +117,8 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc');
 ////////////////////////////////\r
 // Check for feed URL\r
 ////////////////////////////////\r
-if (!isset($_GET['url'])) { \r
-       die('No URL supplied'); \r
+if (!isset($_GET['url'])) {\r
+       die('No URL supplied');\r
 }\r
 $url = trim($_GET['url']);\r
 if (strtolower(substr($url, 0, 7)) == 'feed://') {\r
@@ -161,10 +157,12 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
        if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);\r
        if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);\r
        if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);\r
-       if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);   \r
+       if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);\r
        if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);\r
        if (isset($_GET['xss'])) $redirect .= '&xss';\r
        if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';\r
+       if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);\r
+       if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);\r
        if (isset($_GET['debug'])) $redirect .= '&debug';\r
        if ($debug_mode) {\r
                debug('Redirecting to hide access key, follow URL below to continue');\r
@@ -177,7 +175,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
 \r
 ///////////////////////////////////////////////\r
 // Set timezone.\r
-// Prevents warnings, but needs more testing - \r
+// Prevents warnings, but needs more testing -\r
 // perhaps if timezone is set in php.ini we\r
 // don't need to set it at all...\r
 ///////////////////////////////////////////////\r
@@ -199,7 +197,7 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int
 }\r
 $key_index = ($valid_key) ? (int)$_GET['key'] : 0;\r
 if (!$valid_key && $options->key_required) {\r
-       die('A valid key must be supplied'); \r
+       die('A valid key must be supplied');\r
 }\r
 if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {\r
        die('The entered key is invalid');\r
@@ -250,6 +248,28 @@ if ($options->favour_feed_titles == 'user') {
        $favour_feed_titles = $options->favour_feed_titles;\r
 }\r
 \r
+///////////////////////////////////////////////\r
+// Include full content in output?\r
+///////////////////////////////////////////////\r
+if ($options->content === 'user') {\r
+       if (isset($_GET['content']) && $_GET['content'] === '0') {\r
+               $options->content = false;\r
+       } else {\r
+               $options->content = true;\r
+       }\r
+}\r
+\r
+///////////////////////////////////////////////\r
+// Include summaries in output?\r
+///////////////////////////////////////////////\r
+if ($options->summary === 'user') {\r
+       if (isset($_GET['summary']) && $_GET['summary'] === '1') {\r
+               $options->summary = true;\r
+       } else {\r
+               $options->summary = false;\r
+       }\r
+}\r
+\r
 ///////////////////////////////////////////////\r
 // Exclude items if extraction fails\r
 ///////////////////////////////////////////////\r
@@ -272,15 +292,6 @@ if ($options->detect_language === 'user') {
        $detect_language = $options->detect_language;\r
 }\r
 \r
-if ($detect_language >= 2) {\r
-       $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',\r
-       'cebuano' => 'ceb', // ISO 639-2\r
-       'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',\r
-       'hawaiian' => 'haw', // ISO 639-2 \r
-       'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',\r
-       'pidgin' => 'cpe', // ISO 639-2  \r
-       'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');\r
-}\r
 $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);\r
 \r
 /////////////////////////////////////\r
@@ -330,7 +341,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
 //////////////////////////////////\r
 if ($options->caching) {\r
        debug('Caching is enabled...');\r
-       $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));\r
+       $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));\r
        $check_cache = true;\r
        if ($options->apc && $options->smart_cache) {\r
                apc_add("cache.$cache_id", 0, 10*60);\r
@@ -468,7 +479,7 @@ if ($img_url = $feed->get_image_url()) {
 ////////////////////////////////////////////\r
 // Loop through feed items\r
 ////////////////////////////////////////////\r
-$items = $feed->get_items(0, $max);    \r
+$items = $feed->get_items(0, $max);\r
 // Request all feed items in parallel (if supported)\r
 $urls_sanitized = array();\r
 $urls = array();\r
@@ -550,24 +561,43 @@ foreach ($items as $key => $item) {
                        $is_single_page = false;\r
                        if ($single_page_response = getSinglePage($item, $html, $effective_url)) {\r
                                $is_single_page = true;\r
-                               $html = $single_page_response['body'];\r
-                               // remove strange things\r
-                               $html = str_replace('</[>', '', $html); \r
-                               $html = convert_to_utf8($html, $single_page_response['headers']);\r
                                $effective_url = $single_page_response['effective_url'];\r
-                               debug("Retrieved single-page view from $effective_url");\r
+                               // check if action defined for returned Content-Type\r
+                               $mime_info = get_mime_action_info($single_page_response['headers']);\r
+                               if (isset($mime_info['action'])) {\r
+                                       if ($mime_info['action'] == 'exclude') {\r
+                                               continue; // skip this feed item entry\r
+                                       } elseif ($mime_info['action'] == 'link') {\r
+                                               if ($mime_info['type'] == 'image') {\r
+                                                       $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";\r
+                                               } else {\r
+                                                       $html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";\r
+                                               }\r
+                                               $extracted_title = $mime_info['name'];\r
+                                               $do_content_extraction = false;\r
+                                       }\r
+                               }\r
+                               if ($do_content_extraction) {\r
+                                       $html = $single_page_response['body'];\r
+                                       // remove strange things\r
+                                       $html = str_replace('</[>', '', $html);\r
+                                       $html = convert_to_utf8($html, $single_page_response['headers']);\r
+                                       debug("Retrieved single-page view from $effective_url");\r
+                               }\r
                                unset($single_page_response);\r
                        }\r
+               }\r
+               if ($do_content_extraction) {\r
                        debug('--------');\r
                        debug('Attempting to extract content');\r
                        $extract_result = $extractor->process($html, $effective_url);\r
                        $readability = $extractor->readability;\r
-                       $content_block = ($extract_result) ? $extractor->getContent() : null;                   \r
+                       $content_block = ($extract_result) ? $extractor->getContent() : null;\r
                        $extracted_title = ($extract_result) ? $extractor->getTitle() : '';\r
                        // Deal with multi-page articles\r
                        //die('Next: '.$extractor->getNextPageUrl());\r
                        $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());\r
-                       if ($options->multipage && $is_multi_page) {\r
+                       if ($options->multipage && $is_multi_page && $options->content) {\r
                                debug('--------');\r
                                debug('Attempting to process multi-page article');\r
                                $multi_page_urls = array();\r
@@ -580,7 +610,7 @@ foreach ($items as $key => $item) {
                                                // check it's not what we have already!\r
                                                if (!in_array($next_page_url, $multi_page_urls)) {\r
                                                        // it's not, so let's attempt to fetch it\r
-                                                       $multi_page_urls[] = $next_page_url;                                            \r
+                                                       $multi_page_urls[] = $next_page_url;\r
                                                        $_prev_ref = $http->referer;\r
                                                        if (($response = $http->get($next_page_url, true)) && $response['status_code'] < 300) {\r
                                                                // make sure mime type is not something with a different action associated\r
@@ -605,13 +635,15 @@ foreach ($items as $key => $item) {
                                // did we successfully deal with this multi-page article?\r
                                if (empty($multi_page_content)) {\r
                                        debug('Failed to extract all parts of multi-page article, so not going to include them');\r
-                                       $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';\r
+                                       $_page = $readability->dom->createElement('p');\r
+                                       $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';\r
+                                       $multi_page_content[] = $_page;\r
                                }\r
                                foreach ($multi_page_content as $_page) {\r
                                        $_page = $content_block->ownerDocument->importNode($_page, true);\r
                                        $content_block->appendChild($_page);\r
                                }\r
-                               unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url);\r
+                               unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);\r
                        }\r
                }\r
                // use extracted title for both feed and item title if we're using single-item dummy feed\r
@@ -658,7 +690,7 @@ foreach ($items as $key => $item) {
                        } else {\r
                                $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML\r
                        }\r
-                       unset($content_block);\r
+                       //unset($content_block);\r
                        // post-processing cleanup\r
                        $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);\r
                        if ($links == 'remove') {\r
@@ -671,130 +703,155 @@ foreach ($items as $key => $item) {
                }\r
        }\r
 \r
-               if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment\r
-                       $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));\r
+       if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment\r
+               $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));\r
+       } else {\r
+               $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));\r
+       }\r
+       // filter xss?\r
+       if ($xss_filter) {\r
+               debug('Filtering HTML to remove XSS');\r
+               $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));\r
+       }\r
+\r
+       // add content\r
+       if ($options->summary === true) {\r
+               // get summary\r
+               $summary = '';\r
+               if (!$do_content_extraction) {\r
+                       $summary = $html;\r
                } else {\r
-                       $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));\r
-               }\r
-               // filter xss?\r
-               if ($xss_filter) {\r
-                       debug('Filtering HTML to remove XSS');\r
-                       $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));\r
-               }\r
-               $newitem->setDescription($html);\r
-               \r
-               // set date\r
-               if ((int)$item->get_date('U') > 0) {\r
-                       $newitem->setDate((int)$item->get_date('U'));\r
-               } elseif ($extractor->getDate()) {\r
-                       $newitem->setDate($extractor->getDate());\r
-               }\r
-               \r
-               // add authors\r
-               if ($authors = $item->get_authors()) {\r
-                       foreach ($authors as $author) {\r
-                               // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel\r
-                               if ($author->get_name() !== null) {\r
-                                       $newitem->addElement('dc:creator', $author->get_name());\r
-                               } elseif ($author->get_email() !== null) {\r
-                                       $newitem->addElement('dc:creator', $author->get_email());\r
+                       // Try to get first few paragraphs\r
+                       if (isset($content_block) && ($content_block instanceof DOMElement)) {\r
+                               $_paras = $content_block->getElementsByTagName('p');\r
+                               foreach ($_paras as $_para) {\r
+                                       $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';\r
+                                       if (strlen($summary) > 200) break;\r
                                }\r
+                       } else {\r
+                               $summary = $html;\r
                        }\r
-               } elseif ($authors = $extractor->getAuthors()) {\r
-                       //TODO: make sure the list size is reasonable\r
-                       foreach ($authors as $author) {\r
-                               // TODO: xpath often selects authors from other articles linked from the page.\r
-                               // for now choose first item\r
-                               $newitem->addElement('dc:creator', $author);\r
-                               break;\r
+               }\r
+               unset($_paras, $_para);\r
+               $summary = get_excerpt($summary);\r
+               $newitem->setDescription($summary);\r
+               if ($options->content) $newitem->setElement('content:encoded', $html);\r
+       } else {\r
+               if ($options->content) $newitem->setDescription($html);\r
+       }\r
+\r
+       // set date\r
+       if ((int)$item->get_date('U') > 0) {\r
+               $newitem->setDate((int)$item->get_date('U'));\r
+       } elseif ($extractor->getDate()) {\r
+               $newitem->setDate($extractor->getDate());\r
+       }\r
+\r
+       // add authors\r
+       if ($authors = $item->get_authors()) {\r
+               foreach ($authors as $author) {\r
+                       // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel\r
+                       if ($author->get_name() !== null) {\r
+                               $newitem->addElement('dc:creator', $author->get_name());\r
+                       } elseif ($author->get_email() !== null) {\r
+                               $newitem->addElement('dc:creator', $author->get_email());\r
                        }\r
                }\r
-               \r
-               // add language\r
-               if ($detect_language) {\r
-                       $language = $extractor->getLanguage();\r
-                       if (!$language) $language = $feed->get_language();\r
-                       if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {\r
-                               try {\r
-                                       if ($use_cld) {\r
-                                               // Use PHP-CLD extension\r
-                                               $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error\r
-                                               $res = $php_cld($text_sample);\r
-                                               if (is_array($res) && count($res) > 0) {\r
-                                                       $language = $res[0]['code'];\r
-                                               }       \r
-                                       } else {\r
-                                               //die('what');\r
-                                               // Use PEAR's Text_LanguageDetect\r
-                                               if (!isset($l)) {\r
-                                                       $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');\r
-                                               }\r
-                                               $l_result = $l->detect($text_sample, 1);\r
-                                               if (count($l_result) > 0) {\r
-                                                       $language = $language_codes[key($l_result)];\r
-                                               }\r
+       } elseif ($authors = $extractor->getAuthors()) {\r
+               //TODO: make sure the list size is reasonable\r
+               foreach ($authors as $author) {\r
+                       // TODO: xpath often selects authors from other articles linked from the page.\r
+                       // for now choose first item\r
+                       $newitem->addElement('dc:creator', $author);\r
+                       break;\r
+               }\r
+       }\r
+\r
+       // add language\r
+       if ($detect_language) {\r
+               $language = $extractor->getLanguage();\r
+               if (!$language) $language = $feed->get_language();\r
+               if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {\r
+                       try {\r
+                               if ($use_cld) {\r
+                                       // Use PHP-CLD extension\r
+                                       $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error\r
+                                       $res = $php_cld($text_sample);\r
+                                       if (is_array($res) && count($res) > 0) {\r
+                                               $language = $res[0]['code'];\r
+                                       }\r
+                               } else {\r
+                                       //die('what');\r
+                                       // Use PEAR's Text_LanguageDetect\r
+                                       if (!isset($l)) {\r
+                                         $l = new Text_LanguageDetect();\r
+                                         $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")\r
+                                       }\r
+                                       $l_result = $l->detect($text_sample, 1);\r
+                                       if (count($l_result) > 0) {\r
+                                               $language = key($l_result);\r
                                        }\r
-                               } catch (Exception $e) {\r
-                                       //die('error: '.$e);    \r
-                                       // do nothing\r
                                }\r
-                       }\r
-                       if ($language && (strlen($language) < 7)) {     \r
-                               $newitem->addElement('dc:language', $language);\r
+                       } catch (Exception $e) {\r
+                               //die('error: '.$e);\r
+                               // do nothing\r
                        }\r
                }\r
-               \r
-               // add MIME type (if it appeared in our exclusions lists)\r
-               if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);\r
-               // add effective URL (URL after redirects)\r
-               if (isset($effective_url)) {\r
-                       //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.\r
-                       //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)\r
-                       //temporary measure: use utf8_encode()\r
-                       $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));\r
-               } else {\r
-                       $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));\r
+               if ($language && (strlen($language) < 7)) {\r
+                       $newitem->addElement('dc:language', $language);\r
                }\r
-               \r
-               // add categories\r
-               if ($categories = $item->get_categories()) {\r
-                       foreach ($categories as $category) {\r
-                               if ($category->get_label() !== null) {\r
-                                       $newitem->addElement('category', $category->get_label());\r
-                               }\r
+       }\r
+\r
+       // add MIME type (if it appeared in our exclusions lists)\r
+       if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);\r
+       // add effective URL (URL after redirects)\r
+       if (isset($effective_url)) {\r
+               //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.\r
+               //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-�-25th-March-2012-Special-Program-from-Liari-(Karachi)\r
+               //temporary measure: use utf8_encode()\r
+               $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));\r
+       } else {\r
+               $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));\r
+       }\r
+\r
+       // add categories\r
+       if ($categories = $item->get_categories()) {\r
+               foreach ($categories as $category) {\r
+                       if ($category->get_label() !== null) {\r
+                               $newitem->addElement('category', $category->get_label());\r
                        }\r
                }\r
-               \r
-               // check for enclosures\r
-               if ($options->keep_enclosures) {\r
-                       if ($enclosures = $item->get_enclosures()) {\r
-                               foreach ($enclosures as $enclosure) {\r
-                                       // thumbnails\r
-                                       foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {\r
-                                               $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));\r
-                                       }\r
-                                       if (!$enclosure->get_link()) continue;\r
-                                       $enc = array();\r
-                                       // Media RSS spec ($enc): http://search.yahoo.com/mrss\r
-                                       // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4\r
-                                       $enc['url'] = $enclosure->get_link();\r
-                                       if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();\r
-                                       if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();\r
-                                       if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();\r
-                                       if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();\r
-                                       if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();\r
-                                       if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();\r
-                                       if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();\r
-                                       if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();\r
-                                       if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();\r
-                                       if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();\r
-                                       if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();\r
-                                       if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();\r
-                                       $newitem->addElement('media:content', '', $enc);\r
+       }\r
+\r
+       // check for enclosures\r
+       if ($options->keep_enclosures) {\r
+               if ($enclosures = $item->get_enclosures()) {\r
+                       foreach ($enclosures as $enclosure) {\r
+                               // thumbnails\r
+                               foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {\r
+                                       $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));\r
                                }\r
+                               if (!$enclosure->get_link()) continue;\r
+                               $enc = array();\r
+                               // Media RSS spec ($enc): http://search.yahoo.com/mrss\r
+                               // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4\r
+                               $enc['url'] = $enclosure->get_link();\r
+                               if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();\r
+                               if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();\r
+                               if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();\r
+                               if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();\r
+                               if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();\r
+                               if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();\r
+                               if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();\r
+                               if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();\r
+                               if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();\r
+                               if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();\r
+                               if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();\r
+                               if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();\r
+                               $newitem->addElement('media:content', '', $enc);\r
                        }\r
                }\r
-       /* } */\r
+       }\r
        $output->addItem($newitem);\r
        unset($html);\r
        $item_count++;\r
index 1c11b8f6b2345ea15adb93496ed97b2686a9cda6..4e985372c27b7f0780d54535acf3401b6c24087b 100755 (executable)
@@ -66,6 +66,38 @@ class DummySingleItem {
 // HELPER FUNCTIONS
 ///////////////////////////////
 
+/**\r
+ * Build a plain-text excerpt of at most $num_words words.\r
+ *\r
+ * Adapted from WordPress\r
+ * http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173\r
+ *\r
+ * Tags are stripped first. Words are runs of characters separated by spaces,\r
+ * tabs, CRs or LFs. If the text holds more than $num_words words it is cut\r
+ * after the last allowed word and $more is appended directly to it.\r
+ *\r
+ * @param string      $text      Text or HTML to summarise.\r
+ * @param int         $num_words Maximum number of words to keep.\r
+ * @param string|null $more      Suffix for truncated text; null means '&hellip;'.\r
+ * @return string Excerpt with leading/trailing Unicode separator (\pZ) and\r
+ *                "other" (\pC) characters removed; if the text is not valid\r
+ *                UTF-8 only ASCII whitespace is trimmed.\r
+ */\r
+function get_excerpt($text, $num_words=55, $more=null) {\r
+       if (null === $more) $more = '&hellip;';\r
+       $text = strip_tags($text);\r
+       //TODO: Check if word count is based on single characters (East Asian characters)\r
+       // Ask for one element more than allowed: the surplus element holds the\r
+       // whole remainder of the text and tells us that truncation is needed.\r
+       $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);\r
+       $truncated = count($words_array) > $num_words;\r
+       if ($truncated) {\r
+               array_pop($words_array);\r
+       }\r
+       $text = implode(' ', $words_array);\r
+       if ($truncated) {\r
+               $text = $text.$more;\r
+       }\r
+       // trim whitespace at beginning or end of string\r
+       // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2\r
+       $trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);\r
+       // With the /u modifier preg_replace() returns null when $text is not\r
+       // valid UTF-8; fall back to a byte-safe trim instead of returning null.\r
+       return (null === $trimmed) ? trim($text) : $trimmed;\r
+}\r
+\r
 function url_allowed($url) {
        global $options;
        if (!empty($options->allowed_urls)) {
@@ -165,14 +197,6 @@ function convert_to_utf8($html, $header=null)
                        if (strtolower($encoding) != 'utf-8') {
                                debug('Converting to UTF-8');
                                $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
-                               /*
-                               if (function_exists('iconv')) {
-                                       // iconv appears to handle certain character encodings better than mb_convert_encoding
-                                       $html = iconv($encoding, 'utf-8', $html);
-                               } else {
-                                       $html = mb_convert_encoding($html, 'utf-8', $encoding);
-                               }
-                               */
                        }
                }
        }
@@ -196,7 +220,7 @@ function makeAbsolute($base, $elem) {
 }
 function makeAbsoluteAttr($base, $e, $attr) {
        if ($e->hasAttribute($attr)) {
-               // Trim leading and trailing white space. I don't really like this but 
+               // Trim leading and trailing white space. I don't really like this but
                // unfortunately it does appear on some sites. e.g.  <img src=" /path/to/image.jpg" />
                $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
                $url = str_replace(' ', '%20', $url);
index a1b767fd5af66ae62087ced67fdfe7218c221f28..76ca8b3cba6e57b99a95e0dff108bd5cebb97ab0 100644 (file)
@@ -1,3 +1,2 @@
-<?php\r
-// this is here to prevent directory listing over the web\r
-?>
\ No newline at end of file
+<?php
+// this is here to prevent directory listing over the web
\ No newline at end of file
index bf0d87ab1b2b0ec1a11a3973d2845b42413d9767..eaf01ebdb46bd0ae40dc572aa883350d1360131d 100644 (file)
@@ -1 +1 @@
-4
\ No newline at end of file
+2013-05-12T22:53:07Z
\ No newline at end of file