From 3ec62cf95ab4436923d4c665fad7aef226cbb822 Mon Sep 17 00:00:00 2001
From: Maryana Rozhankivska <mariroz@mr.lviv.ua>
Date: Thu, 22 May 2014 17:16:38 +0300
Subject: update to 3.2 version of full-text-rss, issue #694

---
 .../content-extractor/ContentExtractor.php         | 1455 +++++++------
 .../libraries/content-extractor/SiteConfig.php     |  681 +++---
 inc/3rdparty/libraries/feedwriter/FeedItem.php     |  100 +-
 inc/3rdparty/libraries/feedwriter/FeedWriter.php   |   17 +-
 inc/3rdparty/libraries/html5/TreeBuilder.php       |   13 +-
 .../libraries/humble-http-agent/CookieJar.php      |  807 ++++---
 .../humble-http-agent/HumbleHttpAgent.php          | 1589 +++++++-------
 .../SimplePie_HumbleHttpAgent.php                  |  157 +-
 .../libraries/language-detect/LanguageDetect.php   |  992 +++++----
 inc/3rdparty/libraries/readability/Readability.php | 2274 ++++++++++----------
 10 files changed, 4097 insertions(+), 3988 deletions(-)
 mode change 100644 => 100755 inc/3rdparty/libraries/feedwriter/FeedItem.php

(limited to 'inc/3rdparty/libraries')

diff --git a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
index ddd33bb5..21e693e7 100644
--- a/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
+++ b/inc/3rdparty/libraries/content-extractor/ContentExtractor.php
@@ -1,728 +1,727 @@
-<?php
-/**
- * Content Extractor
- * 
- * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 
- * to extract content from HTML files.
- * 
- * @version 1.0
- * @date 2013-02-05
- * @author Keyvan Minoukadeh
- * @copyright 2013 Keyvan Minoukadeh
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
- */
-
-class ContentExtractor
-{
-	protected static $tidy_config = array(
-				 'clean' => true,
-				 'output-xhtml' => true,
-				 'logical-emphasis' => true,
-				 'show-body-only' => false,
-				 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
-				 'new-inline-tags' => 'mark, time, meter, progress, data',
-				 'wrap' => 0,
-				 'drop-empty-paras' => true,
-				 'drop-proprietary-attributes' => false,
-				 'enclose-text' => true,
-				 'enclose-block-text' => true,
-				 'merge-divs' => true,
-				 'merge-spans' => true,
-				 'char-encoding' => 'utf8',
-				 'hide-comments' => true
-				 );
-	protected $html;
-	protected $config;
-	protected $title;
-	protected $author = array();
-	protected $language;
-	protected $date;
-	protected $body;
-	protected $success = false;
-	protected $nextPageUrl;
-	public $allowedParsers = array('libxml', 'html5lib');
-	public $fingerprints = array();
-	public $readability;
-	public $debug = false;
-	public $debugVerbose = false;
-
-	function __construct($path, $fallback=null) {
-		SiteConfig::set_config_path($path, $fallback);	
-	}
-	
-	protected function debug($msg) {
-		if ($this->debug) {
-			$mem = round(memory_get_usage()/1024, 2);
-			$memPeak = round(memory_get_peak_usage()/1024, 2);
-			echo '* ',$msg;
-			if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
-			echo "\n";
-			ob_flush();
-			flush();
-		}
-	}
-	
-	public function reset() {
-		$this->html = null;
-		$this->readability = null;
-		$this->config = null;
-		$this->title = null;
-		$this->body = null;
-		$this->author = array();
-		$this->language = null;
-		$this->date = null;
-		$this->nextPageUrl = null;
-		$this->success = false;
-	}
-
-	public function findHostUsingFingerprints($html) {
-		$this->debug('Checking fingerprints...');
-		$head = substr($html, 0, 8000);
-		foreach ($this->fingerprints as $_fp => $_fphost) {
-			$lookin = 'html';
-			if (is_array($_fphost)) {
-				if (isset($_fphost['head']) && $_fphost['head']) {
-					$lookin = 'head';
-				}
-				$_fphost = $_fphost['hostname'];
-			}
-			if (strpos($$lookin, $_fp) !== false) {
-				$this->debug("Found match: $_fphost");
-				return $_fphost;
-			}
-		}
-		$this->debug('No fingerprint matches');
-		return false;
-	}
-	
-	// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
-	public function buildSiteConfig($url, $html='', $add_to_cache=true) {
-		// extract host name
-		$host = @parse_url($url, PHP_URL_HOST);
-		$host = strtolower($host);
-		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
-		// is merged version already cached?
-		if (SiteConfig::is_cached("$host.merged")) {
-			$this->debug("Returning cached and merged site config for $host");
-			return SiteConfig::build("$host.merged");
-		}
-		// let's build from site_config/custom/ and standard/
-		$config = SiteConfig::build($host);
-		if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
-			SiteConfig::add_to_cache($host, $config);
-		}
-		// if no match, use defaults
-		if (!$config) $config = new SiteConfig();
-		// load fingerprint config?
-		if ($config->autodetect_on_failure()) {
-			// check HTML for fingerprints
-			if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
-				if ($config_fingerprint = SiteConfig::build($_fphost)) {
-					$this->debug("Appending site config settings from $_fphost (fingerprint match)");
-					$config->append($config_fingerprint);
-					if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
-						//$config_fingerprint->cache_in_apc = true;
-						SiteConfig::add_to_cache($_fphost, $config_fingerprint);
-					}
-				}
-			}
-		}
-		// load global config?
-		if ($config->autodetect_on_failure()) {
-			if ($config_global = SiteConfig::build('global', true)) {
-				$this->debug('Appending site config settings from global.txt');
-				$config->append($config_global);
-				if ($add_to_cache && !SiteConfig::is_cached('global')) {
-					//$config_global->cache_in_apc = true;
-					SiteConfig::add_to_cache('global', $config_global);
-				}
-			}
-		}
-		// store copy of merged config
-		if ($add_to_cache) {
-			// do not store in APC if wildcard match
-			$use_apc = ($host == $config->cache_key);
-			$config->cache_key = null;
-			SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
-		}
-		return $config;
-	}
-	
-	// returns true on success, false on failure
-	// $smart_tidy indicates that if tidy is used and no results are produced, we will
-	// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
-	// but it has problems of its own which we try to avoid with this option.
-	public function process($html, $url, $smart_tidy=true) {
-		$this->reset();
-		$this->config = $this->buildSiteConfig($url, $html);
-		
-		// do string replacements
-		if (!empty($this->config->find_string)) {
-			if (count($this->config->find_string) == count($this->config->replace_string)) {
-				$html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
-				$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
-			} else {
-				$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
-			}
-			unset($_count);
-		}
-		
-		// use tidy (if it exists)?
-		// This fixes problems with some sites which would otherwise
-		// trouble DOMDocument's HTML parsing. (Although sometimes it
-		// makes matters worse, which is why you can override it in site config files.)
-		$tidied = false;
-		if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
-			$this->debug('Using Tidy');
-			$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
-			if (tidy_clean_repair($tidy)) {
-				$original_html = $html;
-				$tidied = true;
-				$html = $tidy->value;
-			}
-			unset($tidy);
-		}
-		
-		// load and parse html
-		$_parser = $this->config->parser();
-		if (!in_array($_parser, $this->allowedParsers)) {
-			$this->debug("HTML parser $_parser not listed, using libxml instead");
-			$_parser = 'libxml';
-		}
-		$this->debug("Attempting to parse HTML with $_parser");
-		$this->readability = new Readability($html, $url, $_parser);
-		
-		// we use xpath to find elements in the given HTML document
-		// see http://en.wikipedia.org/wiki/XPath_1.0
-		$xpath = new DOMXPath($this->readability->dom);
-
-		// try to get next page link
-		foreach ($this->config->next_page_link as $pattern) {
-			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
-			if (is_string($elems)) {
-				$this->nextPageUrl = trim($elems);
-				break;
-			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
-				foreach ($elems as $item) {
-					if ($item instanceof DOMElement && $item->hasAttribute('href')) {
-						$this->nextPageUrl = $item->getAttribute('href');
-						break 2;
-					} elseif ($item instanceof DOMAttr && $item->value) {
-						$this->nextPageUrl = $item->value;
-						break 2;
-					}
-				}
-			}
-		}
-		
-		// try to get title
-		foreach ($this->config->title as $pattern) {
-			// $this->debug("Trying $pattern");
-			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
-			if (is_string($elems)) {
-				$this->title = trim($elems);
-				$this->debug('Title expression evaluated as string: '.$this->title);
-				$this->debug("...XPath match: $pattern");
-				break;
-			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
-				$this->title = $elems->item(0)->textContent;
-				$this->debug('Title matched: '.$this->title);
-				$this->debug("...XPath match: $pattern");
-				// remove title from document
-				try {
-					$elems->item(0)->parentNode->removeChild($elems->item(0));
-				} catch (DOMException $e) {
-					// do nothing
-				}
-				break;
-			}
-		}
-		
-		// try to get author (if it hasn't already been set)
-		if (empty($this->author)) {
-			foreach ($this->config->author as $pattern) {
-				$elems = @$xpath->evaluate($pattern, $this->readability->dom);
-				if (is_string($elems)) {
-					if (trim($elems) != '') {
-						$this->author[] = trim($elems);
-						$this->debug('Author expression evaluated as string: '.trim($elems));
-						$this->debug("...XPath match: $pattern");
-						break;
-					}
-				} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
-					foreach ($elems as $elem) {
-						if (!isset($elem->parentNode)) continue;
-						$this->author[] = trim($elem->textContent);
-						$this->debug('Author matched: '.trim($elem->textContent));
-					}
-					if (!empty($this->author)) {
-						$this->debug("...XPath match: $pattern");
-						break;
-					}
-				}
-			}
-		}
-		
-		// try to get language
-		$_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
-		foreach ($_lang_xpath as $pattern) {
-			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
-			if (is_string($elems)) {
-				if (trim($elems) != '') {
-					$this->language = trim($elems);
-					$this->debug('Language matched: '.$this->language);
-					break;
-				}
-			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
-				foreach ($elems as $elem) {
-					if (!isset($elem->parentNode)) continue;
-					$this->language = trim($elem->textContent);
-					$this->debug('Language matched: '.$this->language);					
-				}
-				if ($this->language) break;
-			}
-		}
-		
-		// try to get date
-		foreach ($this->config->date as $pattern) {
-			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
-			if (is_string($elems)) {
-				$this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));				
-			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
-				$this->date = $elems->item(0)->textContent;
-				$this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
-				// remove date from document
-				// $elems->item(0)->parentNode->removeChild($elems->item(0));
-			}
-			if (!$this->date) {
-				$this->date = null;
-			} else {
-				$this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
-				$this->debug("...XPath match: $pattern");
-				break;
-			}
-		}
-
-		// strip elements (using xpath expressions)
-		foreach ($this->config->strip as $pattern) {
-			$elems = @$xpath->query($pattern, $this->readability->dom);
-			// check for matches
-			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' elements (strip)');
-				for ($i=$elems->length-1; $i >= 0; $i--) {
-					$elems->item($i)->parentNode->removeChild($elems->item($i));
-				}
-			}
-		}
-		
-		// strip elements (using id and class attribute values)
-		foreach ($this->config->strip_id_or_class as $string) {
-			$string = strtr($string, array("'"=>'', '"'=>''));
-			$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
-			// check for matches
-			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
-				for ($i=$elems->length-1; $i >= 0; $i--) {
-					$elems->item($i)->parentNode->removeChild($elems->item($i));
-				}
-			}
-		}
-		
-		// strip images (using src attribute values)
-		foreach ($this->config->strip_image_src as $string) {
-			$string = strtr($string, array("'"=>'', '"'=>''));
-			$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
-			// check for matches
-			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' image elements');
-				for ($i=$elems->length-1; $i >= 0; $i--) {
-					$elems->item($i)->parentNode->removeChild($elems->item($i));
-				}
-			}
-		}
-		// strip elements using Readability.com and Instapaper.com ignore class names
-		// .entry-unrelated and .instapaper_ignore
-		// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
-		// and http://blog.instapaper.com/post/730281947
-		$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
-		// check for matches
-		if ($elems && $elems->length > 0) {
-			$this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
-			for ($i=$elems->length-1; $i >= 0; $i--) {
-				$elems->item($i)->parentNode->removeChild($elems->item($i));
-			}
-		}
-		
-		// strip elements that contain style="display: none;"
-		$elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
-		// check for matches
-		if ($elems && $elems->length > 0) {
-			$this->debug('Stripping '.$elems->length.' elements with inline display:none style');
-			for ($i=$elems->length-1; $i >= 0; $i--) {
-				$elems->item($i)->parentNode->removeChild($elems->item($i));
-			}
-		}
-		
-		// try to get body
-		foreach ($this->config->body as $pattern) {
-			$elems = @$xpath->query($pattern, $this->readability->dom);
-			// check for matches
-			if ($elems && $elems->length > 0) {
-				$this->debug('Body matched');
-				$this->debug("...XPath match: $pattern");
-				if ($elems->length == 1) {				
-					$this->body = $elems->item(0);
-					// prune (clean up elements that may not be content)
-					if ($this->config->prune()) {
-						$this->debug('...pruning content');
-						$this->readability->prepArticle($this->body);
-					}
-					break;
-				} else {
-					$this->body = $this->readability->dom->createElement('div');
-					$this->debug($elems->length.' body elems found');
-					foreach ($elems as $elem) {
-						if (!isset($elem->parentNode)) continue;
-						$isDescendant = false;
-						foreach ($this->body->childNodes as $parent) {
-							if ($this->isDescendant($parent, $elem)) {
-								$isDescendant = true;
-								break;
-							}
-						}
-						if ($isDescendant) {
-							$this->debug('...element is child of another body element, skipping.');
-						} else {
-							// prune (clean up elements that may not be content)
-							if ($this->config->prune()) {
-								$this->debug('Pruning content');
-								$this->readability->prepArticle($elem);
-							}
-							$this->debug('...element added to body');
-							$this->body->appendChild($elem);
-						}
-					}
-					if ($this->body->hasChildNodes()) break;
-				}
-			}
-		}		
-		
-		// auto detect?
-		$detect_title = $detect_body = $detect_author = $detect_date = false;
-		// detect title?
-		if (!isset($this->title)) {
-			if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
-				$detect_title = true;
-			}
-		}
-		// detect body?
-		if (!isset($this->body)) {
-			if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
-				$detect_body = true;
-			}
-		}
-		// detect author?
-		if (empty($this->author)) {
-			if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
-				$detect_author = true;
-			}
-		}
-		// detect date?
-		if (!isset($this->date)) {
-			if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
-				$detect_date = true;
-			}
-		}
-
-		// check for hNews
-		if ($detect_title || $detect_body) {
-			// check for hentry
-			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
-			if ($elems && $elems->length > 0) {
-				$this->debug('hNews: found hentry');
-				$hentry = $elems->item(0);
-				
-				if ($detect_title) {
-					// check for entry-title
-					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
-					if ($elems && $elems->length > 0) {
-						$this->title = $elems->item(0)->textContent;
-						$this->debug('hNews: found entry-title: '.$this->title);
-						// remove title from document
-						$elems->item(0)->parentNode->removeChild($elems->item(0));
-						$detect_title = false;
-					}
-				}
-				
-				if ($detect_date) {
-					// check for time element with pubdate attribute
-					$elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
-					if ($elems && $elems->length > 0) {
-						$this->date = strtotime(trim($elems->item(0)->textContent));
-						// remove date from document
-						//$elems->item(0)->parentNode->removeChild($elems->item(0));
-						if ($this->date) {
-							$this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
-							$detect_date = false;
-						} else {
-							$this->date = null;
-						}
-					}
-				}
-
-				if ($detect_author) {
-					// check for time element with pubdate attribute
-					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
-					if ($elems && $elems->length > 0) {
-						$author = $elems->item(0);
-						$fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
-						if ($fn && $fn->length > 0) {
-							foreach ($fn as $_fn) {
-								if (trim($_fn->textContent) != '') {
-									$this->author[] = trim($_fn->textContent);
-									$this->debug('hNews: found author: '.trim($_fn->textContent));
-								}
-							}
-						} else {
-							if (trim($author->textContent) != '') {
-								$this->author[] = trim($author->textContent);
-								$this->debug('hNews: found author: '.trim($author->textContent));
-							}
-						}
-						$detect_author = empty($this->author);
-					}
-				}
-				
-				// check for entry-content.
-				// according to hAtom spec, if there are multiple elements marked entry-content,
-				// we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
-				if ($detect_body) {
-					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
-					if ($elems && $elems->length > 0) {
-						$this->debug('hNews: found entry-content');
-						if ($elems->length == 1) {
-							// what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
-							$e = $elems->item(0);
-							if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
-								$this->body = $elems->item(0);
-								// prune (clean up elements that may not be content)
-								if ($this->config->prune()) {
-									$this->debug('Pruning content');
-									$this->readability->prepArticle($this->body);
-								}
-								$detect_body = false;
-							} else {
-								$this->debug('hNews: skipping entry-content - appears not to contain content');
-							}
-							unset($e);
-						} else {
-							$this->body = $this->readability->dom->createElement('div');
-							$this->debug($elems->length.' entry-content elems found');
-							foreach ($elems as $elem) {
-								if (!isset($elem->parentNode)) continue;
-								$isDescendant = false;
-								foreach ($this->body->childNodes as $parent) {
-									if ($this->isDescendant($parent, $elem)) {
-										$isDescendant = true;
-										break;
-									}
-								}
-								if ($isDescendant) {
-									$this->debug('Element is child of another body element, skipping.');
-								} else {
-									// prune (clean up elements that may not be content)
-									if ($this->config->prune()) {
-										$this->debug('Pruning content');
-										$this->readability->prepArticle($elem);
-									}								
-									$this->debug('Element added to body');									
-									$this->body->appendChild($elem);
-								}
-							}
-							$detect_body = false;
-						}
-					}
-				}
-			}
-		}
-
-		// check for elements marked with instapaper_title
-		if ($detect_title) {
-			// check for instapaper_title
-			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
-			if ($elems && $elems->length > 0) {
-				$this->title = $elems->item(0)->textContent;
-				$this->debug('Title found (.instapaper_title): '.$this->title);
-				// remove title from document
-				$elems->item(0)->parentNode->removeChild($elems->item(0));
-				$detect_title = false;
-			}
-		}
-		// check for elements marked with instapaper_body
-		if ($detect_body) {
-			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
-			if ($elems && $elems->length > 0) {
-				$this->debug('body found (.instapaper_body)');
-				$this->body = $elems->item(0);
-				// prune (clean up elements that may not be content)
-				if ($this->config->prune()) {
-					$this->debug('Pruning content');
-					$this->readability->prepArticle($this->body);
-				}
-				$detect_body = false;
-			}
-		}
-		
-		// Find author in rel="author" marked element
-		// We only use this if there's exactly one.
-		// If there's more than one, it could indicate more than
-		// one author, but it could also indicate that we're processing
-		// a page listing different articles with different authors.
-		if ($detect_author) {
-			$elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
-			if ($elems && $elems->length == 1) {
-				$author = trim($elems->item(0)->textContent);
-				if ($author != '') {
-					$this->debug("Author found (rel=\"author\"): $author");
-					$this->author[] = $author;
-					$detect_author = false;
-				}
-			}
-		}
-
-		// Find date in pubdate marked time element
-		// For the same reason given above, we only use this
-		// if there's exactly one element.
-		if ($detect_date) {
-			$elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
-			if ($elems && $elems->length == 1) {
-				$this->date = strtotime(trim($elems->item(0)->textContent));
-				// remove date from document
-				//$elems->item(0)->parentNode->removeChild($elems->item(0));
-				if ($this->date) {
-					$this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
-					$detect_date = false;
-				} else {
-					$this->date = null;
-				}
-			}
-		}
-
-		// still missing title or body, so we detect using Readability
-		if ($detect_title || $detect_body) {
-			$this->debug('Using Readability');
-			// clone body if we're only using Readability for title (otherwise it may interfere with body element)
-			if (isset($this->body)) $this->body = $this->body->cloneNode(true);
-			$success = $this->readability->init();
-		}
-		if ($detect_title) {
-			$this->debug('Detecting title');
-			$this->title = $this->readability->getTitle()->textContent;
-		}
-		if ($detect_body && $success) {
-			$this->debug('Detecting body');
-			$this->body = $this->readability->getContent();
-			if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
-				$this->body = $this->body->firstChild;
-			}
-			// prune (clean up elements that may not be content)
-			if ($this->config->prune()) {
-				$this->debug('Pruning content');
-				$this->readability->prepArticle($this->body);
-			}
-		}
-		if (isset($this->body)) {
-			// remove scripts
-			$this->readability->removeScripts($this->body);
-			// remove any h1-h6 elements that appear as first thing in the body
-			// and which match our title
-			if (isset($this->title) && ($this->title != '')) {
-				$firstChild = $this->body->firstChild;
-				while ($firstChild->nodeType && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
-					$firstChild = $firstChild->nextSibling;
-				}
-				if (($firstChild->nodeType === XML_ELEMENT_NODE)
-					&& in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
-					&& (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
-						$this->body->removeChild($firstChild);
-				}
-			}
-			// prevent self-closing iframes
-			$elems = $this->body->getElementsByTagName('iframe');
-			for ($i = $elems->length-1; $i >= 0; $i--) {
-				$e = $elems->item($i);
-				if (!$e->hasChildNodes()) {
-					$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
-				}
-			}
-			// remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
-			// the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
-			// inside the data-lazy-src attribute. It also places the original image inside a noscript element 
-			// next to the amended one.
-			$elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
-			for ($i = $elems->length-1; $i >= 0; $i--) {
-				$e = $elems->item($i);
-				// let's see if we can grab image from noscript
-				if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
-					$_new_elem = $e->ownerDocument->createDocumentFragment();
-					@$_new_elem->appendXML($e->nextSibling->innerHTML);
-					$e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
-					$e->parentNode->removeChild($e);
-				} else {
-					// Use data-lazy-src as src value
-					$e->setAttribute('src', $e->getAttribute('data-lazy-src'));
-					$e->removeAttribute('data-lazy-src');
-				}
-			}
-		
-			$this->success = true;
-		}
-		
-		// if we've had no success and we've used tidy, there's a chance
-		// that tidy has messed up. So let's try again without tidy...
-		if (!$this->success && $tidied && $smart_tidy) {
-			$this->debug('Trying again without tidy');
-			$this->process($original_html, $url, false);
-		}
-
-		return $this->success;
-	}
-	
-	private function isDescendant(DOMElement $parent, DOMElement $child) {
-		$node = $child->parentNode;
-		while ($node != null) {
-			if ($node->isSameNode($parent))	return true;
-			$node = $node->parentNode;
-		}
-		return false;
-	}
-
-	public function getContent() {
-		return $this->body;
-	}
-	
-	public function getTitle() {
-		return $this->title;
-	}
-	
-	public function getAuthors() {
-		return $this->author;
-	}
-	
-	public function getLanguage() {
-		return $this->language;
-	}
-	
-	public function getDate() {
-		return $this->date;
-	}
-	
-	public function getSiteConfig() {
-		return $this->config;
-	}
-	
-	public function getNextPageUrl() {
-		return $this->nextPageUrl;
-	}
-}
-?>
\ No newline at end of file
+<?php
+/**
+ * Content Extractor
+ * 
+ * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 
+ * to extract content from HTML files.
+ * 
+ * @version 1.0
+ * @date 2013-02-05
+ * @author Keyvan Minoukadeh
+ * @copyright 2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class ContentExtractor
+{
+	protected static $tidy_config = array(
+				 'clean' => true,
+				 'output-xhtml' => true,
+				 'logical-emphasis' => true,
+				 'show-body-only' => false,
+				 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
+				 'new-inline-tags' => 'mark, time, meter, progress, data',
+				 'wrap' => 0,
+				 'drop-empty-paras' => true,
+				 'drop-proprietary-attributes' => false,
+				 'enclose-text' => true,
+				 'enclose-block-text' => true,
+				 'merge-divs' => true,
+				 'merge-spans' => true,
+				 'char-encoding' => 'utf8',
+				 'hide-comments' => true
+				 );
+	protected $html;
+	protected $config;
+	protected $title;
+	protected $author = array();
+	protected $language;
+	protected $date;
+	protected $body;
+	protected $success = false;
+	protected $nextPageUrl;
+	public $allowedParsers = array('libxml', 'html5lib');
+	public $fingerprints = array();
+	public $readability;
+	public $debug = false;
+	public $debugVerbose = false;
+
+	public function __construct($path, $fallback = null) { // registers the site config path and optional fallback path
+		SiteConfig::set_config_path($path, $fallback);
+	}
+	
+	// Prints "* $msg" (plus memory usage in KB when debugVerbose is on) and flushes it; does nothing unless $this->debug.
+	protected function debug($msg) {
+		if (!$this->debug) return;
+		echo '* ',$msg;
+		if ($this->debugVerbose) echo ' - mem used: ',round(memory_get_usage()/1024, 2),' (peak: ',round(memory_get_peak_usage()/1024, 2),')';
+		echo "\n";
+		// ob_flush() raises an E_NOTICE when no output buffer is active,
+		// so only flush PHP's buffer when one exists; flush() still pushes the system buffer
+		if (ob_get_level() > 0) ob_flush();
+		flush();
+	}
+	
+	// Restores the extractor to its pre-process() state so the same instance can be reused.
+	public function reset() {
+		// per-document values that start out unset
+		$_cleared = array('html', 'readability', 'config', 'title', 'body', 'language', 'date', 'nextPageUrl');
+		foreach ($_cleared as $_prop) {
+			$this->$_prop = null;
+		}
+		// authors are collected as a list, so they go back to an empty array
+		$this->author = array();
+		// nothing has been extracted yet
+		$this->success = false;
+	}
+
+	// Returns the hostname for the first fingerprint found in $html (or, for 'head' entries, its first 8000 bytes); false if none.
+	public function findHostUsingFingerprints($html) {
+		$this->debug('Checking fingerprints...');
+		$head = substr($html, 0, 8000);
+		foreach ($this->fingerprints as $_fp => $_fphost) {
+			$haystack = $html;
+			if (is_array($_fphost)) {
+				if (!empty($_fphost['head'])) $haystack = $head;
+				$_fphost = $_fphost['hostname'];
+			}
+			$_fp = (string)$_fp; // numeric array keys arrive as ints, which strpos() did not search for as text before PHP 8
+			if ($_fp !== '' && strpos($haystack, $_fp) !== false) { // an empty fingerprint can never identify a host
+				$this->debug("Found match: $_fphost");
+				return $_fphost;
+			}
+		}
+		$this->debug('No fingerprint matches');
+		return false;
+	}
+	
+	// Returns the SiteConfig to use for $url (joined in order: exact match, wildcard, fingerprint, global, default).
+	public function buildSiteConfig($url, $html='', $add_to_cache=true) { // $html is only used for fingerprint matching; $add_to_cache=false skips every SiteConfig::add_to_cache() call
+		// extract host name: lower-cased, leading "www." removed; @ suppresses any parse_url() error output
+		$host = @parse_url($url, PHP_URL_HOST);
+		$host = strtolower($host);
+		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
+		// is merged version already cached? then return it as-is and skip the rebuild below
+		if (SiteConfig::is_cached("$host.merged")) {
+			$this->debug("Returning cached and merged site config for $host");
+			return SiteConfig::build("$host.merged");
+		}
+		// let's build from site_config/custom/ and standard/ (the un-merged result is cached under the bare host name)
+		$config = SiteConfig::build($host);
+		if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
+			SiteConfig::add_to_cache($host, $config);
+		}
+		// if no match, use defaults (a fresh SiteConfig)
+		if (!$config) $config = new SiteConfig();
+		// load fingerprint config? (only when the config built so far allows autodetection on failure)
+		if ($config->autodetect_on_failure()) {
+			// check HTML for fingerprints; a match names another host whose config is appended to ours
+			if (!empty($this->fingerprints) && ($_fphost = $this->findHostUsingFingerprints($html))) {
+				if ($config_fingerprint = SiteConfig::build($_fphost)) {
+					$this->debug("Appending site config settings from $_fphost (fingerprint match)");
+					$config->append($config_fingerprint);
+					if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
+						//$config_fingerprint->cache_in_apc = true;
+						SiteConfig::add_to_cache($_fphost, $config_fingerprint);
+					}
+				}
+			}
+		}
+		// load global config? (same autodetect_on_failure() gate as the fingerprint step)
+		if ($config->autodetect_on_failure()) {
+			if ($config_global = SiteConfig::build('global', true)) {
+				$this->debug('Appending site config settings from global.txt');
+				$config->append($config_global);
+				if ($add_to_cache && !SiteConfig::is_cached('global')) {
+					//$config_global->cache_in_apc = true;
+					SiteConfig::add_to_cache('global', $config_global);
+				}
+			}
+		}
+		// store copy of merged config under "$host.merged" so the early return above can serve it next time
+		if ($add_to_cache) {
+			// do not store in APC if wildcard match: $use_apc is true only when cache_key equals $host
+			$use_apc = ($host == $config->cache_key);
+			$config->cache_key = null;
+			SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
+		}
+		return $config;
+	}
+	
+	// Extracts title, body, author(s), language, date and next-page URL from $html and keeps them for the get*() accessors.
+	// $url selects the site config and is passed on to Readability. Returns true when a body was found, false otherwise.
+	// $smart_tidy: if Tidy was used and no body was produced, run once more on the untidied HTML. Tidy helps with PHP's
+	// patchy HTML parsing most of the time, but it has problems of its own; passing false skips Tidy altogether.
+	public function process($html, $url, $smart_tidy=true) {
+		$this->reset();
+		$this->config = $this->buildSiteConfig($url, $html);
+		
+		// do string replacements
+		if (!empty($this->config->find_string)) {
+			if (count($this->config->find_string) == count($this->config->replace_string)) {
+				$html = str_replace($this->config->find_string, $this->config->replace_string, $html, $_count);
+				$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
+			} else {
+				$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
+			}
+			unset($_count);
+		}
+		
+		// use tidy (if it exists)?
+		// This fixes problems with some sites which would otherwise
+		// trouble DOMDocument's HTML parsing. (Although sometimes it
+		// makes matters worse, which is why you can override it in site config files.)
+		$tidied = false;
+		if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
+			$this->debug('Using Tidy');
+			$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
+			if (tidy_clean_repair($tidy)) {
+				$original_html = $html;
+				$tidied = true;
+				$html = $tidy->value;
+			}
+			unset($tidy);
+		}
+		
+		// load and parse html
+		$_parser = $this->config->parser();
+		if (!in_array($_parser, $this->allowedParsers)) {
+			$this->debug("HTML parser $_parser not listed, using libxml instead");
+			$_parser = 'libxml';
+		}
+		$this->debug("Attempting to parse HTML with $_parser");
+		$this->readability = new Readability($html, $url, $_parser);
+		
+		// we use xpath to find elements in the given HTML document
+		// see http://en.wikipedia.org/wiki/XPath_1.0
+		$xpath = new DOMXPath($this->readability->dom);
+
+		// try to get next page link (first pattern yielding a string, an element with href, or a non-empty attribute wins)
+		foreach ($this->config->next_page_link as $pattern) {
+			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
+			if (is_string($elems)) {
+				$this->nextPageUrl = trim($elems);
+				break;
+			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+				foreach ($elems as $item) {
+					if ($item instanceof DOMElement && $item->hasAttribute('href')) {
+						$this->nextPageUrl = $item->getAttribute('href');
+						break 2;
+					} elseif ($item instanceof DOMAttr && $item->value) {
+						$this->nextPageUrl = $item->value;
+						break 2;
+					}
+				}
+			}
+		}
+		
+		// try to get title
+		foreach ($this->config->title as $pattern) {
+			// $this->debug("Trying $pattern");
+			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
+			if (is_string($elems)) {
+				$this->title = trim($elems);
+				$this->debug('Title expression evaluated as string: '.$this->title);
+				$this->debug("...XPath match: $pattern");
+				break;
+			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+				$this->title = $elems->item(0)->textContent;
+				$this->debug('Title matched: '.$this->title);
+				$this->debug("...XPath match: $pattern");
+				// remove title from document
+				try {
+					@$elems->item(0)->parentNode->removeChild($elems->item(0));
+				} catch (DOMException $e) {
+					// do nothing
+				}
+				break;
+			}
+		}
+		
+		// try to get author (if it hasn't already been set)
+		if (empty($this->author)) {
+			foreach ($this->config->author as $pattern) {
+				$elems = @$xpath->evaluate($pattern, $this->readability->dom);
+				if (is_string($elems)) {
+					if (trim($elems) != '') {
+						$this->author[] = trim($elems);
+						$this->debug('Author expression evaluated as string: '.trim($elems));
+						$this->debug("...XPath match: $pattern");
+						break;
+					}
+				} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+					foreach ($elems as $elem) {
+						if (!isset($elem->parentNode)) continue;
+						$this->author[] = trim($elem->textContent);
+						$this->debug('Author matched: '.trim($elem->textContent));
+					}
+					if (!empty($this->author)) {
+						$this->debug("...XPath match: $pattern");
+						break;
+					}
+				}
+			}
+		}
+		
+		// try to get language
+		$_lang_xpath = array('//html[@lang]/@lang', '//meta[@name="DC.language"]/@content');
+		foreach ($_lang_xpath as $pattern) {
+			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
+			if (is_string($elems)) {
+				if (trim($elems) != '') {
+					$this->language = trim($elems);
+					$this->debug('Language matched: '.$this->language);
+					break;
+				}
+			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+				foreach ($elems as $elem) {
+					if (!isset($elem->parentNode)) continue;
+					$this->language = trim($elem->textContent);
+					$this->debug('Language matched: '.$this->language);
+				}
+				if ($this->language) break;
+			}
+		}
+		
+		// try to get date (stored as the timestamp produced by strtotime(); unparsable values leave it null)
+		foreach ($this->config->date as $pattern) {
+			$elems = @$xpath->evaluate($pattern, $this->readability->dom);
+			if (is_string($elems)) {
+				$this->date = strtotime(trim($elems, "; \t\n\r\0\x0B"));
+			} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+				$this->date = $elems->item(0)->textContent;
+				$this->date = strtotime(trim($this->date, "; \t\n\r\0\x0B"));
+				// remove date from document
+				// $elems->item(0)->parentNode->removeChild($elems->item(0));
+			}
+			if (!$this->date) {
+				$this->date = null;
+			} else {
+				$this->debug('Date matched: '.date('Y-m-d H:i:s', $this->date));
+				$this->debug("...XPath match: $pattern");
+				break;
+			}
+		}
+
+		// strip elements (using xpath expressions)
+		foreach ($this->config->strip as $pattern) {
+			$elems = @$xpath->query($pattern, $this->readability->dom);
+			// check for matches
+			if ($elems && $elems->length > 0) {
+				$this->debug('Stripping '.$elems->length.' elements (strip)');
+				for ($i=$elems->length-1; $i >= 0; $i--) {
+					$elems->item($i)->parentNode->removeChild($elems->item($i));
+				}
+			}
+		}
+		
+		// strip elements (using id and class attribute values; quotes are dropped so the value cannot break the expression)
+		foreach ($this->config->strip_id_or_class as $string) {
+			$string = strtr($string, array("'"=>'', '"'=>''));
+			$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
+			// check for matches
+			if ($elems && $elems->length > 0) {
+				$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
+				for ($i=$elems->length-1; $i >= 0; $i--) {
+					$elems->item($i)->parentNode->removeChild($elems->item($i));
+				}
+			}
+		}
+		
+		// strip images (using src attribute values)
+		foreach ($this->config->strip_image_src as $string) {
+			$string = strtr($string, array("'"=>'', '"'=>''));
+			$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
+			// check for matches
+			if ($elems && $elems->length > 0) {
+				$this->debug('Stripping '.$elems->length.' image elements');
+				for ($i=$elems->length-1; $i >= 0; $i--) {
+					$elems->item($i)->parentNode->removeChild($elems->item($i));
+				}
+			}
+		}
+		// strip elements using Readability.com and Instapaper.com ignore class names
+		// .entry-unrelated and .instapaper_ignore
+		// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
+		// and http://blog.instapaper.com/post/730281947
+		$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' entry-unrelated ') or contains(concat(' ',normalize-space(@class),' '),' instapaper_ignore ')]", $this->readability->dom);
+		// check for matches
+		if ($elems && $elems->length > 0) {
+			$this->debug('Stripping '.$elems->length.' .entry-unrelated,.instapaper_ignore elements');
+			for ($i=$elems->length-1; $i >= 0; $i--) {
+				$elems->item($i)->parentNode->removeChild($elems->item($i));
+			}
+		}
+		
+		// strip elements hidden by an inline style (only the exact text "display:none" is matched, so "display: none" is not)
+		$elems = @$xpath->query("//*[contains(@style,'display:none')]", $this->readability->dom);
+		// check for matches
+		if ($elems && $elems->length > 0) {
+			$this->debug('Stripping '.$elems->length.' elements with inline display:none style');
+			for ($i=$elems->length-1; $i >= 0; $i--) {
+				$elems->item($i)->parentNode->removeChild($elems->item($i));
+			}
+		}
+		
+		// try to get body
+		foreach ($this->config->body as $pattern) {
+			$elems = @$xpath->query($pattern, $this->readability->dom);
+			// check for matches
+			if ($elems && $elems->length > 0) {
+				$this->debug('Body matched');
+				$this->debug("...XPath match: $pattern");
+				if ($elems->length == 1) {
+					$this->body = $elems->item(0);
+					// prune (clean up elements that may not be content)
+					if ($this->config->prune()) {
+						$this->debug('...pruning content');
+						$this->readability->prepArticle($this->body);
+					}
+					break;
+				} else {
+					$this->body = $this->readability->dom->createElement('div');
+					$this->debug($elems->length.' body elems found');
+					foreach ($elems as $elem) {
+						if (!isset($elem->parentNode)) continue;
+						$isDescendant = false;
+						foreach ($this->body->childNodes as $parent) {
+							if ($this->isDescendant($parent, $elem)) {
+								$isDescendant = true;
+								break;
+							}
+						}
+						if ($isDescendant) {
+							$this->debug('...element is child of another body element, skipping.');
+						} else {
+							// prune (clean up elements that may not be content)
+							if ($this->config->prune()) {
+								$this->debug('Pruning content');
+								$this->readability->prepArticle($elem);
+							}
+							$this->debug('...element added to body');
+							$this->body->appendChild($elem);
+						}
+					}
+					if ($this->body->hasChildNodes()) break;
+				}
+			}
+		}
+		
+		// auto detect? (a field is auto-detected when it has no patterns at all, or its patterns failed and the config allows it)
+		$detect_title = $detect_body = $detect_author = $detect_date = false;
+		// detect title?
+		if (!isset($this->title)) {
+			if (empty($this->config->title) || $this->config->autodetect_on_failure()) {
+				$detect_title = true;
+			}
+		}
+		// detect body?
+		if (!isset($this->body)) {
+			if (empty($this->config->body) || $this->config->autodetect_on_failure()) {
+				$detect_body = true;
+			}
+		}
+		// detect author?
+		if (empty($this->author)) {
+			if (empty($this->config->author) || $this->config->autodetect_on_failure()) {
+				$detect_author = true;
+			}
+		}
+		// detect date?
+		if (!isset($this->date)) {
+			if (empty($this->config->date) || $this->config->autodetect_on_failure()) {
+				$detect_date = true;
+			}
+		}
+
+		// check for hNews
+		if ($detect_title || $detect_body) {
+			// check for hentry
+			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' hentry ')]", $this->readability->dom);
+			if ($elems && $elems->length > 0) {
+				$this->debug('hNews: found hentry');
+				$hentry = $elems->item(0);
+				
+				if ($detect_title) {
+					// check for entry-title
+					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-title ')]", $hentry);
+					if ($elems && $elems->length > 0) {
+						$this->title = $elems->item(0)->textContent;
+						$this->debug('hNews: found entry-title: '.$this->title);
+						// remove title from document
+						$elems->item(0)->parentNode->removeChild($elems->item(0));
+						$detect_title = false;
+					}
+				}
+				
+				if ($detect_date) {
+					// check for time element with pubdate attribute
+					$elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
+					if ($elems && $elems->length > 0) {
+						$this->date = strtotime(trim($elems->item(0)->textContent));
+						// remove date from document
+						//$elems->item(0)->parentNode->removeChild($elems->item(0));
+						if ($this->date) {
+							$this->debug('hNews: found publication date: '.date('Y-m-d H:i:s', $this->date));
+							$detect_date = false;
+						} else {
+							$this->date = null;
+						}
+					}
+				}
+
+				if ($detect_author) {
+					// check for a vcard element marked as author or byline; names come from its fn children, else from its own text
+					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' vcard ') and (contains(concat(' ',normalize-space(@class),' '),' author ') or contains(concat(' ',normalize-space(@class),' '),' byline '))]", $hentry);
+					if ($elems && $elems->length > 0) {
+						$author = $elems->item(0);
+						$fn = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' fn ')]", $author);
+						if ($fn && $fn->length > 0) {
+							foreach ($fn as $_fn) {
+								if (trim($_fn->textContent) != '') {
+									$this->author[] = trim($_fn->textContent);
+									$this->debug('hNews: found author: '.trim($_fn->textContent));
+								}
+							}
+						} else {
+							if (trim($author->textContent) != '') {
+								$this->author[] = trim($author->textContent);
+								$this->debug('hNews: found author: '.trim($author->textContent));
+							}
+						}
+						$detect_author = empty($this->author);
+					}
+				}
+				
+				// check for entry-content.
+				// according to hAtom spec, if there are multiple elements marked entry-content,
+				// we include all of these in the order they appear - see http://microformats.org/wiki/hatom#Entry_Content
+				if ($detect_body) {
+					$elems = @$xpath->query(".//*[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]", $hentry);
+					if ($elems && $elems->length > 0) {
+						$this->debug('hNews: found entry-content');
+						if ($elems->length == 1) {
+							// what if it's empty? (some sites misuse hNews - place their content outside an empty entry-content element)
+							$e = $elems->item(0);
+							if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
+								$this->body = $elems->item(0);
+								// prune (clean up elements that may not be content)
+								if ($this->config->prune()) {
+									$this->debug('Pruning content');
+									$this->readability->prepArticle($this->body);
+								}
+								$detect_body = false;
+							} else {
+								$this->debug('hNews: skipping entry-content - appears not to contain content');
+							}
+							unset($e);
+						} else {
+							$this->body = $this->readability->dom->createElement('div');
+							$this->debug($elems->length.' entry-content elems found');
+							foreach ($elems as $elem) {
+								if (!isset($elem->parentNode)) continue;
+								$isDescendant = false;
+								foreach ($this->body->childNodes as $parent) {
+									if ($this->isDescendant($parent, $elem)) {
+										$isDescendant = true;
+										break;
+									}
+								}
+								if ($isDescendant) {
+									$this->debug('Element is child of another body element, skipping.');
+								} else {
+									// prune (clean up elements that may not be content)
+									if ($this->config->prune()) {
+										$this->debug('Pruning content');
+										$this->readability->prepArticle($elem);
+									}
+									$this->debug('Element added to body');
+									$this->body->appendChild($elem);
+								}
+							}
+							$detect_body = false;
+						}
+					}
+				}
+			}
+		}
+
+		// check for elements marked with instapaper_title
+		if ($detect_title) {
+			// check for instapaper_title
+			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_title ')]", $this->readability->dom);
+			if ($elems && $elems->length > 0) {
+				$this->title = $elems->item(0)->textContent;
+				$this->debug('Title found (.instapaper_title): '.$this->title);
+				// remove title from document
+				$elems->item(0)->parentNode->removeChild($elems->item(0));
+				$detect_title = false;
+			}
+		}
+		// check for elements marked with instapaper_body
+		if ($detect_body) {
+			$elems = @$xpath->query("//*[contains(concat(' ',normalize-space(@class),' '),' instapaper_body ')]", $this->readability->dom);
+			if ($elems && $elems->length > 0) {
+				$this->debug('body found (.instapaper_body)');
+				$this->body = $elems->item(0);
+				// prune (clean up elements that may not be content)
+				if ($this->config->prune()) {
+					$this->debug('Pruning content');
+					$this->readability->prepArticle($this->body);
+				}
+				$detect_body = false;
+			}
+		}
+		
+		// Find author in rel="author" marked element
+		// We only use this if there's exactly one.
+		// If there's more than one, it could indicate more than
+		// one author, but it could also indicate that we're processing
+		// a page listing different articles with different authors.
+		if ($detect_author) {
+			$elems = @$xpath->query("//a[contains(concat(' ',normalize-space(@rel),' '),' author ')]", $this->readability->dom);
+			if ($elems && $elems->length == 1) {
+				$author = trim($elems->item(0)->textContent);
+				if ($author != '') {
+					$this->debug("Author found (rel=\"author\"): $author");
+					$this->author[] = $author;
+					$detect_author = false;
+				}
+			}
+		}
+
+		// Find date in pubdate marked time element
+		// For the same reason given above, we only use this
+		// if there's exactly one element.
+		if ($detect_date) {
+			$elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
+			if ($elems && $elems->length == 1) {
+				$this->date = strtotime(trim($elems->item(0)->textContent));
+				// remove date from document
+				//$elems->item(0)->parentNode->removeChild($elems->item(0));
+				if ($this->date) {
+					$this->debug('Date found (pubdate marked time element): '.date('Y-m-d H:i:s', $this->date));
+					$detect_date = false;
+				} else {
+					$this->date = null;
+				}
+			}
+		}
+
+		// still missing title or body, so we detect using Readability
+		if ($detect_title || $detect_body) {
+			$this->debug('Using Readability');
+			// clone body if we're only using Readability for title (otherwise it may interfere with body element)
+			if (isset($this->body)) $this->body = $this->body->cloneNode(true);
+			$success = $this->readability->init();
+		}
+		if ($detect_title) {
+			$this->debug('Detecting title');
+			$this->title = $this->readability->getTitle()->textContent;
+		}
+		if ($detect_body && $success) {
+			$this->debug('Detecting body');
+			$this->body = $this->readability->getContent();
+			if ($this->body->childNodes->length == 1 && $this->body->firstChild->nodeType === XML_ELEMENT_NODE) {
+				$this->body = $this->body->firstChild;
+			}
+			// prune (clean up elements that may not be content)
+			if ($this->config->prune()) {
+				$this->debug('Pruning content');
+				$this->readability->prepArticle($this->body);
+			}
+		}
+		if (isset($this->body)) {
+			// remove scripts
+			$this->readability->removeScripts($this->body);
+			// remove any h1-h6 element that is the first element in the body and matches our title
+			// (the scan stops on null, so a body without element children is left untouched)
+			if (isset($this->title) && ($this->title != '')) {
+				$firstChild = $this->body->firstChild;
+				while (($firstChild !== null) && ($firstChild->nodeType !== XML_ELEMENT_NODE)) {
+					$firstChild = $firstChild->nextSibling;
+				}
+				if (($firstChild !== null)
+					&& in_array(strtolower($firstChild->tagName), array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
+					&& (strtolower(trim($firstChild->textContent)) == strtolower(trim($this->title)))) {
+						$this->body->removeChild($firstChild);
+				}
+			}
+			// prevent self-closing iframes
+			$elems = $this->body->getElementsByTagName('iframe');
+			for ($i = $elems->length-1; $i >= 0; $i--) {
+				$e = $elems->item($i);
+				if (!$e->hasChildNodes()) {
+					$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
+				}
+			}
+			// remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
+			// the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
+			// inside the data-lazy-src attribute. It also places the original image inside a noscript element
+			// next to the amended one. The path is relative to the body: a leading // would search from the document root.
+			$elems = @$xpath->query("descendant-or-self::img[@data-lazy-src]", $this->body);
+			for ($i = ($elems ? $elems->length : 0) - 1; $i >= 0; $i--) {
+				$e = $elems->item($i);
+				// let's see if we can grab image from noscript
+				if ($e->nextSibling !== null && $e->nextSibling->nodeName === 'noscript') {
+					$_new_elem = $e->ownerDocument->createDocumentFragment();
+					@$_new_elem->appendXML($e->nextSibling->innerHTML);
+					$e->nextSibling->parentNode->replaceChild($_new_elem, $e->nextSibling);
+					$e->parentNode->removeChild($e);
+				} else {
+					// Use data-lazy-src as src value
+					$e->setAttribute('src', $e->getAttribute('data-lazy-src'));
+					$e->removeAttribute('data-lazy-src');
+				}
+			}
+		
+			$this->success = true;
+		}
+		
+		// if we've had no success and we've used tidy, there's a chance
+		// that tidy has messed up. So let's try again without tidy...
+		if (!$this->success && $tidied && $smart_tidy) {
+			$this->debug('Trying again without tidy');
+			$this->process($original_html, $url, false);
+		}
+
+		return $this->success;
+	}
+	
+	private function isDescendant(DOMNode $parent, DOMNode $child) {
+		// True when $parent is an ancestor of $child. Typed as DOMNode because XPath matches are not always elements.
+		for ($node = $child->parentNode; $node !== null; $node = $node->parentNode) {
+			if ($node->isSameNode($parent)) return true;
+		}
+		// walked up to the top of the tree without meeting $parent
+		return false;
+	}
+
+	public function getContent() { // body node chosen by the last process() call; null after reset() or when no body was found
+		return $this->body;
+	}
+	
+	public function getTitle() { // title text found by the last process() call; null when none was set
+		return $this->title;
+	}
+	
+	public function getAuthors() { // array of author strings collected by process(); empty array when none were found
+		return $this->author;
+	}
+	
+	public function getLanguage() { // language value read from the html lang attribute or DC.language meta tag; null when neither matched
+		return $this->language;
+	}
+	
+	public function getDate() { // timestamp produced by strtotime() in process(); null when no date could be parsed
+		return $this->date;
+	}
+	
+	public function getSiteConfig() { // config that process() obtained from buildSiteConfig(); null after reset()
+		return $this->config;
+	}
+	
+	public function getNextPageUrl() { // URL taken from the first matching next_page_link pattern; null when none matched
+		return $this->nextPageUrl;
+	}
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
index c5e300d7..1f6a7603 100644
--- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php
+++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php
@@ -1,338 +1,343 @@
-<?php
-/**
- * Site Config
- * 
- * Each instance of this class should hold extraction patterns and other directives
- * for a website. See ContentExtractor class to see how it's used.
- * 
- * @version 0.7
- * @date 2012-08-27
- * @author Keyvan Minoukadeh
- * @copyright 2012 Keyvan Minoukadeh
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
- */
-
-class SiteConfig
-{
-	// Use first matching element as title (0 or more xpath expressions)
-	public $title = array();
-	
-	// Use first matching element as body (0 or more xpath expressions)
-	public $body = array();
-	
-	// Use first matching element as author (0 or more xpath expressions)
-	public $author = array();
-	
-	// Use first matching element as date (0 or more xpath expressions)
-	public $date = array();
-	
-	// Strip elements matching these xpath expressions (0 or more)
-	public $strip = array();
-	
-	// Strip elements which contain these strings (0 or more) in the id or class attribute 
-	public $strip_id_or_class = array();
-	
-	// Strip images which contain these strings (0 or more) in the src attribute 
-	public $strip_image_src = array();
-	
-	// Additional HTTP headers to send
-	// NOT YET USED
-	public $http_header = array();
-	
-	// Process HTML with tidy before creating DOM (bool or null if undeclared)
-	public $tidy = null;
-	
-	protected $default_tidy = true; // used if undeclared
-	
-	// Autodetect title/body if xpath expressions fail to produce results.
-	// Note that this applies to title and body separately, ie. 
-	//   * if we get a body match but no title match, this option will determine whether we autodetect title 
-	//   * if neither match, this determines whether we autodetect title and body.
-	// Also note that this only applies when there is at least one xpath expression in title or body, ie.
-	//   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
-	//   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
-	// Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
-	// bool or null if undeclared
-	public $autodetect_on_failure = null;
-	protected $default_autodetect_on_failure = true; // used if undeclared
-	
-	// Clean up content block - attempt to remove elements that appear to be superfluous
-	// bool or null if undeclared
-	public $prune = null;
-	protected $default_prune = true; // used if undeclared
-	
-	// Test URL - if present, can be used to test the config above
-	public $test_url = array();
-	
-	// Single-page link - should identify a link element or URL pointing to the page holding the entire article
-	// This is useful for sites which split their articles across multiple pages. Links to such pages tend to 
-	// display the first page with links to the other pages at the bottom. Often there is also a link to a page
-	// which displays the entire article on one page (e.g. 'print view').
-	// This should be an XPath expression identifying the link to that page. If present and we find a match,
-	// we will retrieve that page and the rest of the options in this config will be applied to the new page.
-	public $single_page_link = array();
-	
-	public $next_page_link = array();
-	
-	// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
-	public $single_page_link_in_feed = array();
-	
-	// Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
-	// string or null if undeclared
-	public $parser = null;
-	protected $default_parser = 'libxml'; // used if undeclared
-	
-	// Strings to search for in HTML before processing begins (used with $replace_string)
-	public $find_string = array();
-	// Strings to replace those found in $find_string before HTML processing begins
-	public $replace_string = array();
-	
-	// the options below cannot be set in the config files which this class represents
-	
-	//public $cache_in_apc = false; // used to decide if we should cache in apc or not
-	public $cache_key = null;
-	public static $debug = false;
-	protected static $apc = false;
-	protected static $config_path;
-	protected static $config_path_fallback;
-	protected static $config_cache = array();
-	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
-	
-	protected static function debug($msg) {
-		if (self::$debug) {
-			//$mem = round(memory_get_usage()/1024, 2);
-			//$memPeak = round(memory_get_peak_usage()/1024, 2);
-			echo '* ',$msg;
-			//echo ' - mem used: ',$mem," (peak: $memPeak)\n";
-			echo "\n";
-			ob_flush();
-			flush();
-		}
-	}
-	
-	// enable APC caching of certain site config files?
-	// If enabled the following site config files will be 
-	// cached in APC cache (when requested for first time):
-	// * anything in site_config/custom/ and its corresponding file in site_config/standard/
-	// * the site config files associated with HTML fingerprints
-	// * the global site config file
-	// returns true if enabled, false otherwise
-	public static function use_apc($apc=true) {
-		if (!function_exists('apc_add')) {
-			if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
-			return false;
-		}
-		self::$apc = $apc;
-		return $apc;
-	}
-	
-	// return bool or null
-	public function tidy($use_default=true) {
-		if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
-		return $this->tidy;
-	}
-	
-	// return bool or null
-	public function prune($use_default=true) {
-		if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
-		return $this->prune;
-	}
-	
-	// return string or null
-	public function parser($use_default=true) {
-		if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
-		return $this->parser;
-	}
-
-	// return bool or null
-	public function autodetect_on_failure($use_default=true) {
-		if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
-		return $this->autodetect_on_failure;
-	}
-	
-	public static function set_config_path($path, $fallback=null) {
-		self::$config_path = $path;
-		self::$config_path_fallback = $fallback;
-	}
-	
-	public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
-		$key = strtolower($key);
-		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
-		if ($config->cache_key) $key = $config->cache_key;
-		self::$config_cache[$key] = $config;
-		if (self::$apc && $use_apc) {
-			self::debug("Adding site config to APC cache with key sc.$key");
-			apc_add("sc.$key", $config);
-		}
-		self::debug("Cached site config with key $key");
-	}
-	
-	public static function is_cached($key) {
-		$key = strtolower($key);
-		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
-		if (array_key_exists($key, self::$config_cache)) {
-			return true;
-		} elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
-			return true;
-		}
-		return false;
-	}
-	
-	public function append(SiteConfig $newconfig) {
-		// check for commands where we accept multiple statements (no test_url)
-		foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) {
-			// append array elements for this config variable from $newconfig to this config
-			//$this->$var = $this->$var + $newconfig->$var;
-			$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
-		}
-		// check for single statement commands
-		// we do not overwrite existing non null values
-		foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
-			if ($this->$var === null) $this->$var = $newconfig->$var;
-		}
-	}
-	
-	// returns SiteConfig instance if an appropriate one is found, false otherwise
-	// if $exact_host_match is true, we will not look for wildcard config matches
-	// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
-	public static function build($host, $exact_host_match=false) {
-		$host = strtolower($host);
-		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
-		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
-		// check for site configuration
-		$try = array($host);
-		// should we look for wildcard matches 
-		if (!$exact_host_match) {
-			$split = explode('.', $host);
-			if (count($split) > 1) {
-				array_shift($split);
-				$try[] = '.'.implode('.', $split);
-			}
-		}
-		
-		// look for site config file in primary folder
-		self::debug(". looking for site config for $host in primary folder");
-		foreach ($try as $h) {
-			if (array_key_exists($h, self::$config_cache)) {
-				self::debug("... site config for $h already loaded in this request");
-				return self::$config_cache[$h];
-			} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
-				self::debug("... site config for $h in APC cache");
-				return $sconfig;
-			} elseif (file_exists(self::$config_path."/$h.txt")) {
-				self::debug("... found site config ($h.txt)");
-				$file_primary = self::$config_path."/$h.txt";
-				$matched_name = $h;
-				break;
-			}
-		}
-		
-		// if we found site config, process it
-		if (isset($file_primary)) {
-			$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
-			if (!$config_lines || !is_array($config_lines)) return false;
-			$config = self::build_from_array($config_lines);
-			// if APC caching is available and enabled, mark this for cache
-			//$config->cache_in_apc = true;
-			$config->cache_key = $matched_name;
-			
-			// if autodetec on failure is off (on by default) we do not need to look
-			// in secondary folder
-			if (!$config->autodetect_on_failure()) {
-				self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
-				return $config;
-			}
-		}
-		
-		// look for site config file in secondary folder
-		if (isset(self::$config_path_fallback)) {
-			self::debug(". looking for site config for $host in secondary folder");
-			foreach ($try as $h) {
-				if (file_exists(self::$config_path_fallback."/$h.txt")) {
-					self::debug("... found site config in secondary folder ($h.txt)");
-					$file_secondary = self::$config_path_fallback."/$h.txt";
-					$matched_name = $h;
-					break;
-				}
-			}
-			if (!isset($file_secondary)) {
-				self::debug("... no site config match in secondary folder");
-			}
-		}
-		
-		// return false if no config file found
-		if (!isset($file_primary) && !isset($file_secondary)) {
-			self::debug("... no site config match for $host");
-			return false;
-		}
-		
-		// return primary config if secondary not found
-		if (!isset($file_secondary) && isset($config)) {
-			return $config;
-		}
-		
-		// process secondary config file
-		$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
-		if (!$config_lines || !is_array($config_lines)) {
-			// failed to process secondary
-			if (isset($config)) {
-				// return primary config
-				return $config;
-			} else {
-				return false;
-			}
-		}
-		
-		// merge with primary and return
-		if (isset($config)) {
-			self::debug('. merging config files');
-			$config->append(self::build_from_array($config_lines));
-			return $config;
-		} else {
-			// return just secondary
-			$config = self::build_from_array($config_lines);
-			// if APC caching is available and enabled, mark this for cache
-			//$config->cache_in_apc = true;
-			$config->cache_key = $matched_name;
-			return $config;
-		}
-	}
-	
-	public static function build_from_array(array $lines) {
-		$config = new SiteConfig();
-		foreach ($lines as $line) {
-			$line = trim($line);
-			
-			// skip comments, empty lines
-			if ($line == '' || $line[0] == '#') continue;
-			
-			// get command
-			$command = explode(':', $line, 2);
-			// if there's no colon ':', skip this line
-			if (count($command) != 2) continue;
-			$val = trim($command[1]);
-			$command = trim($command[0]);
-			if ($command == '' || $val == '') continue;
-			
-			// check for commands where we accept multiple statements
-			if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
-				array_push($config->$command, $val);
-			// check for single statement commands that evaluate to true or false
-			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
-				$config->$command = ($val == 'yes');
-			// check for single statement commands stored as strings
-			} elseif (in_array($command, array('parser'))) {
-				$config->$command = $val;
-			// check for replace_string(find): replace
-			} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
-				if (in_array($match[1], array('replace_string'))) {
-					$command = $match[1];
-					array_push($config->find_string, $match[2]);
-					array_push($config->$command, $val);
-				}
-			}
-		}
-		return $config;
-	}
-}
-?>
\ No newline at end of file
+<?php
+/**
+ * Site Config
+ * 
+ * Each instance of this class should hold extraction patterns and other directives
+ * for a website. See ContentExtractor class to see how it's used.
+ * 
+ * @version 0.8
+ * @date 2013-04-16
+ * @author Keyvan Minoukadeh
+ * @copyright 2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class SiteConfig
+{
+	// Use first matching element as title (0 or more xpath expressions)
+	public $title = array();
+	
+	// Use first matching element as body (0 or more xpath expressions)
+	public $body = array();
+	
+	// Use first matching element as author (0 or more xpath expressions)
+	public $author = array();
+	
+	// Use first matching element as date (0 or more xpath expressions)
+	public $date = array();
+	
+	// Strip elements matching these xpath expressions (0 or more)
+	public $strip = array();
+	
+	// Strip elements which contain these strings (0 or more) in the id or class attribute 
+	public $strip_id_or_class = array();
+	
+	// Strip images which contain these strings (0 or more) in the src attribute 
+	public $strip_image_src = array();
+	
+	// Additional HTTP headers to send
+	// NOT YET USED
+	public $http_header = array();
+	
+	// Process HTML with tidy before creating DOM (bool or null if undeclared)
+	public $tidy = null;
+	
+	protected $default_tidy = true; // used if undeclared
+	
+	// Autodetect title/body if xpath expressions fail to produce results.
+	// Note that this applies to title and body separately, ie. 
+	//   * if we get a body match but no title match, this option will determine whether we autodetect title 
+	//   * if neither match, this determines whether we autodetect title and body.
+	// Also note that this only applies when there is at least one xpath expression in title or body, ie.
+	//   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
+	//   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
+	// Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
+	// bool or null if undeclared
+	public $autodetect_on_failure = null;
+	protected $default_autodetect_on_failure = true; // used if undeclared
+	
+	// Clean up content block - attempt to remove elements that appear to be superfluous
+	// bool or null if undeclared
+	public $prune = null;
+	protected $default_prune = true; // used if undeclared
+	
+	// Test URL - if present, can be used to test the config above
+	public $test_url = array();
+	
+	// Single-page link - should identify a link element or URL pointing to the page holding the entire article
+	// This is useful for sites which split their articles across multiple pages. Links to such pages tend to 
+	// display the first page with links to the other pages at the bottom. Often there is also a link to a page
+	// which displays the entire article on one page (e.g. 'print view').
+	// This should be an XPath expression identifying the link to that page. If present and we find a match,
+	// we will retrieve that page and the rest of the options in this config will be applied to the new page.
+	public $single_page_link = array();
+	
+	public $next_page_link = array();
+	
+	// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
+	public $single_page_link_in_feed = array();
+	
+	// Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
+	// string or null if undeclared
+	public $parser = null;
+	protected $default_parser = 'libxml'; // used if undeclared
+	
+	// Strings to search for in HTML before processing begins (used with $replace_string)
+	public $find_string = array();
+	// Strings to replace those found in $find_string before HTML processing begins
+	public $replace_string = array();
+	
+	// the options below cannot be set in the config files which this class represents
+	
+	//public $cache_in_apc = false; // used to decide if we should cache in apc or not
+	public $cache_key = null;
+	public static $debug = false;
+	protected static $apc = false;
+	protected static $config_path;
+	protected static $config_path_fallback;
+	protected static $config_cache = array();
+	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
+	
+	protected static function debug($msg) {
+		if (self::$debug) {
+			//$mem = round(memory_get_usage()/1024, 2);
+			//$memPeak = round(memory_get_peak_usage()/1024, 2);
+			echo '* ',$msg;
+			//echo ' - mem used: ',$mem," (peak: $memPeak)\n";
+			echo "\n";
+			if (ob_get_level() > 0) ob_flush(); // ob_flush() raises a notice when no output buffer is active
+			flush();
+		}
+	}
+	
+	// enable APC caching of certain site config files?
+	// If enabled the following site config files will be 
+	// cached in APC cache (when requested for first time):
+	// * anything in site_config/custom/ and its corresponding file in site_config/standard/
+	// * the site config files associated with HTML fingerprints
+	// * the global site config file
+	// returns true if enabled, false otherwise
+	public static function use_apc($apc=true) {
+		if (!function_exists('apc_add')) {
+			if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
+			return false;
+		}
+		self::$apc = $apc;
+		return $apc;
+	}
+	
+	// return bool or null
+	public function tidy($use_default=true) {
+		if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
+		return $this->tidy;
+	}
+	
+	// return bool or null
+	public function prune($use_default=true) {
+		if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
+		return $this->prune;
+	}
+	
+	// return string or null
+	public function parser($use_default=true) {
+		if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
+		return $this->parser;
+	}
+
+	// return bool or null
+	public function autodetect_on_failure($use_default=true) {
+		if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
+		return $this->autodetect_on_failure;
+	}
+	
+	public static function set_config_path($path, $fallback=null) {
+		self::$config_path = $path;
+		self::$config_path_fallback = $fallback;
+	}
+	
+	public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
+		$key = strtolower($key);
+		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
+		if ($config->cache_key) $key = $config->cache_key;
+		self::$config_cache[$key] = $config;
+		if (self::$apc && $use_apc) {
+			self::debug("Adding site config to APC cache with key sc.$key");
+			apc_add("sc.$key", $config);
+		}
+		self::debug("Cached site config with key $key");
+	}
+	
+	public static function is_cached($key) {
+		$key = strtolower($key);
+		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
+		if (array_key_exists($key, self::$config_cache)) {
+			return true;
+		} elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
+			return true;
+		}
+		return false;
+	}
+	
+	public function append(SiteConfig $newconfig) {
+		// check for commands where we accept multiple statements (no test_url)
+		foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
+			// append array elements for this config variable from $newconfig to this config
+			//$this->$var = $this->$var + $newconfig->$var;
+			$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
+		}
+		// check for single statement commands
+		// we do not overwrite existing non null values
+		foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
+			if ($this->$var === null) $this->$var = $newconfig->$var;
+		}
+		// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
+		foreach (array('find_string', 'replace_string') as $var) {
+			// append array elements for this config variable from $newconfig to this config
+			//$this->$var = $this->$var + $newconfig->$var;
+			$this->$var = array_merge($this->$var, $newconfig->$var);
+		}
+	}
+	
+	// returns SiteConfig instance if an appropriate one is found, false otherwise
+	// if $exact_host_match is true, we will not look for wildcard config matches
+	// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
+	public static function build($host, $exact_host_match=false) {
+		$host = strtolower($host);
+		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
+		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
+		// check for site configuration
+		$try = array($host);
+		// should we look for wildcard matches 
+		if (!$exact_host_match) {
+			$split = explode('.', $host);
+			if (count($split) > 1) {
+				array_shift($split);
+				$try[] = '.'.implode('.', $split);
+			}
+		}
+		
+		// look for site config file in primary folder
+		self::debug(". looking for site config for $host in primary folder");
+		foreach ($try as $h) {
+			if (array_key_exists($h, self::$config_cache)) {
+				self::debug("... site config for $h already loaded in this request");
+				return self::$config_cache[$h];
+			} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
+				self::debug("... site config for $h in APC cache");
+				return $sconfig;
+			} elseif (file_exists(self::$config_path."/$h.txt")) {
+				self::debug("... found site config ($h.txt)");
+				$file_primary = self::$config_path."/$h.txt";
+				$matched_name = $h;
+				break;
+			}
+		}
+		
+		// if we found site config, process it
+		if (isset($file_primary)) {
+			$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+			if (!$config_lines || !is_array($config_lines)) return false;
+			$config = self::build_from_array($config_lines);
+			// if APC caching is available and enabled, mark this for cache
+			//$config->cache_in_apc = true;
+			$config->cache_key = $matched_name;
+			
+			// if autodetec on failure is off (on by default) we do not need to look
+			// in secondary folder
+			if (!$config->autodetect_on_failure()) {
+				self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
+				return $config;
+			}
+		}
+		
+		// look for site config file in secondary folder
+		if (isset(self::$config_path_fallback)) {
+			self::debug(". looking for site config for $host in secondary folder");
+			foreach ($try as $h) {
+				if (file_exists(self::$config_path_fallback."/$h.txt")) {
+					self::debug("... found site config in secondary folder ($h.txt)");
+					$file_secondary = self::$config_path_fallback."/$h.txt";
+					$matched_name = $h;
+					break;
+				}
+			}
+			if (!isset($file_secondary)) {
+				self::debug("... no site config match in secondary folder");
+			}
+		}
+		
+		// return false if no config file found
+		if (!isset($file_primary) && !isset($file_secondary)) {
+			self::debug("... no site config match for $host");
+			return false;
+		}
+		
+		// return primary config if secondary not found
+		if (!isset($file_secondary) && isset($config)) {
+			return $config;
+		}
+		
+		// process secondary config file
+		$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+		if (!$config_lines || !is_array($config_lines)) {
+			// failed to process secondary
+			if (isset($config)) {
+				// return primary config
+				return $config;
+			} else {
+				return false;
+			}
+		}
+		
+		// merge with primary and return
+		if (isset($config)) {
+			self::debug('. merging config files');
+			$config->append(self::build_from_array($config_lines));
+			return $config;
+		} else {
+			// return just secondary
+			$config = self::build_from_array($config_lines);
+			// if APC caching is available and enabled, mark this for cache
+			//$config->cache_in_apc = true;
+			$config->cache_key = $matched_name;
+			return $config;
+		}
+	}
+	
+	public static function build_from_array(array $lines) {
+		$config = new SiteConfig();
+		foreach ($lines as $line) {
+			$line = trim($line);
+			
+			// skip comments, empty lines
+			if ($line == '' || $line[0] == '#') continue;
+			
+			// get command
+			$command = explode(':', $line, 2);
+			// if there's no colon ':', skip this line
+			if (count($command) != 2) continue;
+			$val = trim($command[1]);
+			$command = trim($command[0]);
+			if ($command == '' || $val == '') continue;
+			
+			// check for commands where we accept multiple statements
+			if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
+				array_push($config->$command, $val);
+			// check for single statement commands that evaluate to true or false
+			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
+				$config->$command = ($val == 'yes');
+			// check for single statement commands stored as strings
+			} elseif (in_array($command, array('parser'))) {
+				$config->$command = $val;
+			// check for replace_string(find): replace
+			} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
+				if (in_array($match[1], array('replace_string'))) {
+					$command = $match[1];
+					array_push($config->find_string, $match[2]);
+					array_push($config->$command, $val);
+				}
+			}
+		}
+		return $config;
+	}
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedItem.php b/inc/3rdparty/libraries/feedwriter/FeedItem.php
old mode 100644
new mode 100755
index 54a56f22..40786598
--- a/inc/3rdparty/libraries/feedwriter/FeedItem.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedItem.php
@@ -1,7 +1,7 @@
 <?php
  /**
  * Univarsel Feed Writer
- * 
+ *
  * FeedItem class - Used as feed element in FeedWriter class
  *
  * @package         UnivarselFeedWriter
@@ -12,20 +12,20 @@
  {
     private $elements = array();    //Collection of feed elements
     private $version;
-    
+
     /**
-    * Constructor 
-    * 
-    * @param    contant     (RSS1/RSS2/ATOM) RSS2 is default. 
-    */ 
+    * Constructor
+    *
+    * @param    contant     (RSS1/RSS2/ATOM) RSS2 is default.
+    */
     function __construct($version = RSS2)
-    {    
+    {
         $this->version = $version;
     }
 
     /**
     * Set element (overwrites existing elements with $elementName)
-    * 
+    *
     * @access   public
     * @param    srting  The tag name of an element
     * @param    srting  The content of tag
@@ -38,11 +38,11 @@
             unset($this->elements[$elementName]);
         }
         $this->addElement($elementName, $content, $attributes);
-    }    
-    
+    }
+
     /**
     * Add an element to elements array
-    * 
+    *
     * @access   public
     * @param    srting  The tag name of an element
     * @param    srting  The content of tag
@@ -61,11 +61,11 @@
         $this->elements[$elementName][$i]['content']    = $content;
         $this->elements[$elementName][$i]['attributes'] = $attributes;
     }
-    
+
     /**
-    * Set multiple feed elements from an array. 
+    * Set multiple feed elements from an array.
     * Elements which have attributes cannot be added by this method
-    * 
+    *
     * @access   public
     * @param    array   array of elements in 'tagName' => 'tagContent' format.
     * @return   void
@@ -73,15 +73,15 @@
     public function addElementArray($elementArray)
     {
         if(! is_array($elementArray)) return;
-        foreach ($elementArray as $elementName => $content) 
+        foreach ($elementArray as $elementName => $content)
         {
             $this->addElement($elementName, $content);
         }
     }
-    
+
     /**
     * Return the collection of elements in this feed item
-    * 
+    *
     * @access   public
     * @return   array
     */
@@ -89,68 +89,74 @@
     {
         return $this->elements;
     }
-    
+
     // Wrapper functions ------------------------------------------------------
-    
+
     /**
     * Set the 'dscription' element of feed item
-    * 
+    *
     * @access   public
     * @param    string  The content of 'description' element
     * @return   void
     */
-    public function setDescription($description) 
+    public function setDescription($description)
     {
-        $this->setElement('description', $description);
+        $tag = ($this->version == ATOM)? 'summary' : 'description';
+        $this->setElement($tag, $description);
     }
-    
+
     /**
     * @desc     Set the 'title' element of feed item
     * @access   public
     * @param    string  The content of 'title' element
     * @return   void
     */
-    public function setTitle($title) 
+    public function setTitle($title)
     {
-        $this->setElement('title', $title);      
+        $this->setElement('title', $title);
     }
-    
+
     /**
     * Set the 'date' element of feed item
-    * 
+    *
     * @access   public
     * @param    string  The content of 'date' element
     * @return   void
     */
-    public function setDate($date) 
+    public function setDate($date)
     {
         if(! is_numeric($date))
         {
             $date = strtotime($date);
         }
-      
-        if($this->version == RSS2) 
+
+        if($this->version == ATOM)
+        {
+        	$tag    = 'updated';
+        	$value  = date(DATE_ATOM, $date);
+        }
+        elseif($this->version == RSS2)
         {
-            $tag    = 'pubDate';
-            $value  = date(DATE_RSS, $date);
+        	$tag    = 'pubDate';
+        	$value  = date(DATE_RSS, $date);
         }
-        else                                
+        else
         {
-            $tag    = 'dc:date';
-            $value  = date("Y-m-d", $date);
+        	$tag    = 'dc:date';
+        	$value  = date("Y-m-d", $date);
         }
-        
-        $this->setElement($tag, $value);    
+
+        $this->setElement($tag, $value);
     }
-    
+
     /**
     * Set the 'link' element of feed item
-    * 
+    *
     * @access   public
     * @param    string  The content of 'link' element
     * @return   void
     */
-    public function setLink($link) 
+    public function setLink($link)
     {
         if($this->version == RSS2 || $this->version == RSS1)
         {
@@ -161,27 +167,27 @@
         {
             $this->setElement('link','',array('href'=>$link));
             $this->setElement('id', FeedWriter::uuid($link,'urn:uuid:'));
-        } 
-        
+        }
+
     }
 
     /**
     * Set the 'source' element of feed item
-    * 
+    *
     * @access   public
     * @param    string  The content of 'source' element
     * @return   void
     */
-    public function setSource($link) 
+    public function setSource($link)
     {
         $attributes = array('url'=>$link);
         $this->setElement('source', "wallabag",$attributes);
     }
-    
+
     /**
     * Set the 'encloser' element of feed item
     * For RSS 2.0 only
-    * 
+    *
     * @access   public
     * @param    string  The url attribute of encloser tag
     * @param    string  The length attribute of encloser tag
@@ -193,6 +199,6 @@
         $attributes = array('url'=>$url, 'length'=>$length, 'type'=>$type);
         $this->setElement('enclosure','',$attributes);
     }
-    
+
  } // end of class FeedItem
 ?>
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
index d708e99b..77755690 100755
--- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
@@ -97,15 +97,12 @@ define('JSONP', 3, true);
               header('X-content-type-options: nosniff');
           } elseif ($this->version == JSON) {
               header('Content-type: application/json; charset=UTF-8');
+              $this->json = new stdClass();
           } elseif ($this->version == JSONP) {
               header('Content-type: application/javascript; charset=UTF-8');
+              $this->json = new stdClass();
           }
         }
-      
-        if ($this->version == JSON || $this->version == JSONP) {
-          $this->json = new stdClass();
-        }
-      
 
         $this->printHead();
         $this->printChannels();
@@ -116,6 +113,11 @@ define('JSONP', 3, true);
         }
     }
 
+    public function &getItems()
+    {
+    	return $this->items;
+    }
+
     /**
     * Create a new FeedItem.
     *
@@ -199,7 +201,8 @@ define('JSONP', 3, true);
     */
     public function setDescription($description)
     {
-        $this->setChannelElement('description', $description);
+        $tag = ($this->version == ATOM)? 'subtitle' : 'description'; // ATOM version uses the 'subtitle' tag, every other version 'description'
+        $this->setChannelElement($tag, $description);
     }
 
     /**
@@ -244,7 +247,7 @@ define('JSONP', 3, true);
         {
             $out  = '<?xml version="1.0" encoding="utf-8"?>'."\n";
             if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
-            $out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
+            $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
             echo $out;
         }
         elseif ($this->version == JSON || $this->version == JSONP)
diff --git a/inc/3rdparty/libraries/html5/TreeBuilder.php b/inc/3rdparty/libraries/html5/TreeBuilder.php
index 2f5244f9..c4a48b21 100644
--- a/inc/3rdparty/libraries/html5/TreeBuilder.php
+++ b/inc/3rdparty/libraries/html5/TreeBuilder.php
@@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
 
     // Namespaces for foreign content
     const NS_HTML   = null; // to prevent DOM from requiring NS on everything
+    const NS_XHTML  = 'http://www.w3.org/1999/xhtml';
     const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
     const NS_SVG    = 'http://www.w3.org/2000/svg';
     const NS_XLINK  = 'http://www.w3.org/1999/xlink';
@@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
         }
 
     private function insertElement($token, $append = true) {
-        $el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
+        //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
+        $namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
+        $el = $this->dom->createElementNS($namespaceURI, $token['name']);
 
         if (!empty($token['attr'])) {
             foreach($token['attr'] as $attr) {
-                if(!$el->hasAttribute($attr['name'])) {
+
+				// mike@macgirvin.com 2011-11-17, check attribute name for
+				// validity (ignoring extenders and combiners) as illegal chars in names
+				// causes everything to abort
+				// one start character followed by zero or more name characters, so one-character names (x, y, d) pass
+ 				$valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]*$)/',$attr['name']);
+                if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
                     $el->setAttribute($attr['name'], $attr['value']);
                 }
             }
diff --git a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
index 83e94f14..e4d5f495 100644
--- a/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
+++ b/inc/3rdparty/libraries/humble-http-agent/CookieJar.php
@@ -1,404 +1,403 @@
-<?php
-/**
- * Cookie Jar
- * 
- * PHP class for handling cookies, as defined by the Netscape spec: 
- * <http://curl.haxx.se/rfc/cookie_spec.html>
- *
- * This class should be used to handle cookies (storing cookies from HTTP response messages, and
- * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 
- * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
- * 
- * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
- * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
- * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
- * 
- * @version 0.5
- * @date 2011-03-15
- * @see http://php.net/HttpRequestPool
- * @author Keyvan Minoukadeh
- * @copyright 2011 Keyvan Minoukadeh
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
- */
-
-class CookieJar
-{
-    /**
-    * Cookies - array containing all cookies.
-    *
-    * <pre>
-    * Cookies are stored like this:
-    *   [domain][path][name] = array
-    * where array is:
-    *   0 => value, 1 => secure, 2 => expires
-    * </pre>
-    * @var array
-    * @access private
-    */
-    public $cookies = array();
-	public $debug = false;
-
-    /**
-    * Constructor
-    */
-    function __construct() {
-    }
-
-	protected function debug($msg, $file=null, $line=null) {
-		if ($this->debug) {
-			$mem = round(memory_get_usage()/1024, 2);
-			$memPeak = round(memory_get_peak_usage()/1024, 2);
-			echo '* ',$msg;
-			if (isset($file, $line)) echo " ($file line $line)";
-			echo ' - mem used: ',$mem," (peak: $memPeak)\n";	
-			ob_flush();
-			flush();
-		}
-	}	
-	
-    /**
-    * Get matching cookies
-    *
-    * Only use this method if you cannot use add_cookie_header(), for example, if you want to use
-    * this cookie jar class without using the request class.
-    *
-    * @param array $param associative array containing 'domain', 'path', 'secure' keys
-    * @return string
-    * @see add_cookie_header()
-    */
-    public function getMatchingCookies($url)
-    {
-		if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) {
-			$param['domain'] = $parts['host'];
-			$param['path'] = $parts['path'];
-			$param['secure'] = (strtolower($parts['scheme']) == 'https');
-			unset($parts);
-		} else {
-			return false;
-		}
-        // RFC 2965 notes:
-        //  If multiple cookies satisfy the criteria above, they are ordered in
-        //  the Cookie header such that those with more specific Path attributes
-        //  precede those with less specific.  Ordering with respect to other
-        //  attributes (e.g., Domain) is unspecified.
-        $domain = $param['domain'];
-        if (strpos($domain, '.') === false) $domain .= '.local';
-        $request_path = $param['path'];
-        if ($request_path == '') $request_path = '/';
-        $request_secure = $param['secure'];
-        $now = time();
-        $matched_cookies = array();
-        // domain - find matching domains
-        $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
-        while (strpos($domain, '.') !== false) {
-            if (isset($this->cookies[$domain])) {
-                $this->debug(' domain match found: '.$domain);
-                $cookies =& $this->cookies[$domain];
-            } else {
-                $domain = $this->_reduce_domain($domain);
-                continue;
-            }
-            // paths - find matching paths starting from most specific
-            $this->debug('  - Finding matching paths for '.$request_path);
-            $paths = array_keys($cookies);
-            usort($paths, array($this, '_cmp_length'));
-            foreach ($paths as $path) {
-                // continue to next cookie if request path does not path-match cookie path
-                if (!$this->_path_match($request_path, $path)) continue;
-                // loop through cookie names
-                $this->debug('     path match found: '.$path);
-                foreach ($cookies[$path] as $name => $values) {
-                    // if this cookie is secure but request isn't, continue to next cookie
-                    if ($values[1] && !$request_secure) continue;
-                    // if cookie is not a session cookie and has expired, continue to next cookie
-                    if (is_int($values[2]) && ($values[2] < $now)) continue;
-                    // cookie matches request
-                    $this->debug('      cookie match: '.$name.'='.$values[0]);
-                    $matched_cookies[] = $name.'='.$values[0];
-                }
-            }
-            $domain = $this->_reduce_domain($domain);
-        }
-        // return cookies
-        return implode('; ', $matched_cookies);
-    }
-
-    /**
-    * Parse Set-Cookie values.
-    *
-    * Only use this method if you cannot use extract_cookies(), for example, if you want to use
-    * this cookie jar class without using the response class.
-    *
-    * @param array $set_cookies array holding 1 or more "Set-Cookie" header values
-    * @param array $param associative array containing 'host', 'path' keys
-    * @return void
-    * @see extract_cookies()
-    */
-    public function storeCookies($url, $set_cookies)
-    {
-        if (count($set_cookies) == 0) return;
-		$param = @parse_url($url);
-		if (!is_array($param) || !isset($param['host'])) return;
-        $request_host = $param['host'];
-        if (strpos($request_host, '.') === false) $request_host .= '.local';
-        $request_path = @$param['path'];
-        if ($request_path == '') $request_path = '/';
-        //
-        // loop through set-cookie headers
-        //
-        foreach ($set_cookies as $set_cookie) {
-            $this->debug('Parsing: '.$set_cookie);
-            // temporary cookie store (before adding to jar)
-            $tmp_cookie = array();
-            $param = explode(';', $set_cookie);
-            // loop through params
-            for ($x=0; $x<count($param); $x++) {
-                $key_val = explode('=', $param[$x], 2);
-                if (count($key_val) != 2) {
-                    // if the first param isn't a name=value pair, continue to the next set-cookie
-                    // header
-                    if ($x == 0) continue 2;
-                    // check for secure flag
-                    if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
-                    // continue to next param
-                    continue;
-                }
-                list($key, $val) = array_map('trim', $key_val);
-                // first name=value pair is the cookie name and value
-                // the name and value are stored under 'name' and 'value' to avoid conflicts
-                // with later parameters.
-                if ($x == 0) {
-                    $tmp_cookie = array('name'=>$key, 'value'=>$val);
-                    continue;
-                }
-                $key = strtolower($key);
-                if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
-                    $tmp_cookie[$key] = $val;
-                }
-            }
-            //
-            // set cookie
-            //
-            // check domain
-            if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
-                    ($tmp_cookie['domain'] != ".$request_host")) {
-                $domain = $tmp_cookie['domain'];
-                if ((strpos($domain, '.') === false) && ($domain != 'local')) {
-                    $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
-                    continue;
-                }
-                if (preg_match('/\.[0-9]+$/', $domain)) {
-                    $this->debug(' - domain "'.$domain.'" appears to be an ip address');
-                    continue;
-                }
-                if (substr($domain, 0, 1) != '.') $domain = ".$domain";
-                if (!$this->_domain_match($request_host, $domain)) {
-                    $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
-                    continue;
-                }
-            } else {
-                // if domain is not specified in the set-cookie header, domain will default to
-                // the request host
-                $domain = $request_host;
-            }
-            // check path
-            if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
-                $path = urldecode($tmp_cookie['path']);
-                if (!$this->_path_match($request_path, $path)) {
-                    $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
-                    continue;
-                }
-            } else {
-                $path = $request_path;
-                $path = substr($path, 0, strrpos($path, '/'));
-                if ($path == '') $path = '/';
-            }
-            // check if secure
-            $secure = (isset($tmp_cookie['secure'])) ? true : false;
-            // check expiry
-            if (isset($tmp_cookie['expires'])) {
-                if (($expires = strtotime($tmp_cookie['expires'])) < 0) {
-                    $expires = null;
-                }
-            } else {
-                $expires = null;
-            }
-            // set cookie
-            $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
-        }
-    }
-	
-	// return array of set-cookie values extracted from HTTP response headers (string $h)
-	public function extractCookies($h) {
-        $x = 0;
-        $lines = 0;
-        $headers = array();
-        $last_match = false;
-		$h = explode("\n", $h);
-        foreach ($h as $line) {
-			$line = rtrim($line);
-            $lines++;
-
-            $trimmed_line = trim($line);
-            if (isset($line_last)) {
-                // check if we have \r\n\r\n (indicating the end of headers)
-                // some servers will not use CRLF (\r\n), so we make CR (\r) optional.
-                // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) {
-                //     break;
-                // }
-                // As an alternative, we can check if the current trimmed line is empty
-                if ($trimmed_line == '') {
-                    break;
-                }
-
-                // check for continuation line...
-                // RFC 2616 Section 2.2 "Basic Rules":
-                // HTTP/1.1 header field values can be folded onto multiple lines if the
-                // continuation line begins with a space or horizontal tab. All linear
-                // white space, including folding, has the same semantics as SP. A
-                // recipient MAY replace any linear white space with a single SP before
-                // interpreting the field value or forwarding the message downstream.
-                if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
-                    // append to previous header value
-                    $headers[$x-1] .= ' '.rtrim($match[1]);
-                    continue;
-                }
-            }
-            $line_last = $line;
-
-            // split header name and value
-            if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
-                $headers[$x++] = rtrim($match[1]);
-                $last_match = true;
-            } else {
-                $last_match = false;
-            }
-        }
-        return $headers;
-	}
-
-    /**
-    * Set Cookie
-    * @param string $domain
-    * @param string $path
-    * @param string $name cookie name
-    * @param string $value cookie value
-    * @param bool $secure
-    * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
-    * @return void
-    */
-    function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
-    {
-        if ($domain == '') return;
-        if ($path == '') return;
-        if ($name == '') return;
-        // check if cookie needs to go
-        if (isset($expires) && ($expires <= 0)) {
-            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
-            return;
-        }
-        if ($value == '') return;
-        $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
-        return;
-    }
-
-    /**
-    * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
-    * @param string $domain
-    * @param string $path
-    * @param string $name
-    * @return void
-    */
-    function clear($domain=null, $path=null, $name=null)
-    {
-        if (!isset($domain)) {
-            $this->cookies = array();
-        } elseif (!isset($path)) {
-            if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
-        } elseif (!isset($name)) {
-            if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
-        } elseif (isset($name)) {
-            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
-        }
-    }
-
-    /**
-    * Compare string length - used for sorting
-    * @access private
-    * @return int
-    */
-    function _cmp_length($a, $b)
-    {
-        $la = strlen($a); $lb = strlen($b);
-        if ($la == $lb) return 0;
-        return ($la > $lb) ? -1 : 1;
-    }
-
-    /**
-    * Reduce domain
-    * @param string $domain
-    * @return string
-    * @access private
-    */
-    function _reduce_domain($domain)
-    {
-        if ($domain == '') return '';
-        if (substr($domain, 0, 1) == '.') return substr($domain, 1);
-        return substr($domain, strpos($domain, '.'));
-    }
-
-    /**
-    * Path match - check if path1 path-matches path2
-    *
-    * From RFC 2965: 
-    *   <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
-    *   if P2 is a prefix of P1 (including the case where P1 and P2 string-
-    *   compare equal).  Thus, the string /tec/waldo path-matches /tec.</i>
-    * @param string $path1
-    * @param string $path2
-    * @return bool
-    * @access private
-    */
-    function _path_match($path1, $path2)
-    {
-        return (substr($path1, 0, strlen($path2)) == $path2);
-    }
-
-    /**
-    * Domain match - check if domain1 domain-matches domain2
-    *
-    * A few extracts from RFC 2965: 
-    *  -  A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
-    *     would be rejected, because H is y.x and contains a dot.
-    *
-    *  -  A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
-    *     would be accepted.
-    *
-    *  -  A Set-Cookie2 with Domain=.com or Domain=.com., will always be
-    *     rejected, because there is no embedded dot.
-    *
-    *  -  A Set-Cookie2 from request-host example for Domain=.local will
-    *     be accepted, because the effective host name for the request-
-    *     host is example.local, and example.local domain-matches .local.
-    *
-    * I'm ignoring the first point for now (must check to see how other browsers handle
-    * this rule for Set-Cookie headers)
-    *
-    * @param string $domain1
-    * @param string $domain2
-    * @return bool
-    * @access private
-    */
-    function _domain_match($domain1, $domain2)
-    {
-        $domain1 = strtolower($domain1);
-        $domain2 = strtolower($domain2);
-        while (strpos($domain1, '.') !== false) {
-            if ($domain1 == $domain2) return true;
-            $domain1 = $this->_reduce_domain($domain1);
-            continue;
-        }
-        return false;
-    }
-}
-?>
\ No newline at end of file
+<?php
+/**
+ * Cookie Jar
+ * 
+ * PHP class for handling cookies, as defined by the Netscape spec: 
+ * <http://curl.haxx.se/rfc/cookie_spec.html>
+ *
+ * This class should be used to handle cookies (storing cookies from HTTP response messages, and
+ * sending out cookies in HTTP request messages). This has been adapted for FiveFilters.org 
+ * from the original version used in HTTP Navigator. See http://www.keyvan.net/code/http-navigator/
+ * 
+ * This class is mainly based on Cookies.pm <http://search.cpan.org/author/GAAS/libwww-perl-5.65/
+ * lib/HTTP/Cookies.pm> from the libwww-perl collection <http://www.linpro.no/lwp/>.
+ * Unlike Cookies.pm, this class only supports the Netscape cookie spec, not RFC 2965.
+ * 
+ * @version 0.5
+ * @date 2011-03-15
+ * @see http://php.net/HttpRequestPool
+ * @author Keyvan Minoukadeh
+ * @copyright 2011 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class CookieJar
+{
+    /**
+    * Cookies - array containing all cookies.
+    *
+    * <pre>
+    * Cookies are stored like this:
+    *   [domain][path][name] = array
+    * where array is:
+    *   0 => value, 1 => secure, 2 => expires
+    * </pre>
+    * @var array
+    * @access public
+    */
+    public $cookies = array();
+	public $debug = false;
+
+    /**
+    * Constructor - nothing to initialise; the jar starts with an empty $cookies array.
+    */
+    function __construct() {
+    }
+
+	protected function debug($msg, $file=null, $line=null) { // echo $msg with memory stats (KB) when $this->debug is set
+		if ($this->debug) {
+			$mem = round(memory_get_usage()/1024, 2);
+			$memPeak = round(memory_get_peak_usage()/1024, 2);
+			echo '* ',$msg;
+			if (isset($file, $line)) echo " ($file line $line)";
+			echo ' - mem used: ',$mem," (peak: $memPeak)\n";	
+			if (ob_get_level() > 0) ob_flush(); // ob_flush() raises a notice when no output buffer is active
+			flush();
+		}
+	}	
+	
+    /**
+    * Get matching cookies
+    *
+    * Builds the value of a Cookie request header from the stored cookies whose domain, path,
+    * secure flag and expiry match the given URL (longest cookie paths first within each domain).
+    *
+    * @param string $url URL to match; must contain scheme, host and path components
+    * @return string|false cookies joined as "name=value; name2=value2" ('' if none match), false if $url is unusable
+    * @see storeCookies()
+    */
+    public function getMatchingCookies($url)
+    {
+		if (($parts = @parse_url($url)) && isset($parts['scheme'], $parts['host'], $parts['path'])) {
+			$param['domain'] = $parts['host'];
+			$param['path'] = $parts['path'];
+			$param['secure'] = (strtolower($parts['scheme']) == 'https');
+			unset($parts);
+		} else {
+			return false;
+		}
+        // RFC 2965 notes:
+        //  If multiple cookies satisfy the criteria above, they are ordered in
+        //  the Cookie header such that those with more specific Path attributes
+        //  precede those with less specific.  Ordering with respect to other
+        //  attributes (e.g., Domain) is unspecified.
+        $domain = $param['domain'];
+        if (strpos($domain, '.') === false) $domain .= '.local';
+        $request_path = $param['path'];
+        if ($request_path == '') $request_path = '/';
+        $request_secure = $param['secure'];
+        $now = time();
+        $matched_cookies = array();
+        // domain - find matching domains
+        $this->debug('Finding matching domains for '.$domain, __FILE__, __LINE__);
+        while (strpos($domain, '.') !== false) {
+            if (isset($this->cookies[$domain])) {
+                $this->debug(' domain match found: '.$domain);
+                $cookies =& $this->cookies[$domain];
+            } else {
+                $domain = $this->_reduce_domain($domain);
+                continue;
+            }
+            // paths - find matching paths starting from most specific
+            $this->debug('  - Finding matching paths for '.$request_path);
+            $paths = array_keys($cookies);
+            usort($paths, array($this, '_cmp_length'));
+            foreach ($paths as $path) {
+                // skip this cookie path if the request path does not path-match it
+                if (!$this->_path_match($request_path, $path)) continue;
+                // loop through cookie names
+                $this->debug('     path match found: '.$path);
+                foreach ($cookies[$path] as $name => $values) {
+                    // if this cookie is secure but request isn't, continue to next cookie
+                    if ($values[1] && !$request_secure) continue;
+                    // if cookie is not a session cookie and has expired, continue to next cookie
+                    if (is_int($values[2]) && ($values[2] < $now)) continue;
+                    // cookie matches request
+                    $this->debug('      cookie match: '.$name.'='.$values[0]);
+                    $matched_cookies[] = $name.'='.$values[0];
+                }
+            }
+            $domain = $this->_reduce_domain($domain);
+        }
+        // return cookies
+        return implode('; ', $matched_cookies);
+    }
+
+    /**
+    * Parse Set-Cookie values and store the acceptable cookies in the jar.
+    *
+    * Cookies whose Domain or Path attribute does not match the request URL are skipped; an
+    * Expires value that cannot be parsed is treated as absent (session cookie).
+    *
+    * @param string $url URL of the request that produced the Set-Cookie headers
+    * @param array $set_cookies array holding 0 or more "Set-Cookie" header values
+    * @return void
+    * @see extractCookies()
+    */
+    public function storeCookies($url, $set_cookies)
+    {
+        if (count($set_cookies) == 0) return;
+		$param = @parse_url($url);
+		if (!is_array($param) || !isset($param['host'])) return;
+        $request_host = $param['host'];
+        if (strpos($request_host, '.') === false) $request_host .= '.local';
+        $request_path = isset($param['path']) ? $param['path'] : '';
+        if ($request_path == '') $request_path = '/';
+        //
+        // loop through set-cookie headers
+        //
+        foreach ($set_cookies as $set_cookie) {
+            $this->debug('Parsing: '.$set_cookie);
+            // temporary cookie store (before adding to jar)
+            $tmp_cookie = array();
+            $param = explode(';', $set_cookie);
+            // loop through params
+            for ($x=0; $x<count($param); $x++) {
+                $key_val = explode('=', $param[$x], 2);
+                if (count($key_val) != 2) {
+                    // if the first param isn't a name=value pair, continue to the next set-cookie
+                    // header
+                    if ($x == 0) continue 2;
+                    // check for secure flag
+                    if (strtolower(trim($key_val[0])) == 'secure') $tmp_cookie['secure'] = true;
+                    // continue to next param
+                    continue;
+                }
+                list($key, $val) = array_map('trim', $key_val);
+                // first name=value pair is the cookie name and value
+                // the name and value are stored under 'name' and 'value' to avoid conflicts
+                // with later parameters.
+                if ($x == 0) {
+                    $tmp_cookie = array('name'=>$key, 'value'=>$val);
+                    continue;
+                }
+                $key = strtolower($key);
+                if (in_array($key, array('expires', 'path', 'domain', 'secure'))) {
+                    $tmp_cookie[$key] = $val;
+                }
+            }
+            //
+            // set cookie
+            //
+            // check domain
+            if (isset($tmp_cookie['domain']) && ($tmp_cookie['domain'] != $request_host) &&
+                    ($tmp_cookie['domain'] != ".$request_host")) {
+                $domain = $tmp_cookie['domain'];
+                if ((strpos($domain, '.') === false) && ($domain != 'local')) {
+                    $this->debug(' - domain "'.$domain.'" has no dot and is not a local domain');
+                    continue;
+                }
+                if (preg_match('/\.[0-9]+$/', $domain)) {
+                    $this->debug(' - domain "'.$domain.'" appears to be an ip address');
+                    continue;
+                }
+                if (substr($domain, 0, 1) != '.') $domain = ".$domain";
+                if (!$this->_domain_match($request_host, $domain)) {
+                    $this->debug(' - request host "'.$request_host.'" does not domain-match "'.$domain.'"');
+                    continue;
+                }
+            } else {
+                // if domain is not specified in the set-cookie header, domain will default to
+                // the request host
+                $domain = $request_host;
+            }
+            // check path
+            if (isset($tmp_cookie['path']) && ($tmp_cookie['path'] != '')) {
+                $path = urldecode($tmp_cookie['path']);
+                if (!$this->_path_match($request_path, $path)) {
+                    $this->debug(' - request path "'.$request_path.'" does not path-match "'.$path.'"');
+                    continue;
+                }
+            } else {
+                $path = $request_path;
+                $path = substr($path, 0, strrpos($path, '/'));
+                if ($path == '') $path = '/';
+            }
+            // check if secure
+            $secure = (isset($tmp_cookie['secure'])) ? true : false;
+            // check expiry (strtotime() returns false on failure; unparseable or negative => session cookie)
+            if (isset($tmp_cookie['expires'])) {
+                if (($expires = strtotime($tmp_cookie['expires'])) === false || $expires < 0) {
+                    $expires = null;
+                }
+            } else {
+                $expires = null;
+            }
+            // set cookie
+            $this->set_cookie($domain, $path, $tmp_cookie['name'], $tmp_cookie['value'], $secure, $expires);
+        }
+    }
+	
+	// return array of Set-Cookie header values found in raw HTTP response headers (string $h); parsing stops at the first blank line after the first line
+	public function extractCookies($h) {
+        $x = 0;
+        $lines = 0; // NOTE(review): incremented below but never read
+        $headers = array();
+        $last_match = false;
+		$h = explode("\n", $h);
+        foreach ($h as $line) {
+			$line = rtrim($line);
+            $lines++;
+
+            $trimmed_line = trim($line);
+            if (isset($line_last)) {
+                // check if we have \r\n\r\n (indicating the end of headers)
+                // some servers will not use CRLF (\r\n), so we make CR (\r) optional.
+                // if (preg_match('/\015?\012\015?\012/', $line_last.$line)) {
+                //     break;
+                // }
+                // As an alternative, we can check if the current trimmed line is empty
+                if ($trimmed_line == '') {
+                    break;
+                }
+
+                // check for continuation line...
+                // RFC 2616 Section 2.2 "Basic Rules":
+                // HTTP/1.1 header field values can be folded onto multiple lines if the
+                // continuation line begins with a space or horizontal tab. All linear
+                // white space, including folding, has the same semantics as SP. A
+                // recipient MAY replace any linear white space with a single SP before
+                // interpreting the field value or forwarding the message downstream.
+                if ($last_match && preg_match('/^\s+(.*)/', $line, $match)) {
+                    // append to previous Set-Cookie value
+                    $headers[$x-1] .= ' '.rtrim($match[1]);
+                    continue;
+                }
+            }
+            $line_last = $line;
+
+            // keep the value of Set-Cookie headers; remember whether this line was one (for folding)
+            if (preg_match('/^Set-Cookie\s*:\s*(.*)/i', $line, $match)) {
+                $headers[$x++] = rtrim($match[1]);
+                $last_match = true;
+            } else {
+                $last_match = false;
+            }
+        }
+        return $headers;
+	}
+
+    /**
+    * Set Cookie - store one cookie, or remove it when $expires <= 0 (empty domain/path/name/value is ignored)
+    * @param string $domain
+    * @param string $path
+    * @param string $name cookie name
+    * @param string $value cookie value
+    * @param bool $secure
+    * @param int $expires expiry time (null if session cookie, <= 0 will delete cookie)
+    * @return void
+    */
+    function set_cookie($domain, $path, $name, $value, $secure=false, $expires=null)
+    {
+        if ($domain == '') return;
+        if ($path == '') return;
+        if ($name == '') return;
+        // check if cookie needs to go
+        if (isset($expires) && ($expires <= 0)) {
+            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
+            return;
+        }
+        if ($value == '') return;
+        $this->cookies[$domain][$path][$name] = array($value, $secure, $expires);
+        return;
+    }
+
+    /**
+    * Clear cookies - [domain [,path [,name]]] - call method with no arguments to clear all cookies.
+    * @param string $domain
+    * @param string $path
+    * @param string $name
+    * @return void
+    */
+    function clear($domain=null, $path=null, $name=null)
+    {
+        if (!isset($domain)) {
+            $this->cookies = array();
+        } elseif (!isset($path)) {
+            if (isset($this->cookies[$domain])) unset($this->cookies[$domain]);
+        } elseif (!isset($name)) {
+            if (isset($this->cookies[$domain][$path])) unset($this->cookies[$domain][$path]);
+        } elseif (isset($name)) { // always true at this point; acts as a plain else
+            if (isset($this->cookies[$domain][$path][$name])) unset($this->cookies[$domain][$path][$name]);
+        }
+    }
+
+    /**
+    * Compare string length - usort() callback that orders longer strings first
+    * @access private
+    * @return int
+    */
+    function _cmp_length($a, $b)
+    {
+        $la = strlen($a); $lb = strlen($b);
+        if ($la == $lb) return 0;
+        return ($la > $lb) ? -1 : 1;
+    }
+
+    /**
+    * Reduce domain - drop a leading dot if present, otherwise drop everything before the first dot
+    * @param string $domain
+    * @return string
+    * @access private
+    */
+    function _reduce_domain($domain)
+    {
+        if ($domain == '') return '';
+        if (substr($domain, 0, 1) == '.') return substr($domain, 1);
+        return substr($domain, strpos($domain, '.'));
+    }
+
+    /**
+    * Path match - check if path1 path-matches path2 (plain string-prefix comparison)
+    *
+    * From RFC 2965: 
+    *   <i>For two strings that represent paths, P1 and P2, P1 path-matches P2
+    *   if P2 is a prefix of P1 (including the case where P1 and P2 string-
+    *   compare equal).  Thus, the string /tec/waldo path-matches /tec.</i>
+    * @param string $path1
+    * @param string $path2
+    * @return bool
+    * @access private
+    */
+    function _path_match($path1, $path2)
+    {
+        return (substr($path1, 0, strlen($path2)) == $path2);
+    }
+
+    /**
+    * Domain match - check if domain1 domain-matches domain2 (case-insensitive)
+    *
+    * A few extracts from RFC 2965: 
+    *  -  A Set-Cookie2 from request-host y.x.foo.com for Domain=.foo.com
+    *     would be rejected, because H is y.x and contains a dot.
+    *
+    *  -  A Set-Cookie2 from request-host x.foo.com for Domain=.foo.com
+    *     would be accepted.
+    *
+    *  -  A Set-Cookie2 with Domain=.com or Domain=.com., will always be
+    *     rejected, because there is no embedded dot.
+    *
+    *  -  A Set-Cookie2 from request-host example for Domain=.local will
+    *     be accepted, because the effective host name for the request-
+    *     host is example.local, and example.local domain-matches .local.
+    *
+    * I'm ignoring the first point for now (must check to see how other browsers handle
+    * this rule for Set-Cookie headers)
+    *
+    * @param string $domain1
+    * @param string $domain2
+    * @return bool
+    * @access private
+    */
+    function _domain_match($domain1, $domain2)
+    {
+        $domain1 = strtolower($domain1);
+        $domain2 = strtolower($domain2);
+        // reduce domain1 one step at a time until it equals domain2 or has no dot left
+        while (strpos($domain1, '.') !== false) {
+            if ($domain1 == $domain2) return true;
+            $domain1 = $this->_reduce_domain($domain1);
+            continue;
+        }
+        return false;
+    }
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
index e4f1b3b3..963f0c05 100644
--- a/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -1,779 +1,810 @@
-<?php
-/**
- * Humble HTTP Agent
- * 
- * This class is designed to take advantage of parallel HTTP requests
- * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 
- * For environments which do not have these options, it reverts to standard sequential 
- * requests (using file_get_contents())
- * 
- * @version 1.1
- * @date 2012-08-20
- * @see http://php.net/HttpRequestPool
- * @author Keyvan Minoukadeh
- * @copyright 2011-2012 Keyvan Minoukadeh
- * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
- */
-
-class HumbleHttpAgent
-{
-	const METHOD_REQUEST_POOL = 1;
-	const METHOD_CURL_MULTI = 2;
-	const METHOD_FILE_GET_CONTENTS = 4;
-	//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
-	const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
-	const UA_PHP = 'PHP/5.2';
-	const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
-	
-	protected $requests = array();
-	protected $redirectQueue = array();
-	protected $requestOptions;
-	protected $maxParallelRequests = 5;
-	protected $cache = null; //TODO
-	protected $httpContext;
-	protected $minimiseMemoryUse = false; //TODO
-	protected $method;
-	protected $cookieJar;
-	public $debug = false;
-	public $debugVerbose = false;
-	public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
-	public $maxRedirects = 5;
-	public $userAgentMap = array();
-	public $rewriteUrls = array();
-	public $userAgentDefault;
-	public $referer;
-	//public $userAgent = 'Mozilla/5.0';
-	
-	// Prevent certain file/mime types
-	// HTTP responses which match these content types will
-	// be returned without body.
-	public $headerOnlyTypes = array();
-	// URLs ending with one of these extensions will
-	// prompt Humble HTTP Agent to send a HEAD request first
-	// to see if returned content type matches $headerOnlyTypes.
-	public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
-	// AJAX triggers to search for.
-	// for AJAX sites, e.g. Blogger with its dynamic views templates.
-	public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
-	
-	//TODO: set max file size
-	//TODO: normalise headers
-	
-	function __construct($requestOptions=null, $method=null) {
-		$this->userAgentDefault = self::UA_BROWSER;
-		$this->referer = self::REF_GOOGLE;
-		// set the request method
-		if (in_array($method, array(1,2,4))) {
-			$this->method = $method;
-		} else {
-			if (class_exists('HttpRequestPool')) {
-				$this->method = self::METHOD_REQUEST_POOL;
-			} elseif (function_exists('curl_multi_init')) {
-				$this->method = self::METHOD_CURL_MULTI;
-			} else {
-				$this->method = self::METHOD_FILE_GET_CONTENTS;
-			}
-		}
-		if ($this->method == self::METHOD_CURL_MULTI) {
-			require_once(dirname(__FILE__).'/RollingCurl.php');
-		}
-		// create cookie jar
-		$this->cookieJar = new CookieJar();
-		// set request options (redirect must be 0)
-		$this->requestOptions = array(
-			'timeout' => 15,
-			'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
-			// TODO: test onprogress?
-		);
-		if (is_array($requestOptions)) {
-			$this->requestOptions = array_merge($this->requestOptions, $requestOptions);
-		}
-		$this->httpContext = array(
-			'http' => array(
-				'ignore_errors' => true,
-				'timeout' => $this->requestOptions['timeout'],
-				'max_redirects' => $this->requestOptions['redirect'],
-				'header' => "Accept: */*\r\n"
-				)
-			);
-	}
-	
-	protected function debug($msg) {
-		if ($this->debug) {
-			$mem = round(memory_get_usage()/1024, 2);
-			$memPeak = round(memory_get_peak_usage()/1024, 2);
-			echo '* ',$msg;
-			if ($this->debugVerbose) echo ' - mem used: ',$mem," (peak: $memPeak)";
-			echo "\n";
-			ob_flush();
-			flush();
-		}
-	}
-	
-	protected function getUserAgent($url, $asArray=false) {
-		$host = @parse_url($url, PHP_URL_HOST);
-		if (strtolower(substr($host, 0, 4)) == 'www.') {
-			$host = substr($host, 4);
-		}
-		if ($host) {
-			$try = array($host);
-			$split = explode('.', $host);
-			if (count($split) > 1) {
-				array_shift($split);
-				$try[] = '.'.implode('.', $split);
-			}
-			foreach ($try as $h) {
-				if (isset($this->userAgentMap[$h])) {
-					$ua = $this->userAgentMap[$h];
-					break;
-				}
-			}
-		}
-		if (!isset($ua)) $ua = $this->userAgentDefault;
-		if ($asArray) {
-			return array('User-Agent' => $ua);
-		} else {
-			return 'User-Agent: '.$ua;
-		}
-	}
-	
-	public function rewriteHashbangFragment($url) {
-		// return $url if there's no '#!'
-		if (strpos($url, '#!') === false) return $url;
-		// split $url and rewrite
-		// TODO: is SimplePie_IRI included?
-		$iri = new SimplePie_IRI($url);
-		$fragment = substr($iri->fragment, 1); // strip '!'
-		$iri->fragment = null;
-		if (isset($iri->query)) {
-			parse_str($iri->query, $query);
-		} else {
-			$query = array();
-		}
-		$query['_escaped_fragment_'] = (string)$fragment;
-		$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
-		return $iri->get_iri();
-	}
-	
-	public function getUglyURL($url, $html) {
-		if ($html == '') return false;
-		$found = false;
-		foreach ($this->ajaxTriggers as $string) {
-			if (stripos($html, $string)) {
-				$found = true;
-				break;
-			}
-		}
-		if (!$found) return false;
-		$iri = new SimplePie_IRI($url);
-		if (isset($iri->query)) {
-			parse_str($iri->query, $query);
-		} else {
-			$query = array();
-		}
-		$query['_escaped_fragment_'] = '';
-		$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
-		return $iri->get_iri();
-	}
-	
-	public function removeFragment($url) {
-		$pos = strpos($url, '#');
-		if ($pos === false) {
-			return $url;
-		} else {
-			return substr($url, 0, $pos);
-		}
-	}
-	
-	public function rewriteUrls($url) {
-		foreach ($this->rewriteUrls as $find => $action) {
-			if (strpos($url, $find) !== false) {
-				if (is_array($action)) {
-					return strtr($url, $action);
-				}
-			}
-		}
-		return $url;
-	}
-	
-	public function enableDebug($bool=true) {
-		$this->debug = (bool)$bool;
-	}
-	
-	public function minimiseMemoryUse($bool = true) {
-		$this->minimiseMemoryUse = $bool;
-	}
-	
-	public function setMaxParallelRequests($max) {
-		$this->maxParallelRequests = $max;
-	}
-	
-	public function validateUrl($url) {
-		$url = filter_var($url, FILTER_SANITIZE_URL);
-		$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
-		// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
-		if ($test === false) {
-			$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
-		}
-		if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
-			return $url;
-		} else {
-			return false;
-		}
-	}
-	
-	public function fetchAll(array $urls) {
-		$this->fetchAllOnce($urls, $isRedirect=false);
-		$redirects = 0;
-		while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
-			$this->debug("Following redirects #$redirects...");
-			$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
-		}
-	}
-	
-	// fetch all URLs without following redirects
-	public function fetchAllOnce(array $urls, $isRedirect=false) {
-		if (!$isRedirect) $urls = array_unique($urls);
-		if (empty($urls)) return;
-		
-		//////////////////////////////////////////////////////
-		// parallel (HttpRequestPool)
-		if ($this->method == self::METHOD_REQUEST_POOL) {
-			$this->debug('Starting parallel fetch (HttpRequestPool)');
-			try {
-				while (count($urls) > 0) {
-					$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
-					$subset = array_splice($urls, 0, $this->maxParallelRequests);
-					$pool = new HttpRequestPool();
-					foreach ($subset as $orig => $url) {
-						if (!$isRedirect) $orig = $url;
-						unset($this->redirectQueue[$orig]);
-						$this->debug("...$url");
-						if (!$isRedirect && isset($this->requests[$url])) {
-							$this->debug("......in memory");
-						/*
-						} elseif ($this->isCached($url)) {
-							$this->debug("......is cached");
-							if (!$this->minimiseMemoryUse) {
-								$this->requests[$url] = $this->getCached($url);
-							}
-						*/
-						} else {
-							$this->debug("......adding to pool");
-							$req_url = $this->rewriteUrls($url);
-							$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
-							$req_url = $this->removeFragment($req_url);
-							if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
-								$_meth = HttpRequest::METH_HEAD;
-							} else {
-								$_meth = HttpRequest::METH_GET;
-								unset($this->requests[$orig]['wrongGuess']);
-							}
-							$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
-							// send cookies, if we have any
-							if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
-								$this->debug("......sending cookies: $cookies");
-								$httpRequest->addHeaders(array('Cookie' => $cookies));
-							}
-							//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
-							$httpRequest->addHeaders($this->getUserAgent($req_url, true));
-							// add referer for picky sites
-							$httpRequest->addheaders(array('Referer' => $this->referer));
-							$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
-							$this->requests[$orig]['original_url'] = $orig;
-							$pool->attach($httpRequest);
-						}
-					}
-					// did we get anything into the pool?
-					if (count($pool) > 0) {
-						$this->debug('Sending request...');
-						try {
-							$pool->send();
-						} catch (HttpRequestPoolException $e) {
-							// do nothing
-						}
-						$this->debug('Received responses');
-						foreach($subset as $orig => $url) {
-							if (!$isRedirect) $orig = $url;
-							$request = $this->requests[$orig]['httpRequest'];
-							//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
-							// getResponseHeader() doesn't return status line, so, for consistency...
-							$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
-							// check content type
-							// TODO: use getResponseHeader('content-type') or getResponseInfo()
-							if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
-								$this->requests[$orig]['body'] = '';
-								$_header_only_type = true;
-								$this->debug('Header only type returned');
-							} else {
-								$this->requests[$orig]['body'] = $request->getResponseBody();
-								$_header_only_type = false;
-							}
-							$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
-							$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
-							// is redirect?
-							if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
-								$redirectURL = $request->getResponseHeader('location');
-								if (!preg_match('!^https?://!i', $redirectURL)) {
-									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
-								}
-								if ($this->validateURL($redirectURL)) {
-									$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-									// store any cookies
-									$cookies = $request->getResponseHeader('set-cookie');
-									if ($cookies && !is_array($cookies)) $cookies = array($cookies);
-									if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
-									$this->redirectQueue[$orig] = $redirectURL;
-								} else {
-									$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
-								}
-							} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
-								// the response content-type did not match our 'header only' types, 
-								// but we'd issues a HEAD request because we assumed it would. So
-								// let's queue a proper GET request for this item...
-								$this->debug('Wrong guess at content-type, queing GET request');
-								$this->requests[$orig]['wrongGuess'] = true;
-								$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
-							} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
-								// check for <meta name='fragment' content='!'/>
-								// for AJAX sites, e.g. Blogger with its dynamic views templates.
-								// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
-								if (isset($this->requests[$orig]['body'])) {
-									$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
-									if ($redirectURL) {
-										$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
-										$this->redirectQueue[$orig] = $redirectURL;
-									}
-								}
-							}
-							//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
-							$pool->detach($request);
-							unset($this->requests[$orig]['httpRequest'], $request);
-							/*
-							if ($this->minimiseMemoryUse) {
-								if ($this->cache($url)) {
-									unset($this->requests[$url]);
-								}
-							}
-							*/
-						}
-					}
-				}
-			} catch (HttpException $e) {
-				$this->debug($e);
-				return false;
-			}
-		}
-		
-		//////////////////////////////////////////////////////////
-		// parallel (curl_multi_*)
-		elseif ($this->method == self::METHOD_CURL_MULTI) {
-			$this->debug('Starting parallel fetch (curl_multi_*)');
-			while (count($urls) > 0) {
-				$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
-				$subset = array_splice($urls, 0, $this->maxParallelRequests);
-				$pool = new RollingCurl(array($this, 'handleCurlResponse'));
-				$pool->window_size = count($subset);		
-				
-				foreach ($subset as $orig => $url) {
-					if (!$isRedirect) $orig = $url;
-					unset($this->redirectQueue[$orig]);
-					$this->debug("...$url");
-					if (!$isRedirect && isset($this->requests[$url])) {
-						$this->debug("......in memory");
-					/*
-					} elseif ($this->isCached($url)) {
-						$this->debug("......is cached");
-						if (!$this->minimiseMemoryUse) {
-							$this->requests[$url] = $this->getCached($url);
-						}
-					*/
-					} else {
-						$this->debug("......adding to pool");
-						$req_url = $this->rewriteUrls($url);
-						$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
-						$req_url = $this->removeFragment($req_url);
-						if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
-							$_meth = 'HEAD';
-						} else {
-							$_meth = 'GET';
-							unset($this->requests[$orig]['wrongGuess']);
-						}						
-						$headers = array();
-						//$headers[] = 'User-Agent: '.$this->userAgent;
-						$headers[] = $this->getUserAgent($req_url);
-						// add referer for picky sites
-						$headers[] = 'Referer: '.$this->referer;
-						// send cookies, if we have any
-						if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
-							$this->debug("......sending cookies: $cookies");
-							$headers[] = 'Cookie: '.$cookies;
-						}
-						$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
-							CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
-							CURLOPT_TIMEOUT => $this->requestOptions['timeout']
-							));
-						$httpRequest->set_original_url($orig);
-						$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
-						$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
-						$pool->add($httpRequest);
-					}
-				}
-				// did we get anything into the pool?
-				if (count($pool) > 0) {
-					$this->debug('Sending request...');
-					$pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
-					$this->debug('Received responses');
-					foreach($subset as $orig => $url) {
-						if (!$isRedirect) $orig = $url;
-						// $this->requests[$orig]['headers']
-						// $this->requests[$orig]['body']
-						// $this->requests[$orig]['effective_url']
-						// check content type
-						if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
-							$this->requests[$orig]['body'] = '';
-							$_header_only_type = true;
-							$this->debug('Header only type returned');
-						} else {
-							$_header_only_type = false;
-						}
-						$status_code = $this->requests[$orig]['status_code'];
-						if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
-							$redirectURL = $this->requests[$orig]['location'];
-							if (!preg_match('!^https?://!i', $redirectURL)) {
-								$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
-							}
-							if ($this->validateURL($redirectURL)) {
-								$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-								// store any cookies
-								$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
-								if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);							
-								$this->redirectQueue[$orig] = $redirectURL;
-							} else {
-								$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
-							}
-						} elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
-							// the response content-type did not match our 'header only' types, 
-							// but we'd issues a HEAD request because we assumed it would. So
-							// let's queue a proper GET request for this item...
-							$this->debug('Wrong guess at content-type, queing GET request');
-							$this->requests[$orig]['wrongGuess'] = true;
-							$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
-						} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
-							// check for <meta name='fragment' content='!'/>
-							// for AJAX sites, e.g. Blogger with its dynamic views templates.
-							// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
-							if (isset($this->requests[$orig]['body'])) {
-								$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
-								if ($redirectURL) {
-									$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
-									$this->redirectQueue[$orig] = $redirectURL;
-								}
-							}
-						}
-						// die($url.' -multi- '.$request->getResponseInfo('effective_url'));
-						unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
-					}
-				}
-			}
-		}
-
-		//////////////////////////////////////////////////////
-		// sequential (file_get_contents)
-		else {
-			$this->debug('Starting sequential fetch (file_get_contents)');
-			$this->debug('Processing set of '.count($urls));
-			foreach ($urls as $orig => $url) {
-				if (!$isRedirect) $orig = $url;
-				unset($this->redirectQueue[$orig]);
-				$this->debug("...$url");
-				if (!$isRedirect && isset($this->requests[$url])) {
-					$this->debug("......in memory");
-				/*
-				} elseif ($this->isCached($url)) {
-					$this->debug("......is cached");
-					if (!$this->minimiseMemoryUse) {
-						$this->requests[$url] = $this->getCached($url);
-					}
-				*/
-				} else {
-					$this->debug("Sending request for $url");
-					$this->requests[$orig]['original_url'] = $orig;
-					$req_url = $this->rewriteUrls($url);
-					$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
-					$req_url = $this->removeFragment($req_url);
-					// send cookies, if we have any
-					$httpContext = $this->httpContext;
-					$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
-					// add referer for picky sites
-					$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
-					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
-						$this->debug("......sending cookies: $cookies");
-						$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
-					}
-					if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
-						$this->debug('Received response');
-						// get status code
-						if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
-							$this->debug('Error: no status code found');
-							// TODO: handle error - no status code
-						} else {
-							$this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
-							// check content type
-							if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
-								$this->requests[$orig]['body'] = '';
-							} else {
-								$this->requests[$orig]['body'] = $html;
-							}
-							$this->requests[$orig]['effective_url'] = $req_url;
-							$this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
-							unset($match);
-							// handle redirect
-							if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
-								$this->requests[$orig]['location'] =  trim($match[1]);
-							}
-							if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
-								$redirectURL = $this->requests[$orig]['location'];
-								if (!preg_match('!^https?://!i', $redirectURL)) {
-									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
-								}
-								if ($this->validateURL($redirectURL)) {
-									$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-									// store any cookies
-									$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
-									if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
-									$this->redirectQueue[$orig] = $redirectURL;
-								} else {
-									$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
-								}
-							} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
-								// check for <meta name='fragment' content='!'/>
-								// for AJAX sites, e.g. Blogger with its dynamic views templates.
-								// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
-								if (isset($this->requests[$orig]['body'])) {
-									$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
-									if ($redirectURL) {
-										$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
-										$this->redirectQueue[$orig] = $redirectURL;
-									}
-								}
-							}
-						}
-					} else {
-						$this->debug('Error retrieving URL');
-						//print_r($req_url);
-						//print_r($http_response_header);
-						//print_r($html);
-						
-						// TODO: handle error - failed to retrieve URL
-					}
-				}
-			}
-		}
-	}
-	
-	public function handleCurlResponse($response, $info, $request) {
-		$orig = $request->url_original;
-		$this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
-		$this->requests[$orig]['body'] = substr($response, $info['header_size']);
-		$this->requests[$orig]['method'] = $request->method;
-		$this->requests[$orig]['effective_url'] = $info['url'];
-		$this->requests[$orig]['status_code'] = (int)$info['http_code'];
-		if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
-			$this->requests[$orig]['location'] =  trim($match[1]);
-		}
-	}
-	
-	protected function headersToString(array $headers, $associative=true) {
-		if (!$associative) {
-			return implode("\n", $headers);
-		} else {
-			$str = '';
-			foreach ($headers as $key => $val) {
-				if (is_array($val)) {
-					foreach ($val as $v) $str .= "$key: $v\n";
-				} else {
-					$str .= "$key: $val\n";
-				}
-			}
-			return rtrim($str);
-		}
-	}
-	
-	public function get($url, $remove=false, $gzdecode=true) {
-		$url = "$url";
-		if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
-			$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
-			$response = $this->requests[$url];
-		/*
-		} elseif ($this->isCached($url)) {
-			$this->debug("URL already fetched - in disk cache ($url)");
-			$response = $this->getCached($url);
-			$this->requests[$url] = $response;
-		*/
-		} else {
-			$this->debug("Fetching URL ($url)");
-			$this->fetchAll(array($url));
-			if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
-				$response = $this->requests[$url];
-			} else {
-				$this->debug("Request failed");
-				$response = false;
-			}
-		}
-		/*
-		if ($this->minimiseMemoryUse && $response) {
-			$this->cache($url);
-			unset($this->requests[$url]);
-		}
-		*/
-		if ($remove && $response) unset($this->requests[$url]);
-		if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
-			if ($html = gzdecode($response['body'])) {
-				$response['body'] = $html;
-			}
-		}
-		return $response;
-	}
-	
-	public function parallelSupport() {
-		return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
-	}
-	
-	private function headerOnlyType($headers) {
-		if (preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $match)) {
-			// look for full mime type (e.g. image/jpeg) or just type (e.g. image)
-			$match[1] = strtolower(trim($match[1]));
-			$match[2] = strtolower(trim($match[2]));
-			foreach (array($match[1], $match[2]) as $mime) {
-				if (in_array($mime, $this->headerOnlyTypes)) return true;
-			}
-		}
-		return false;
-	}
-	
-	private function possibleUnsupportedType($url) {
-		$path = @parse_url($url, PHP_URL_PATH);
-		if ($path && strpos($path, '.') !== false) {
-			$ext = strtolower(trim(pathinfo($path, PATHINFO_EXTENSION)));
-			return in_array($ext, $this->headerOnlyClues);
-		}
-		return false;
-	}
-}
-
-// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
-if (!function_exists('gzdecode')) {
-	function gzdecode($data,&$filename='',&$error='',$maxlength=null) 
-	{
-		$len = strlen($data);
-		if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
-			$error = "Not in GZIP format.";
-			return null;  // Not GZIP format (See RFC 1952)
-		}
-		$method = ord(substr($data,2,1));  // Compression method
-		$flags  = ord(substr($data,3,1));  // Flags
-		if ($flags & 31 != $flags) {
-			$error = "Reserved bits not allowed.";
-			return null;
-		}
-		// NOTE: $mtime may be negative (PHP integer limitations)
-		$mtime = unpack("V", substr($data,4,4));
-		$mtime = $mtime[1];
-		$xfl   = substr($data,8,1);
-		$os    = substr($data,8,1);
-		$headerlen = 10;
-		$extralen  = 0;
-		$extra     = "";
-		if ($flags & 4) {
-			// 2-byte length prefixed EXTRA data in header
-			if ($len - $headerlen - 2 < 8) {
-				return false;  // invalid
-			}
-			$extralen = unpack("v",substr($data,8,2));
-			$extralen = $extralen[1];
-			if ($len - $headerlen - 2 - $extralen < 8) {
-				return false;  // invalid
-			}
-			$extra = substr($data,10,$extralen);
-			$headerlen += 2 + $extralen;
-		}
-		$filenamelen = 0;
-		$filename = "";
-		if ($flags & 8) {
-			// C-style string
-			if ($len - $headerlen - 1 < 8) {
-				return false; // invalid
-			}
-			$filenamelen = strpos(substr($data,$headerlen),chr(0));
-			if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
-				return false; // invalid
-			}
-			$filename = substr($data,$headerlen,$filenamelen);
-			$headerlen += $filenamelen + 1;
-		}
-		$commentlen = 0;
-		$comment = "";
-		if ($flags & 16) {
-			// C-style string COMMENT data in header
-			if ($len - $headerlen - 1 < 8) {
-				return false;    // invalid
-			}
-			$commentlen = strpos(substr($data,$headerlen),chr(0));
-			if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
-				return false;    // Invalid header format
-			}
-			$comment = substr($data,$headerlen,$commentlen);
-			$headerlen += $commentlen + 1;
-		}
-		$headercrc = "";
-		if ($flags & 2) {
-			// 2-bytes (lowest order) of CRC32 on header present
-			if ($len - $headerlen - 2 < 8) {
-				return false;    // invalid
-			}
-			$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
-			$headercrc = unpack("v", substr($data,$headerlen,2));
-			$headercrc = $headercrc[1];
-			if ($headercrc != $calccrc) {
-				$error = "Header checksum failed.";
-				return false;    // Bad header CRC
-			}
-			$headerlen += 2;
-		}
-		// GZIP FOOTER
-		$datacrc = unpack("V",substr($data,-8,4));
-		$datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
-		$isize = unpack("V",substr($data,-4));
-		$isize = $isize[1];
-		// decompression:
-		$bodylen = $len-$headerlen-8;
-		if ($bodylen < 1) {
-			// IMPLEMENTATION BUG!
-			return null;
-		}
-		$body = substr($data,$headerlen,$bodylen);
-		$data = "";
-		if ($bodylen > 0) {
-			switch ($method) {
-			case 8:
-				// Currently the only supported compression method:
-				$data = gzinflate($body,$maxlength);
-				break;
-			default:
-				$error = "Unknown compression method.";
-				return false;
-			}
-		}  // zero-byte body content is allowed
-		// Verifiy CRC32
-		$crc   = sprintf("%u",crc32($data));
-		$crcOK = $crc == $datacrc;
-		$lenOK = $isize == strlen($data);
-		if (!$lenOK || !$crcOK) {
-			$error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
-			return false;
-		}
-		return $data;
-	}
-}
-?>
\ No newline at end of file
+<?php
+/**
+ * Humble HTTP Agent
+ * 
+ * This class is designed to take advantage of parallel HTTP requests
+ * offered by PHP's PECL HTTP extension or the curl_multi_* functions. 
+ * For environments which do not have these options, it reverts to standard sequential 
+ * requests (using file_get_contents())
+ * 
+ * @version 1.4
+ * @date 2013-05-10
+ * @see http://php.net/HttpRequestPool
+ * @author Keyvan Minoukadeh
+ * @copyright 2011-2013 Keyvan Minoukadeh
+ * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
+ */
+
+class HumbleHttpAgent
+{
+	const METHOD_REQUEST_POOL = 1; // transport: HttpRequestPool class (parallel)
+	const METHOD_CURL_MULTI = 2; // transport: curl_multi_* via RollingCurl (parallel)
+	const METHOD_FILE_GET_CONTENTS = 4; // transport: file_get_contents() (sequential fallback)
+	//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
+	const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
+	const UA_PHP = 'PHP/5.4';
+	const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
+	
+	protected $requests = array(); // response data (headers, body, status_code, effective_url, ...) keyed by original URL
+	protected $redirectQueue = array(); // original URL => URL to request on the next redirect pass
+	protected $requestOptions; // timeout/redirect options, defaults set in the constructor
+	protected $maxParallelRequests = 5; // number of URLs taken per batch by the parallel transports
+	protected $cache = null; //TODO
+	protected $httpContext; // stream context options for the file_get_contents() transport
+	protected $minimiseMemoryUse = false; //TODO
+	protected $method; // one of the METHOD_* constants
+	protected $cookieJar; // CookieJar instance created in the constructor
+	public $debug = false;
+	public $debugVerbose = false; // when true, debug() also prints memory usage
+	public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
+	public $maxRedirects = 5; // maximum number of redirect-following passes in fetchAll()
+	public $userAgentMap = array(); // host (or '.parent.domain') => user agent string, see getUserAgent()
+	public $rewriteUrls = array(); // URL substring => array of strtr() replacement pairs, see rewriteUrls()
+	public $userAgentDefault; // user agent used when $userAgentMap has no entry for the host
+	public $referer; // value sent as the Referer header with every request
+	//public $userAgent = 'Mozilla/5.0';
+	
+	// Prevent certain file/mime types
+	// HTTP responses which match these content types will
+	// be returned without body.
+	public $headerOnlyTypes = array();
+	// URLs ending with one of these extensions will
+	// prompt Humble HTTP Agent to send a HEAD request first
+	// to see if returned content type matches $headerOnlyTypes.
+	public $headerOnlyClues = array('pdf','mp3','zip','exe','gif','gzip','gz','jpeg','jpg','mpg','mpeg','png','ppt','mov');
+	// AJAX triggers to search for.
+	// for AJAX sites, e.g. Blogger with its dynamic views templates.
+	public $ajaxTriggers = array("<meta name='fragment' content='!'",'<meta name="fragment" content="!"',"<meta content='!' name='fragment'",'<meta content="!" name="fragment"');
+	
+	//TODO: set max file size
+	//TODO: normalise headers
+	
+	function __construct($requestOptions=null, $method=null) {
+		// defaults: browser-like user agent, Google referer
+		$this->userAgentDefault = self::UA_BROWSER;
+		$this->referer = self::REF_GOOGLE;
+		// use the requested transport if it is one we know, otherwise auto-detect
+		$known = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
+		if (in_array($method, $known)) {
+			$this->method = $method;
+		} elseif (class_exists('HttpRequestPool')) {
+			$this->method = self::METHOD_REQUEST_POOL;
+		} elseif (function_exists('curl_multi_init')) {
+			$this->method = self::METHOD_CURL_MULTI;
+		} else {
+			$this->method = self::METHOD_FILE_GET_CONTENTS;
+		}
+		// RollingCurl is only needed for the curl_multi transport
+		if ($this->method == self::METHOD_CURL_MULTI) {
+			require_once(dirname(__FILE__).'/RollingCurl.php');
+		}
+		// cookies collected from redirect responses are kept in this jar
+		$this->cookieJar = new CookieJar();
+		// built-in request options; caller-supplied options are merged over them.
+		// 'redirect' is 0 because redirects are handled manually (so new hashbang URLs can be rewritten)
+		$defaults = array(
+			'timeout' => 15,
+			'connecttimeout' => 15,
+			'dns_cache_timeout' => 300,
+			'redirect' => 0
+			// TODO: test onprogress?
+		);
+		$this->requestOptions = is_array($requestOptions) ? array_merge($defaults, $requestOptions) : $defaults;
+		// stream context options used by the file_get_contents() transport
+		$http = array(
+			'ignore_errors' => true,
+			'timeout' => $this->requestOptions['timeout'],
+			'max_redirects' => $this->requestOptions['redirect'],
+			'header' => "Accept: */*\r\n"
+		);
+		$this->httpContext = array('http' => $http);
+	}
+	
+	// Print a debug line (only when $this->debug is on) and push it to the client straight away.
+	// With $this->debugVerbose the current and peak memory use (in KB) are appended.
+	protected function debug($msg) {
+		if (!$this->debug) return;
+		echo '* ',$msg;
+		if ($this->debugVerbose) echo ' - mem used: ',round(memory_get_usage()/1024, 2),' (peak: ',round(memory_get_peak_usage()/1024, 2),')';
+		echo "\n";
+		// ob_flush() raises a notice when no output buffer is active, so only call it when one exists
+		if (ob_get_level() > 0) ob_flush();
+		flush();
+	}
+	
+	protected function getUserAgent($url, $asArray=false) {
+		// Pick the user agent for $url's host: the exact host first (minus a leading 'www.'),
+		// then the parent domain written with a leading dot (e.g. '.example.org'), else the default.
+		$host = @parse_url($url, PHP_URL_HOST);
+		if (strtolower(substr($host, 0, 4)) == 'www.') {
+			$host = substr($host, 4);
+		}
+		$ua = $this->userAgentDefault;
+		if ($host) {
+			$candidates = array($host);
+			$labels = explode('.', $host);
+			if (count($labels) > 1) {
+				$candidates[] = '.'.implode('.', array_slice($labels, 1));
+			}
+			foreach ($candidates as $candidate) {
+				if (isset($this->userAgentMap[$candidate])) {
+					$ua = $this->userAgentMap[$candidate];
+					break;
+				}
+			}
+		}
+		if ($asArray) {
+			return array('User-Agent' => $ua);
+		}
+		return 'User-Agent: '.$ua;
+	}
+	
+	public function rewriteHashbangFragment($url) {
+		// Rewrite 'page#!state' to 'page?_escaped_fragment_=state'; URLs without '#!' are returned untouched.
+		if (strpos($url, '#!') === false) {
+			return $url;
+		}
+		// TODO: is SimplePie_IRI included?
+		$iri = new SimplePie_IRI($url);
+		$escaped = (string)substr($iri->fragment, 1); // fragment minus its leading '!'
+		$iri->fragment = null;
+		$params = array();
+		if (isset($iri->query)) {
+			parse_str($iri->query, $params);
+		}
+		$params['_escaped_fragment_'] = $escaped;
+		$iri->query = str_replace('%2F', '/', http_build_query($params)); // literal slashes are needed for some sites
+		return $iri->get_iri();
+	}
+	
+	public function getRedirectURLfromHTML($url, $html) {
+		// a meta refresh target takes precedence;
+		// otherwise fall back to the AJAX-crawling ('_escaped_fragment_') URL, if the page asks for one
+		$target = $this->getMetaRefreshURL($url, $html);
+		if ($target) return $target;
+		return $this->getUglyURL($url, $html);
+	}
+	
+	public function getMetaRefreshURL($url, $html) {
+		// Return the absolute target of a <meta http-equiv="refresh"> tag found in $html, or false.
+		// e.g. <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
+		if ($html == '') return false;
+		$pattern = '!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i';
+		if (!preg_match($pattern, $html, $found)) {
+			return false;
+		}
+		$target = $found[1];
+		if (!preg_match('!^https?://!i', $target)) {
+			// relative target: resolve it against the page URL
+			$base = new SimplePie_IRI($url);
+			// remove '//' in URL path (causes URLs not to resolve properly)
+			if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
+			$target = SimplePie_IRI::absolutize($base, $target);
+			if (!$target) {
+				return false;
+			}
+		}
+		$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$target);
+		return $target;
+	}
+	
+	public function getUglyURL($url, $html) {
+		// Return the '_escaped_fragment_' variant of $url if $html contains one of $this->ajaxTriggers, else false.
+		if ($html == '') return false;
+		$found = false;
+		foreach ($this->ajaxTriggers as $string) {
+			// stripos() returns 0 for a match at offset 0, so compare against false explicitly
+			if (stripos($html, $string) !== false) {
+				$found = true;
+				break;
+			}
+		}
+		if (!$found) return false;
+		$iri = new SimplePie_IRI($url);
+		$query = array();
+		if (isset($iri->query)) parse_str($iri->query, $query);
+		// the parameter is added with an empty value, keeping any existing query parameters
+		$query['_escaped_fragment_'] = '';
+		$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
+		$ugly_url = $iri->get_iri();
+		$this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
+		return $ugly_url;
+	}
+	
+	public function removeFragment($url) {
+		// drop everything from the first '#' onwards; URLs without one pass through unchanged
+		$hash = strpos($url, '#');
+		if ($hash !== false) {
+			$url = substr($url, 0, $hash);
+		}
+		return $url;
+	}
+	
+	public function rewriteUrls($url) {
+		// apply the strtr() pairs of the first rule whose key occurs in $url and whose action is an array
+		foreach ($this->rewriteUrls as $needle => $pairs) {
+			if (strpos($url, $needle) === false || !is_array($pairs)) {
+				continue;
+			}
+			return strtr($url, $pairs);
+		}
+		return $url;
+	}
+	
+	public function enableDebug($bool=true) { // switch the output of debug() on or off
+		$this->debug = (bool)$bool;
+	}
+	
+	public function minimiseMemoryUse($bool = true) { // only stores the flag; the code that would act on it is commented out (TODO)
+		$this->minimiseMemoryUse = $bool;
+	}
+	
+	public function setMaxParallelRequests($max) { // number of URLs fetchAllOnce() takes per parallel batch
+		$this->maxParallelRequests = $max;
+	}
+	
+	public function validateUrl($url) {
+		// Returns the sanitised URL when it is a valid http(s) URL, false otherwise.
+		$url = filter_var($url, FILTER_SANITIZE_URL);
+		// FILTER_VALIDATE_URL already requires a scheme; the explicit FILTER_FLAG_SCHEME_REQUIRED
+		// flag was deprecated in PHP 7.3 and removed in 8.0, where naming it is a fatal error.
+		$test = filter_var($url, FILTER_VALIDATE_URL);
+		// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
+		if ($test === false) {
+			$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
+		}
+		// only http and https URLs are accepted (the scheme match is case-sensitive)
+		return ($test !== false && $test !== null && preg_match('!^https?://!', $url)) ? $url : false;
+	}
+	
+	public function fetchAll(array $urls) {
+		// first pass fetches the URLs as given; each further pass follows the redirects queued by the previous one
+		$this->fetchAllOnce($urls, false);
+		for ($round = 1; !empty($this->redirectQueue) && $round <= $this->maxRedirects; $round++) {
+			$this->debug("Following redirects #$round...");
+			$this->fetchAllOnce($this->redirectQueue, true);
+		}
+	}
+	
+	// Fetch all URLs once, without following redirects: responses go into $this->requests, pending redirects into $this->redirectQueue
+	public function fetchAllOnce(array $urls, $isRedirect=false) {
+		if (!$isRedirect) $urls = array_unique($urls); // on redirect passes the array keys carry the original URLs
+		if (empty($urls)) return;
+		
+		//////////////////////////////////////////////////////
+		// parallel (HttpRequestPool)
+		if ($this->method == self::METHOD_REQUEST_POOL) {
+			$this->debug('Starting parallel fetch (HttpRequestPool)');
+			try {
+				while (count($urls) > 0) {
+					$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+					$subset = array_splice($urls, 0, $this->maxParallelRequests);
+					$pool = new HttpRequestPool();
+					foreach ($subset as $orig => $url) {
+						if (!$isRedirect) $orig = $url;
+						unset($this->redirectQueue[$orig]);
+						$this->debug("...$url");
+						if (!$isRedirect && isset($this->requests[$url])) {
+							$this->debug("......in memory");
+						/*
+						} elseif ($this->isCached($url)) {
+							$this->debug("......is cached");
+							if (!$this->minimiseMemoryUse) {
+								$this->requests[$url] = $this->getCached($url);
+							}
+						*/
+						} else {
+							$this->debug("......adding to pool");
+							$req_url = $this->rewriteUrls($url);
+							$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+							$req_url = $this->removeFragment($req_url);
+							if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+								$_meth = HttpRequest::METH_HEAD;
+							} else {
+								$_meth = HttpRequest::METH_GET;
+								unset($this->requests[$orig]['wrongGuess']);
+							}
+							$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
+							// send cookies, if we have any
+							if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+								$this->debug("......sending cookies: $cookies");
+								$httpRequest->addHeaders(array('Cookie' => $cookies));
+							}
+							//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
+							$httpRequest->addHeaders($this->getUserAgent($req_url, true));
+							// add referer for picky sites
+							$httpRequest->addheaders(array('Referer' => $this->referer));
+							$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+							$this->requests[$orig]['original_url'] = $orig;
+							$pool->attach($httpRequest);
+						}
+					}
+					// did we get anything into the pool?
+					if (count($pool) > 0) {
+						$this->debug('Sending request...');
+						try {
+							$pool->send();
+						} catch (HttpRequestPoolException $e) {
+							// do nothing
+						}
+						$this->debug('Received responses');
+						foreach($subset as $orig => $url) {
+							if (!$isRedirect) $orig = $url;
+							$request = $this->requests[$orig]['httpRequest']; // NOTE(review): looks unset for URLs skipped above as "in memory" - confirm
+							//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
+							// getResponseHeader() doesn't return status line, so, for consistency...
+							$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
+							// check content type
+							// TODO: use getResponseHeader('content-type') or getResponseInfo()
+							if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+								$this->requests[$orig]['body'] = '';
+								$_header_only_type = true;
+								$this->debug('Header only type returned');
+							} else {
+								$this->requests[$orig]['body'] = $request->getResponseBody();
+								$_header_only_type = false;
+							}
+							$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
+							$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
+							// is redirect?
+							if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
+								$redirectURL = $request->getResponseHeader('location');
+								if (!preg_match('!^https?://!i', $redirectURL)) {
+									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+								}
+								if ($this->validateURL($redirectURL)) {
+									$this->debug('Redirect detected. Valid URL: '.$redirectURL);
+									// store any cookies
+									$cookies = $request->getResponseHeader('set-cookie');
+									if ($cookies && !is_array($cookies)) $cookies = array($cookies);
+									if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
+									$this->redirectQueue[$orig] = $redirectURL;
+								} else {
+									$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+								}
+							} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
+								// the response content-type did not match our 'header only' types, 
+								// but we'd issued a HEAD request because we assumed it would. So
+								// let's queue a proper GET request for this item...
+								$this->debug('Wrong guess at content-type, queing GET request');
+								$this->requests[$orig]['wrongGuess'] = true;
+								$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+							} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+								// check for <meta name='fragment' content='!'/>
+								// for AJAX sites, e.g. Blogger with its dynamic views templates.
+								// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+								if (isset($this->requests[$orig]['body'])) {
+									$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+									if ($redirectURL) {
+										$this->redirectQueue[$orig] = $redirectURL;
+									}
+								}
+							}
+							//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+							$pool->detach($request);
+							unset($this->requests[$orig]['httpRequest'], $request);
+							/*
+							if ($this->minimiseMemoryUse) {
+								if ($this->cache($url)) {
+									unset($this->requests[$url]);
+								}
+							}
+							*/
+						}
+					}
+				}
+			} catch (HttpException $e) {
+				$this->debug($e);
+				return false;
+			}
+		}
+		
+		//////////////////////////////////////////////////////////
+		// parallel (curl_multi_*)
+		elseif ($this->method == self::METHOD_CURL_MULTI) {
+			$this->debug('Starting parallel fetch (curl_multi_*)');
+			while (count($urls) > 0) {
+				$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
+				$subset = array_splice($urls, 0, $this->maxParallelRequests);
+				$pool = new RollingCurl(array($this, 'handleCurlResponse'));
+				$pool->window_size = count($subset);		
+				
+				foreach ($subset as $orig => $url) {
+					if (!$isRedirect) $orig = $url;
+					unset($this->redirectQueue[$orig]);
+					$this->debug("...$url");
+					if (!$isRedirect && isset($this->requests[$url])) {
+						$this->debug("......in memory");
+					/*
+					} elseif ($this->isCached($url)) {
+						$this->debug("......is cached");
+						if (!$this->minimiseMemoryUse) {
+							$this->requests[$url] = $this->getCached($url);
+						}
+					*/
+					} else {
+						$this->debug("......adding to pool");
+						$req_url = $this->rewriteUrls($url);
+						$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+						$req_url = $this->removeFragment($req_url);
+						if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
+							$_meth = 'HEAD';
+						} else {
+							$_meth = 'GET';
+							unset($this->requests[$orig]['wrongGuess']);
+						}						
+						$headers = array();
+						//$headers[] = 'User-Agent: '.$this->userAgent;
+						$headers[] = $this->getUserAgent($req_url);
+						// add referer for picky sites
+						$headers[] = 'Referer: '.$this->referer;
+						// send cookies, if we have any
+						if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+							$this->debug("......sending cookies: $cookies");
+							$headers[] = 'Cookie: '.$cookies;
+						}
+						$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
+							CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], // NOTE(review): the 'connecttimeout' option is not used here - intentional?
+							CURLOPT_TIMEOUT => $this->requestOptions['timeout']
+							));
+						$httpRequest->set_original_url($orig);
+						$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
+						$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
+						$pool->add($httpRequest);
+					}
+				}
+				// did we get anything into the pool?
+				if (count($pool) > 0) {
+					$this->debug('Sending request...');
+					$pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
+					$this->debug('Received responses');
+					foreach($subset as $orig => $url) {
+						if (!$isRedirect) $orig = $url;
+						// $this->requests[$orig]['headers']
+						// $this->requests[$orig]['body']
+						// $this->requests[$orig]['effective_url']
+						// check content type
+						if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+							$this->requests[$orig]['body'] = '';
+							$_header_only_type = true;
+							$this->debug('Header only type returned');
+						} else {
+							$_header_only_type = false;
+						}
+						$status_code = $this->requests[$orig]['status_code'];
+						if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+							$redirectURL = $this->requests[$orig]['location'];
+							if (!preg_match('!^https?://!i', $redirectURL)) {
+								$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+							}
+							if ($this->validateURL($redirectURL)) {
+								$this->debug('Redirect detected. Valid URL: '.$redirectURL);
+								// store any cookies
+								$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+								if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);							
+								$this->redirectQueue[$orig] = $redirectURL;
+							} else {
+								$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+							}
+						} elseif (!$_header_only_type && $this->requests[$orig]['method'] == 'HEAD') {
+							// the response content-type did not match our 'header only' types, 
+							// but we'd issued a HEAD request because we assumed it would. So
+							// let's queue a proper GET request for this item...
+							$this->debug('Wrong guess at content-type, queing GET request');
+							$this->requests[$orig]['wrongGuess'] = true;
+							$this->redirectQueue[$orig] = $this->requests[$orig]['effective_url'];
+						} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+							// check for <meta name='fragment' content='!'/>
+							// for AJAX sites, e.g. Blogger with its dynamic views templates.
+							// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+							if (isset($this->requests[$orig]['body'])) {
+								$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+								if ($redirectURL) {
+									$this->redirectQueue[$orig] = $redirectURL;
+								}
+							}
+						}
+						// die($url.' -multi- '.$request->getResponseInfo('effective_url'));
+						unset($this->requests[$orig]['httpRequest'], $this->requests[$orig]['method']);
+					}
+				}
+			}
+		}
+
+		//////////////////////////////////////////////////////
+		// sequential (file_get_contents)
+		else {
+			$this->debug('Starting sequential fetch (file_get_contents)');
+			$this->debug('Processing set of '.count($urls));
+			foreach ($urls as $orig => $url) {
+				if (!$isRedirect) $orig = $url;
+				unset($this->redirectQueue[$orig]);
+				$this->debug("...$url");
+				if (!$isRedirect && isset($this->requests[$url])) {
+					$this->debug("......in memory");
+				/*
+				} elseif ($this->isCached($url)) {
+					$this->debug("......is cached");
+					if (!$this->minimiseMemoryUse) {
+						$this->requests[$url] = $this->getCached($url);
+					}
+				*/
+				} else {
+					$this->debug("Sending request for $url");
+					$this->requests[$orig]['original_url'] = $orig;
+					$req_url = $this->rewriteUrls($url);
+					$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
+					$req_url = $this->removeFragment($req_url);
+					// build the per-request stream context headers: user agent, referer and any matching cookies
+					$httpContext = $this->httpContext;
+					$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
+					// add referer for picky sites
+					$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
+					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+						$this->debug("......sending cookies: $cookies");
+						$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
+					}
+					if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
+						$this->debug('Received response');
+						// get status code
+						if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
+							$this->debug('Error: no status code found');
+							// TODO: handle error - no status code
+						} else {
+							$this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
+							// check content type
+							if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
+								$this->requests[$orig]['body'] = '';
+							} else {
+								$this->requests[$orig]['body'] = $html;
+							}
+							$this->requests[$orig]['effective_url'] = $req_url;
+							$this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
+							unset($match);
+							// handle redirect
+							if (preg_match('/^Location:(.*?)$/mi', $this->requests[$orig]['headers'], $match)) {
+								$this->requests[$orig]['location'] =  trim($match[1]);
+							}
+							if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
+								$redirectURL = $this->requests[$orig]['location'];
+								if (!preg_match('!^https?://!i', $redirectURL)) {
+									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+								}
+								if ($this->validateURL($redirectURL)) {
+									$this->debug('Redirect detected. Valid URL: '.$redirectURL);
+									// store any cookies
+									$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
+									if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+									$this->redirectQueue[$orig] = $redirectURL;
+								} else {
+									$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
+								}
+							} elseif (strpos($this->requests[$orig]['effective_url'], '_escaped_fragment_') === false) {
+								// check for <meta name='fragment' content='!'/>
+								// for AJAX sites, e.g. Blogger with its dynamic views templates.
+								// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
+								if (isset($this->requests[$orig]['body'])) {
+									$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
+									if ($redirectURL) {
+										$this->redirectQueue[$orig] = $redirectURL;
+									}
+								}
+							}
+						}
+					} else {
+						$this->debug('Error retrieving URL');
+						//print_r($req_url);
+						//print_r($http_response_header);
+						//print_r($html);
+						
+						// TODO: handle error - failed to retrieve URL
+					}
+				}
+			}
+		}
+	}
+	
+	public function handleCurlResponse($response, $info, $request) {
+		// callback for the curl transport: store the pieces of a finished response under the request's original URL
+		$key = $request->url_original;
+		$headerSize = $info['header_size'];
+		$this->requests[$key]['headers'] = $headers = substr($response, 0, $headerSize);
+		$this->requests[$key]['body'] = substr($response, $headerSize);
+		$this->requests[$key]['method'] = $request->method;
+		$this->requests[$key]['effective_url'] = $info['url'];
+		$this->requests[$key]['status_code'] = (int)$info['http_code'];
+		if (preg_match('/^Location:(.*?)$/mi', $headers, $m)) $this->requests[$key]['location'] = trim($m[1]);
+	}
+	
+	protected function headersToString(array $headers, $associative=true) {
+		// a plain list of raw header lines only needs joining
+		if (!$associative) {
+			return implode("\n", $headers);
+		}
+		// name => value (or name => list of values): emit one "Name: value" line per value
+		$lines = '';
+		foreach ($headers as $name => $values) {
+			$list = is_array($values) ? $values : array($values);
+			foreach ($list as $value) {
+				$lines .= "$name: $value\n";
+			}
+		}
+		return rtrim($lines);
+	}
+	
+	public function get($url, $remove=false, $gzdecode=true) {
+		// Return the stored response array for $url (headers, body, status_code, effective_url, ...),
+		// fetching it first if necessary. Returns false when the request failed.
+		// $remove: forget the stored response afterwards; $gzdecode: inflate gzip-encoded bodies.
+		$url = "$url";
+		if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+			$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
+			$response = $this->requests[$url];
+		/*
+		} elseif ($this->isCached($url)) {
+			$this->debug("URL already fetched - in disk cache ($url)");
+			$response = $this->getCached($url);
+			$this->requests[$url] = $response;
+		*/
+		} else {
+			$this->debug("Fetching URL ($url)");
+			$this->fetchAll(array($url));
+			if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
+				$response = $this->requests[$url];
+			} else {
+				$this->debug("Request failed");
+				$response = false;
+			}
+		}
+		// TODO (disk cache): if ($this->minimiseMemoryUse && $response) { $this->cache($url); unset($this->requests[$url]); }
+		if ($remove && $response) unset($this->requests[$url]);
+		// A failed request leaves $response === false: there are no headers to inspect, so skip decoding.
+		// stripos() is compared against false explicitly because a match at offset 0 is falsy.
+		if ($gzdecode && $response && stripos($response['headers'], 'Content-Encoding: gzip') !== false) {
+			if ($html = gzdecode($response['body'])) {
+				$response['body'] = $html;
+			}
+		}
+		return $response;
+	}
+	
+	public function parallelSupport() { // true when the HttpRequestPool class or curl_multi_init() is available
+		return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
+	}
+	
+	private function headerOnlyType($headers) {
+		// true when the Content-Type found in $headers is listed in $this->headerOnlyTypes
+		if (!preg_match('!^Content-Type:\s*(([a-z-]+)/([^;\r\n ]+))!im', $headers, $m)) {
+			return false;
+		}
+		// the list may name a full mime type (e.g. image/jpeg)
+		// or just the top-level type (e.g. image)
+		$full = strtolower(trim($m[1]));
+		$type = strtolower(trim($m[2]));
+		return in_array($full, $this->headerOnlyTypes) || in_array($type, $this->headerOnlyTypes);
+	}
+	
+	private function possibleUnsupportedType($url) {
+		// guess from the file extension in the URL path whether the response
+		// may be one of the 'header only' types (extension listed in $this->headerOnlyClues)
+		$path = @parse_url($url, PHP_URL_PATH);
+		if (!$path || strpos($path, '.') === false) return false;
+		$extension = pathinfo($path, PATHINFO_EXTENSION);
+		return in_array(strtolower(trim($extension)), $this->headerOnlyClues);
+	}
+}
+
+// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
+if (!function_exists('gzdecode')) {
+	/**
+	 * Fallback for PHP builds without a native gzdecode(): decodes one GZIP member
+	 * (RFC 1952) and checks the CRC32 and length stored in its trailer.
+	 *
+	 * @param string $data raw GZIP data
+	 * @param string $filename receives the file name stored in the header ('' if none)
+	 * @param string $error receives a message for some of the failure cases
+	 * @param int $maxlength passed on to gzinflate() as its length limit
+	 * @return string|null|false decoded data; null or false when $data cannot be decoded
+	 */
+	function gzdecode($data,&$filename='',&$error='',$maxlength=null)
+	{
+		$len = strlen($data);
+		if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {
+			$error = "Not in GZIP format.";
+			return null;  // Not GZIP format (See RFC 1952)
+		}
+		$method = ord(substr($data,2,1));  // Compression method
+		$flags  = ord(substr($data,3,1));  // Flags
+		// Only the low five flag bits are defined. The parentheses matter: '!=' binds
+		// tighter than '&', so without them the test became $flags & (31 != $flags).
+		if (($flags & 31) != $flags) {
+			$error = "Reserved bits not allowed.";
+			return null;
+		}
+		// Fixed header is 10 bytes: ID1 ID2 CM FLG MTIME(4) XFL OS. MTIME, XFL and OS are not needed here.
+		$headerlen = 10;
+		if ($flags & 4) {
+			// FEXTRA: 2-byte little-endian length directly after the fixed header, then that many bytes
+			if ($len - $headerlen - 2 < 8) {
+				return false;  // invalid
+			}
+			$extralen = unpack("v",substr($data,$headerlen,2));
+			$extralen = $extralen[1];
+			if ($len - $headerlen - 2 - $extralen < 8) {
+				return false;  // invalid
+			}
+			$headerlen += 2 + $extralen;
+		}
+		$filename = "";
+		if ($flags & 8) {
+			// FNAME: C-style (NUL-terminated) string
+			if ($len - $headerlen - 1 < 8) {
+				return false; // invalid
+			}
+			$filenamelen = strpos(substr($data,$headerlen),chr(0));
+			if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {
+				return false; // invalid
+			}
+			$filename = substr($data,$headerlen,$filenamelen);
+			$headerlen += $filenamelen + 1;
+		}
+		if ($flags & 16) {
+			// FCOMMENT: C-style string in the header; its content is skipped
+			if ($len - $headerlen - 1 < 8) {
+				return false;    // invalid
+			}
+			$commentlen = strpos(substr($data,$headerlen),chr(0));
+			if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {
+				return false;    // Invalid header format
+			}
+			$headerlen += $commentlen + 1;
+		}
+		if ($flags & 2) {
+			// FHCRC: 2-bytes (lowest order) of CRC32 on header present
+			if ($len - $headerlen - 2 < 8) {
+				return false;    // invalid
+			}
+			$calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;
+			$headercrc = unpack("v", substr($data,$headerlen,2));
+			$headercrc = $headercrc[1];
+			if ($headercrc != $calccrc) {
+				$error = "Header checksum failed.";
+				return false;    // Bad header CRC
+			}
+			$headerlen += 2;
+		}
+		// GZIP FOOTER: CRC32 of the uncompressed data, then its length (4 bytes each, little-endian)
+		$datacrc = unpack("V",substr($data,-8,4));
+		$datacrc = sprintf('%u',$datacrc[1] & 0xFFFFFFFF);
+		$isize = unpack("V",substr($data,-4));
+		$isize = $isize[1];
+		// decompression:
+		$bodylen = $len-$headerlen-8;
+		if ($bodylen < 1) {
+			// nothing left between header and footer
+			return null;
+		}
+		$body = substr($data,$headerlen,$bodylen);
+		$data = "";
+		if ($bodylen > 0) {
+			switch ($method) {
+			case 8:
+				// Currently the only supported compression method:
+				$data = gzinflate($body,$maxlength);
+				break;
+			default:
+				$error = "Unknown compression method.";
+				return false;
+			}
+		}  // always entered: $bodylen < 1 has already returned above
+		// Verify CRC32 and length against the footer
+		$crc   = sprintf("%u",crc32($data));
+		$crcOK = $crc == $datacrc;
+		$lenOK = $isize == strlen($data);
+		if (!$lenOK || !$crcOK) {
+			$error = ( $lenOK ? '' : 'Length check FAILED. ') . ( $crcOK ? '' : 'Checksum FAILED.');
+			return false;
+		}
+		return $data;
+	}
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
index ecd46d5f..c524a1ee 100644
--- a/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
+++ b/inc/3rdparty/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php
@@ -1,79 +1,78 @@
-<?php
-/**
- * Humble HTTP Agent extension for SimplePie_File
- * 
- * This class is designed to extend and override SimplePie_File
- * in order to prevent duplicate HTTP requests being sent out.
- * The idea is to initialise an instance of Humble HTTP Agent
- * and attach it, to a static class variable, of this class.
- * SimplePie will then automatically initialise this class
- * 
- * @date 2011-02-28
- */
-
-class SimplePie_HumbleHttpAgent extends SimplePie_File
-{
-	protected static $agent;
-	var $url;
-	var $useragent;
-	var $success = true;
-	var $headers = array();
-	var $body;
-	var $status_code;
-	var $redirects = 0;
-	var $error;
-	var $method = SIMPLEPIE_FILE_SOURCE_NONE;
-
-	public static function set_agent(HumbleHttpAgent $agent) {
-		self::$agent = $agent;
-	}
-	
-	public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
-		if (class_exists('idna_convert'))
-		{
-			$idn = new idna_convert();
-			$parsed = SimplePie_Misc::parse_url($url);
-			$url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
-		}
-		$this->url = $url;
-		$this->useragent = $useragent;
-		if (preg_match('/^http(s)?:\/\//i', $url))
-		{
-			if (!is_array($headers))
-			{
-				$headers = array();
-			}
-			$this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
-			$headers2 = array();
-			foreach ($headers as $key => $value) {
-				$headers2[] = "$key: $value";
-			}
-			//TODO: allow for HTTP headers
-			// curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2);
-
-			$response = self::$agent->get($url);
-			
-			if ($response === false || !isset($response['status_code'])) {
-				$this->error = 'failed to fetch URL';
-				$this->success = false;
-			} else {
-				// The extra lines at the end are there to satisfy SimplePie's HTTP parser.
-				// The class expects a full HTTP message, whereas we're giving it only
-				// headers - the new lines indicate the start of the body.
-				$parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
-				if ($parser->parse()) {
-					$this->headers = $parser->headers;
-					//$this->body = $parser->body;
-					$this->body = $response['body'];
-					$this->status_code = $parser->status_code;
-				}
-			}
-		}
-		else
-		{
-			$this->error = 'invalid URL';
-			$this->success = false;
-		}
-	}
-}
-?>
\ No newline at end of file
+<?php
+/**
+ * Humble HTTP Agent extension for SimplePie_File
+ * 
+ * This class is designed to extend and override SimplePie_File
+ * in order to prevent duplicate HTTP requests being sent out.
+ * The idea is to initialise an instance of Humble HTTP Agent
+ * and attach it, to a static class variable, of this class.
+ * SimplePie will then automatically initialise this class
+ * 
+ * @date 2011-02-28
+ */
+
+class SimplePie_HumbleHttpAgent extends SimplePie_File
+{
+	/** @var HumbleHttpAgent|null shared agent, assigned through set_agent() */
+	protected static $agent;
+	var $url;
+	var $useragent;
+	var $success = true;
+	var $headers = array();
+	var $body;
+	var $status_code;
+	var $redirects = 0;
+	var $error;
+	var $method = SIMPLEPIE_FILE_SOURCE_NONE;
+
+	/** Registers the agent that every instance of this class fetches through. */
+	public static function set_agent(HumbleHttpAgent $agent) {
+		self::$agent = $agent;
+	}
+
+	/**
+	 * Fetches $url through the shared agent; $timeout, $redirects, $headers and
+	 * $force_fsockopen are accepted for signature compatibility but are not used.
+	 */
+	public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
+		if (class_exists('idna_convert'))
+		{
+			$idn = new idna_convert();
+			$parsed = SimplePie_Misc::parse_url($url);
+			$url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
+		}
+		$this->url = $url;
+		$this->useragent = $useragent;
+		if (!preg_match('/^http(s)?:\/\//i', $url)) {
+			$this->error = 'invalid URL';
+			$this->success = false;
+			return;
+		}
+		$this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;
+		if (self::$agent === null) {
+			// set_agent() was never called: report a failure instead of a fatal error
+			$this->error = 'no HTTP agent set';
+			$this->success = false;
+			return;
+		}
+		//TODO: allow for HTTP headers ($headers is currently ignored)
+		$response = self::$agent->get($url);
+		if ($response === false || !isset($response['status_code'])) {
+			$this->error = 'failed to fetch URL';
+			$this->success = false;
+			return;
+		}
+		// The extra lines at the end are there to satisfy SimplePie's HTTP parser.
+		// The class expects a full HTTP message, whereas we're giving it only
+		// headers - the new lines indicate the start of the body.
+		$parser = new SimplePie_HTTP_Parser($response['headers']."\r\n\r\n");
+		if (!$parser->parse()) {
+			$this->error = 'failed to parse HTTP headers';
+			$this->success = false;
+			return;
+		}
+		$this->headers = $parser->headers;
+		$this->body = $response['body'];
+		$this->status_code = $parser->status_code;
+	}
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect.php b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
index 09b11546..382d869c 100644
--- a/inc/3rdparty/libraries/language-detect/LanguageDetect.php
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect.php
@@ -6,23 +6,24 @@
  * Attempts to detect the language of a sample of text by correlating ranked
  * 3-gram frequencies to a table of 3-gram frequencies of known languages.
  *
- * Implements a version of a technique originally proposed by Cavnar & Trenkle 
- * (1994): "N-Gram-Based Text Categorization" 
+ * Implements a version of a technique originally proposed by Cavnar & Trenkle
+ * (1994): "N-Gram-Based Text Categorization"
  *
- * PHP versions 4 and 5
+ * PHP version 5
  *
- * @category   Text
- * @package    Text_LanguageDetect
- * @author     Nicholas Pisarro <infinityminusnine+pear@gmail.com>
- * @copyright  2005-2006 Nicholas Pisarro
- * @license    http://www.debian.org/misc/bsd.license BSD
- * @version    CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $
- * @link       http://pear.php.net/package/Text_LanguageDetect/
- * @link       http://langdetect.blogspot.com/
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2005-2006 Nicholas Pisarro
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ * @link      http://langdetect.blogspot.com/
  */
 
-//require_once 'PEAR.php';
-require_once 'Parser.php';
+require_once 'LanguageDetect/Exception.php';
+require_once 'LanguageDetect/Parser.php';
+require_once 'LanguageDetect/ISO639.php';
 
 /**
  * Language detection class
@@ -41,9 +42,10 @@ require_once 'Parser.php';
  *
  * echo "Supported languages:\n";
  *
- * $langs = $l->getLanguages();
- * if (PEAR::isError($langs)) {
- *     die($langs->getMessage());
+ * try {
+ *     $langs = $l->getLanguages();
+ * } catch (Text_LanguageDetect_Exception $e) {
+ *     die($e->getMessage());
  * }
  *
  * sort($langs);
@@ -54,38 +56,38 @@ require_once 'Parser.php';
  * }
  * </code>
  *
- * @category   Text
- * @package    Text_LanguageDetect
- * @author     Nicholas Pisarro <infinityminusnine+pear@gmail.com>
- * @copyright  2005 Nicholas Pisarro
- * @license    http://www.debian.org/misc/bsd.license BSD
- * @version    Release: @package_version@
- * @todo       allow users to generate their own language models
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2005 Nicholas Pisarro
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   Release: @package_version@
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ * @todo      allow users to generate their own language models
  */
- 
 class Text_LanguageDetect
 {
-    /** 
+    /**
      * The filename that stores the trigram data for the detector
      *
-     * If this value starts with a slash (/) or a dot (.) the value of 
+     * If this value starts with a slash (/) or a dot (.) the value of
      * $this->_data_dir will be ignored
-     * 
+     *
      * @var      string
      * @access   private
      */
-    var $_db_filename = './lang.dat';
+    var $_db_filename = 'lang.dat';
 
     /**
      * The filename that stores the unicode block definitions
      *
-     * If this value starts with a slash (/) or a dot (.) the value of 
+     * If this value starts with a slash (/) or a dot (.) the value of
      * $this->_data_dir will be ignored
-     * 
+     *
      * @var string
      * @access private
      */
-    var $_unicode_db_filename = './unicode_blocks.dat';
+    var $_unicode_db_filename = 'unicode_blocks.dat';
 
     /**
      * The data directory
@@ -99,11 +101,8 @@ class Text_LanguageDetect
 
     /**
      * The trigram data for comparison
-     * 
-     * Will be loaded on start from $this->_db_filename
      *
-     * May be set to a PEAR_Error object if there is an error during its 
-     * initialization
+     * Will be loaded on start from $this->_db_filename
      *
      * @var      array
      * @access   private
@@ -120,7 +119,7 @@ class Text_LanguageDetect
 
     /**
      * The size of the trigram data arrays
-     * 
+     *
      * @var      int
      * @access   private
      */
@@ -140,7 +139,7 @@ class Text_LanguageDetect
 
     /**
      * Whether or not to simulate perl's Language::Guess exactly
-     * 
+     *
      * @access  private
      * @var     bool
      * @see     setPerlCompatible()
@@ -164,19 +163,25 @@ class Text_LanguageDetect
      */
     var $_clusters;
 
+    /**
+     * Which type of "language names" are accepted and returned:
+     *
+     * 0 - language name ("english")
+     * 2 - 2-letter ISO 639-1 code ("en")
+     * 3 - 3-letter ISO 639-2 code ("eng")
+     */
+    var $_name_mode = 0;
+
     /**
      * Constructor
      *
      * Will attempt to load the language database. If it fails, you will get
-     * a PEAR_Error object returned when you try to use detect()
-     *
+     * an exception.
      */
-    function Text_LanguageDetect($db=null, $unicode_db=null)
+    function __construct()
     {
-		if (isset($db)) $this->_db_filename = $db;
-		if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
-		
         $data = $this->_readdb($this->_db_filename);
+        $this->_checkTrigram($data['trigram']);
         $this->_lang_db = $data['trigram'];
 
         if (isset($data['trigram-unicodemap'])) {
@@ -186,29 +191,32 @@ class Text_LanguageDetect
         // Not yet implemented:
         if (isset($data['trigram-clusters'])) {
             $this->_clusters = $data['trigram-clusters'];
-        }		
+        }
     }
 
     /**
      * Returns the path to the location of the database
      *
-     * @access    private
-     * @return    string    expected path to the language model database
+     * @param string $fname File name to load
+     *
+     * @return string expected path to the language model database
+     * @access private
      */
     function _get_data_loc($fname)
     {
-        return $fname;
+        return dirname(__FILE__).'/'.$fname;
     }
 
     /**
      * Loads the language trigram database from filename
      *
      * Trigram datbase should be a serialize()'d array
-     * 
-     * @access    private
-     * @param     string      $fname   the filename where the data is stored
-     * @return    array                the language model data
-     * @throws    PEAR_Error
+     *
+     * @param string $fname the filename where the data is stored
+     *
+     * @return array the language model data
+     * @throws Text_LanguageDetect_Exception
+     * @access private
      */
     function _readdb($fname)
     {
@@ -217,79 +225,74 @@ class Text_LanguageDetect
 
         // input check
         if (!file_exists($fname)) {
-            throw new Exception('Language database does not exist.');
+            throw new Text_LanguageDetect_Exception(
+                'Language database does not exist: ' . $fname,
+                Text_LanguageDetect_Exception::DB_NOT_FOUND
+            );
         } elseif (!is_readable($fname)) {
-            throw new Exception('Language database is not readable.');
+            throw new Text_LanguageDetect_Exception(
+                'Language database is not readable: ' . $fname,
+                Text_LanguageDetect_Exception::DB_NOT_READABLE
+            );
         }
 
-        if (function_exists('file_get_contents')) {
-            return unserialize(file_get_contents($fname));
-        } else {
-            // if you don't have file_get_contents(), 
-            // then this is the next fastest way
-            ob_start();
-            readfile($fname);
-            $contents = ob_get_contents();
-            ob_end_clean();
-            return unserialize($contents);
-        }
+        return unserialize(file_get_contents($fname));
     }
 
 
     /**
      * Checks if this object is ready to detect languages
-     * 
-     * @access   private
-     * @param    mixed   &$err  error object to be returned by reference, if any
-     * @return   bool           true if no errors
+     *
+     * @param array $trigram Trigram data from database
+     *
+     * @return void
+     * @access private
      */
-    function _setup_ok(&$err)
+    function _checkTrigram($trigram)
     {
-        if (!is_array($this->_lang_db)) {
+        if (!is_array($trigram)) {
             if (ini_get('magic_quotes_runtime')) {
-                throw new Exception('Error loading database. Try turning magic_quotes_runtime off.');
-            } else {
-                throw new Exception('Language database is not an array.');
+                throw new Text_LanguageDetect_Exception(
+                    'Error loading database. Try turning magic_quotes_runtime off.',
+                    Text_LanguageDetect_Exception::MAGIC_QUOTES
+                );
             }
-            return false;
-
-        } elseif (empty($this->_lang_db)) {
-            throw new Exception('Language database has no elements.');
-            return false;
-
-        } else {
-            return true;
+            throw new Text_LanguageDetect_Exception(
+                'Language database is not an array.',
+                Text_LanguageDetect_Exception::DB_NOT_ARRAY
+            );
+        } elseif (empty($trigram)) {
+            throw new Text_LanguageDetect_Exception(
+                'Language database has no elements.',
+                Text_LanguageDetect_Exception::DB_EMPTY
+            );
         }
     }
 
     /**
      * Omits languages
      *
-     * Pass this function the name of or an array of names of 
+     * Pass this function the name of or an array of names of
      * languages that you don't want considered
      *
-     * If you're only expecting a limited set of languages, this can greatly 
+     * If you're only expecting a limited set of languages, this can greatly
      * speed up processing
      *
-     * @access   public
-     * @param    mixed  $omit_list      language name or array of names to omit
-     * @param    bool   $include_only   if true will include (rather than 
-     *                                  exclude) only those in the list
-     * @return   int                    number of languages successfully deleted
-     * @throws   PEAR_Error
+     * @param mixed $omit_list    language name or array of names to omit
+     * @param bool  $include_only if true will include (rather than
+     *                            exclude) only those in the list
+     *
+     * @return int number of languages successfully deleted
+     * @throws Text_LanguageDetect_Exception
      */
-    function omitLanguages($omit_list, $include_only = false)
+    public function omitLanguages($omit_list, $include_only = false)
     {
-
-        // setup check
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         $deleted = 0;
 
-        // deleting the given languages
+        $omit_list = $this->_convertFromNameMode($omit_list);
+
         if (!$include_only) {
+            // deleting the given languages
             if (!is_array($omit_list)) {
                 $omit_list = strtolower($omit_list); // case desensitize
                 if (isset($this->_lang_db[$omit_list])) {
@@ -301,12 +304,12 @@ class Text_LanguageDetect
                     if (isset($this->_lang_db[$omit_lang])) {
                         unset($this->_lang_db[$omit_lang]);
                         $deleted++;
-                    } 
+                    }
                 }
             }
 
-        // deleting all except the given languages
         } else {
+            // deleting all except the given languages
             if (!is_array($omit_list)) {
                 $omit_list = array($omit_list);
             }
@@ -327,7 +330,7 @@ class Text_LanguageDetect
         // reset the cluster cache if the number of languages changes
         // this will then have to be recalculated
         if (isset($this->_clusters) && $deleted > 0) {
-            unset($this->_clusters);
+            $this->_clusters = null;
         }
 
         return $deleted;
@@ -339,49 +342,40 @@ class Text_LanguageDetect
      *
      * @access public
      * @return int            the number of languages
-     * @throws PEAR_Error
+     * @throws   Text_LanguageDetect_Exception
      */
     function getLanguageCount()
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            return count($this->_lang_db);
-        }
+        return count($this->_lang_db);
     }
 
     /**
-     * Returns true if a given language exists
+     * Checks if the language with the given name exists in the database
      *
-     * If passed an array of names, will return true only if all exist
+     * @param mixed $lang Language name or array of language names
      *
-     * @access    public
-     * @param     mixed       $lang    language name or array of language names
-     * @return    bool                 true if language model exists
-     * @throws    PEAR_Error
+     * @return bool true if language model exists
      */
-    function languageExists($lang)
+    public function languageExists($lang)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            // string
-            if (is_string($lang)) {
-                return isset($this->_lang_db[strtolower($lang)]);
-
-            // array
-            } elseif (is_array($lang)) {
-                foreach ($lang as $test_lang) {
-                    if (!isset($this->_lang_db[strtolower($test_lang)])) {
-                        return false;
-                    } 
-                }
-                return true;
+        $lang = $this->_convertFromNameMode($lang);
 
-            // other (error)
-            } else {
-                throw new Exception('Unknown type passed to languageExists()');
+        if (is_string($lang)) {
+            return isset($this->_lang_db[strtolower($lang)]);
+
+        } elseif (is_array($lang)) {
+            foreach ($lang as $test_lang) {
+                if (!isset($this->_lang_db[strtolower($test_lang)])) {
+                    return false;
+                }
             }
+            return true;
+
+        } else {
+            throw new Text_LanguageDetect_Exception(
+                'Unsupported parameter type passed to languageExists()',
+                Text_LanguageDetect_Exception::PARAM_TYPE
+            );
         }
     }
 
@@ -389,25 +383,24 @@ class Text_LanguageDetect
      * Returns the list of detectable languages
      *
      * @access public
-     * @return array        the names of the languages known to this object
-     * @throws PEAR_Error
+     * @return array        the names of the languages known to this object
+     * @throws   Text_LanguageDetect_Exception
      */
     function getLanguages()
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        } else {
-            return array_keys($this->_lang_db);
-        }
+        return $this->_convertToNameMode(
+            array_keys($this->_lang_db)
+        );
     }
 
     /**
      * Make this object behave like Language::Guess
-     * 
-     * @access    public
-     * @param     bool     $setting     false to turn off perl compatibility
+     *
+     * @param bool $setting false to turn off perl compatibility
+     *
+     * @return void
      */
-    function setPerlCompatible($setting = true)
+    public function setPerlCompatible($setting = true)
     {
         if (is_bool($setting)) { // input check
             $this->_perl_compatible = $setting;
@@ -421,6 +414,21 @@ class Text_LanguageDetect
 
     }
 
+    /**
+     * Sets the way how language names are accepted and returned.
+     *
+     * @param integer $name_mode One of the following modes:
+     *                           0 - language name ("english")
+     *                           2 - 2-letter ISO 639-1 code ("en")
+     *                           3 - 3-letter ISO 639-2 code ("eng")
+     *
+     * @return void
+     */
+    function setNameMode($name_mode)
+    {
+        $this->_name_mode = $name_mode;
+    }
+
     /**
      * Whether to use unicode block ranges in detection
      *
@@ -429,10 +437,11 @@ class Text_LanguageDetect
      * in languages that use latin scripts. In other cases it should speed up
      * detection noticeably.
      *
-     * @access  public
-     * @param   bool    $setting    false to turn off
+     * @param bool $setting false to turn off
+     *
+     * @return void
      */
-    function useUnicodeBlocks($setting = true)
+    public function useUnicodeBlocks($setting = true)
     {
         if (is_bool($setting)) {
             $this->_use_unicode_narrowing = $setting;
@@ -442,15 +451,15 @@ class Text_LanguageDetect
     /**
      * Converts a piece of text into trigrams
      *
-     * Superceded by the Text_LanguageDetect_Parser class 
+     * @param string $text text to convert
      *
-     * @access    private
-     * @param     string    $text    text to convert
-     * @return    array              array of trigram frequencies
+     * @return     array array of trigram frequencies
+     * @access     private
+     * @deprecated Superseded by the Text_LanguageDetect_Parser class
      */
     function _trigram($text)
     {
-        $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename);
+        $s = new Text_LanguageDetect_Parser($text);
         $s->prepareTrigram();
         $s->prepareUnicode(false);
         $s->setPadStart(!$this->_perl_compatible);
@@ -463,11 +472,12 @@ class Text_LanguageDetect
      *
      * Thresholds (cuts off) the list at $this->_threshold
      *
-     * @access    protected
-     * @param     array     $arr     array of trgram 
-     * @return    array              ranks of trigrams
+     * @param array $arr array of trigram
+     *
+     * @return array ranks of trigrams
+     * @access protected
      */
-    function _arr_rank(&$arr)
+    function _arr_rank($arr)
     {
 
         // sorts alphabetically first as a standard way of breaking rank ties
@@ -494,14 +504,17 @@ class Text_LanguageDetect
 
     /**
      * Sorts an array by value breaking ties alphabetically
-     * 
-     * @access   private
-     * @param    array     &$arr     the array to sort
+     *
+     * @param array &$arr the array to sort
+     *
+     * @return void
+     * @access private
      */
     function _bub_sort(&$arr)
     {
         // should do the same as this perl statement:
-        // sort { $trigrams{$b} == $trigrams{$a} ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
+        // sort { $trigrams{$b} == $trigrams{$a}
+        //   ?  $a cmp $b : $trigrams{$b} <=> $trigrams{$a} }
 
         // needs to sort by both key and value at once
         // using the key to break ties for the value
@@ -528,13 +541,14 @@ class Text_LanguageDetect
     /**
      * Sort function used by bubble sort
      *
-     * Callback function for usort(). 
+     * Callback function for usort().
      *
-     * @access   private
-     * @param    array        first param passed by usort()
-     * @param    array        second param passed by usort()
-     * @return   int          1 if $a is greater, -1 if not
-     * @see      _bub_sort()
+     * @param array $a first param passed by usort()
+     * @param array $b second param passed by usort()
+     *
+     * @return int 1 if $a is greater, -1 if not
+     * @see    _bub_sort()
+     * @access private
      */
     function _sort_func($a, $b)
     {
@@ -542,12 +556,12 @@ class Text_LanguageDetect
         list($a_key, $a_value) = $a;
         list($b_key, $b_value) = $b;
 
-        // if the values are the same, break ties using the key
         if ($a_value == $b_value) {
+            // if the values are the same, break ties using the key
             return strcmp($a_key, $b_key);
 
-        // if not, just sort normally
         } else {
+            // if not, just sort normally
             if ($a_value > $b_value) {
                 return -1;
             } else {
@@ -559,23 +573,24 @@ class Text_LanguageDetect
     }
 
     /**
-     * Calculates a linear rank-order distance statistic between two sets of 
+     * Calculates a linear rank-order distance statistic between two sets of
      * ranked trigrams
      *
-     * Sums the differences in rank for each trigram. If the trigram does not 
+     * Sums the differences in rank for each trigram. If the trigram does not
      * appear in both, consider it a difference of $this->_threshold.
      *
      * This distance measure was proposed by Cavnar & Trenkle (1994). Despite
      * its simplicity it has been shown to be highly accurate for language
      * identification tasks.
      *
-     * @access  private
-     * @param   array    $arr1  the reference set of trigram ranks
-     * @param   array    $arr2  the target set of trigram ranks
-     * @return  int             the sum of the differences between the ranks of
-     *                          the two trigram sets
+     * @param array $arr1 the reference set of trigram ranks
+     * @param array $arr2 the target set of trigram ranks
+     *
+     * @return int the sum of the differences between the ranks of
+     *             the two trigram sets
+     * @access private
      */
-    function _distance(&$arr1, &$arr2)
+    function _distance($arr1, $arr2)
     {
         $sumdist = 0;
 
@@ -598,14 +613,15 @@ class Text_LanguageDetect
 
     /**
      * Normalizes the score returned by _distance()
-     * 
+     *
      * Different if perl compatible or not
      *
-     * @access    private
-     * @param     int    $score          the score from _distance()
-     * @param     int    $base_count     the number of trigrams being considered
-     * @return    float                  the normalized score
-     * @see       _distance()
+     * @param int $score      the score from _distance()
+     * @param int $base_count the number of trigrams being considered
+     *
+     * @return float the normalized score
+     * @see    _distance()
+     * @access private
      */
     function _normalize_score($score, $base_count = null)
     {
@@ -630,29 +646,24 @@ class Text_LanguageDetect
      *
      * If perl compatible, the score is 300-0, 0 being most similar.
      * Otherwise, it's 0-1 with 1 being most similar.
-     * 
+     *
      * The $sample text should be at least a few sentences in length;
      * should be ascii-7 or utf8 encoded, if another and the mbstring extension
      * is present it will try to detect and convert. However, experience has
-     * shown that mb_detect_encoding() *does not work very well* with at least 
+     * shown that mb_detect_encoding() *does not work very well* with at least
      * some types of encoding.
      *
-     * @access  public
-     * @param   string  $sample a sample of text to compare.
-     * @param   int     $limit  if specified, return an array of the most likely
-     *                           $limit languages and their scores.
-     * @return  mixed       sorted array of language scores, blank array if no 
-     *                      useable text was found, or PEAR_Error if error 
-     *                      with the object setup
-     * @see     _distance()
-     * @throws  PEAR_Error
+     * @param string $sample a sample of text to compare.
+     * @param int    $limit  if specified, return an array of the most likely
+     *                       $limit languages and their scores.
+     *
+     * @return mixed sorted array of language scores, blank array if no
+     *               useable text was found
+     * @see    _distance()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detect($sample, $limit = 0)
+    public function detect($sample, $limit = 0)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         // input check
         if (!Text_LanguageDetect_Parser::validateString($sample)) {
             return array();
@@ -660,36 +671,27 @@ class Text_LanguageDetect
 
         // check char encoding
         // (only if mbstring extension is compiled and PHP > 4.0.6)
-        if (function_exists('mb_detect_encoding') 
-            && function_exists('mb_convert_encoding')) {
-
+        if (function_exists('mb_detect_encoding')
+            && function_exists('mb_convert_encoding')
+        ) {
             // mb_detect_encoding isn't very reliable, to say the least
-            // detection should still work with a sufficient sample of ascii characters
+            // detection should still work with a sufficient sample
+            //  of ascii characters
             $encoding = mb_detect_encoding($sample);
 
             // mb_detect_encoding() will return FALSE if detection fails
             // don't attempt conversion if that's the case
-            if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) {
-            
-                if (function_exists('mb_list_encodings')) {
- 
-                    // verify the encoding exists in mb_list_encodings
-                    if (in_array($encoding, mb_list_encodings())) {
-                        $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
-                    }
-
-                    // if the previous condition failed:
-                    // somehow we detected an encoding that also we don't support
-
-                } else {
-                    // php 4 doesnt have mb_list_encodings()
-                    // so attempt with error suppression
-                    $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding);
+            if ($encoding != 'ASCII' && $encoding != 'UTF-8'
+                && $encoding !== false
+            ) {
+                // verify the encoding exists in mb_list_encodings
+                if (in_array($encoding, mb_list_encodings())) {
+                    $sample = mb_convert_encoding($sample, 'UTF-8', $encoding);
                 }
             }
         }
 
-        $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($sample);
         $sample_obj->prepareTrigram();
         if ($this->_use_unicode_narrowing) {
             $sample_obj->prepareUnicode();
@@ -713,7 +715,10 @@ class Text_LanguageDetect
             if (is_array($blocks)) {
                 $present_blocks = array_keys($blocks);
             } else {
-                throw new Exception('Error during block detection');
+                throw new Text_LanguageDetect_Exception(
+                    'Error during block detection',
+                    Text_LanguageDetect_Exception::BLOCK_DETECTION
+                );
             }
 
             $possible_langs = array();
@@ -731,30 +736,30 @@ class Text_LanguageDetect
             }
 
             // could also try an intersect operation rather than a union
-            // in other words, choose languages whose trigrams contain 
+            // in other words, choose languages whose trigrams contain
             // ALL of the unicode blocks found in this sample
             // would improve speed but would be completely thrown off by an
             // unexpected character, like an umlaut appearing in english text
 
             $possible_langs = array_intersect(
-                        array_keys($this->_lang_db),
-                        array_unique($possible_langs)
+                array_keys($this->_lang_db),
+                array_unique($possible_langs)
             );
 
-            // needs to intersect it with the keys of _lang_db in case 
+            // needs to intersect it with the keys of _lang_db in case
             // languages have been omitted
 
-        // or just try 'em all
         } else {
+            // or just try 'em all
             $possible_langs = array_keys($this->_lang_db);
         }
 
 
         foreach ($possible_langs as $lang) {
-            $scores[$lang] =
-                $this->_normalize_score(
-                        $this->_distance($this->_lang_db[$lang], $trigram_freqs),
-                        $trigram_count);
+            $scores[$lang] = $this->_normalize_score(
+                $this->_distance($this->_lang_db[$lang], $trigram_freqs),
+                $trigram_count
+            );
         }
 
         unset($sample_obj);
@@ -772,7 +777,6 @@ class Text_LanguageDetect
             $limited_scores = array();
 
             $i = 0;
-
             foreach ($scores as $key => $value) {
                 if ($i++ >= $limit) {
                     break;
@@ -781,9 +785,9 @@ class Text_LanguageDetect
                 $limited_scores[$key] = $value;
             }
 
-            return $limited_scores;
+            return $this->_convertToNameMode($limited_scores, true);
         } else {
-            return $scores;
+            return $this->_convertToNameMode($scores, true);
         }
     }
 
@@ -791,35 +795,33 @@ class Text_LanguageDetect
      * Returns only the most similar language to the text sample
      *
      * Calls $this->detect() and returns only the top result
-     * 
-     * @access   public
-     * @param    string    $sample    text to detect the language of
-     * @return   string               the name of the most likely language
-     *                                or null if no language is similar
-     * @see      detect()
-     * @throws   PEAR_Error
+     *
+     * @param string $sample text to detect the language of
+     *
+     * @return string the name of the most likely language
+     *                or null if no language is similar
+     * @see    detect()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectSimple($sample)
+    public function detectSimple($sample)
     {
         $scores = $this->detect($sample, 1);
 
         // if top language has the maximum possible score,
         // then the top score will have been picked at random
-        if (    !is_array($scores) 
-                || empty($scores) 
-                || current($scores) == $this->_max_score) {
-
+        if (!is_array($scores) || empty($scores)
+            || current($scores) == $this->_max_score
+        ) {
             return null;
-
         } else {
-            return ucfirst(key($scores));
+            return key($scores);
         }
     }
 
     /**
      * Returns an array containing the most similar language and a confidence
      * rating
-     * 
+     *
      * Confidence is a simple measure calculated from the similarity score
      * minus the similarity score from the next most similar language
      * divided by the highest possible score. Languages that have closely
@@ -827,46 +829,43 @@ class Text_LanguageDetect
      * confidence scores.
      *
      * The similarity score answers the question "How likely is the text the
-     * returned language regardless of the other languages considered?" The 
+     * returned language regardless of the other languages considered?" The
      * confidence score is one way of answering the question "how likely is the
      * text the detected language relative to the rest of the language model
      * set?"
      *
      * To see how similar languages are a priori, see languageSimilarity()
-     * 
-     * @access   public
-     * @param    string    $sample    text for which language will be detected
-     * @return   array     most similar language, score and confidence rating
-     *                     or null if no language is similar
-     * @see      detect()
-     * @throws   PEAR_Error
+     *
+     * @param string $sample text for which language will be detected
+     *
+     * @return array most similar language, score and confidence rating
+     *               or null if no language is similar
+     * @see    detect()
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectConfidence($sample)
+    public function detectConfidence($sample)
     {
         $scores = $this->detect($sample, 2);
 
-        // if most similar language has the max score, it 
+        // if most similar language has the max score, it
         // will have been picked at random
-        if (    !is_array($scores) 
-                || empty($scores) 
-                || current($scores) == $this->_max_score) {
-
+        if (!is_array($scores) || empty($scores)
+            || current($scores) == $this->_max_score
+        ) {
             return null;
         }
 
-        $arr['language'] = ucfirst(key($scores));
+        $arr['language'] = key($scores);
         $arr['similarity'] = current($scores);
         if (next($scores) !== false) { // if false then no next element
             // the goal is to return a higher value if the distance between
             // the similarity of the first score and the second score is high
 
             if ($this->_perl_compatible) {
-
-                $arr['confidence'] =
-                    (current($scores) - $arr['similarity']) / $this->_max_score;
+                $arr['confidence'] = (current($scores) - $arr['similarity'])
+                    / $this->_max_score;
 
             } else {
-
                 $arr['confidence'] = $arr['similarity'] - current($scores);
 
             }
@@ -882,32 +881,26 @@ class Text_LanguageDetect
      * Returns the distribution of unicode blocks in a given utf8 string
      *
      * For the block name of a single char, use unicodeBlockName()
-     * 
-     * @access public
-     * @param string $str input string. Must be ascii or utf8
-     * @param bool $skip_symbols if true, skip ascii digits, symbols and 
-     *                           non-printing characters. Includes spaces,
-     *                           newlines and common punctutation characters.
+     *
+     * @param string $str          input string. Must be ascii or utf8
+     * @param bool   $skip_symbols if true, skip ascii digits, symbols and
+     *                             non-printing characters. Includes spaces,
+     *                             newlines and common punctuation characters.
+     *
      * @return array
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
      */
-    function detectUnicodeBlocks($str, $skip_symbols)
+    public function detectUnicodeBlocks($str, $skip_symbols)
     {
-        // input check
-        if (!is_bool($skip_symbols)) {
-            throw new Exception('Second parameter must be boolean');
-        } 
-
-        if (!is_string($str)) {
-            throw new Exception('First parameter was not a string');
-        }
+        $skip_symbols = (bool)$skip_symbols;
+        $str          = (string)$str;
 
-        $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($str);
         $sample_obj->prepareUnicode();
         $sample_obj->prepareTrigram(false);
         $sample_obj->setUnicodeSkipSymbols($skip_symbols);
         $sample_obj->analyze();
-        $blocks =& $sample_obj->getUnicodeBlocks();
+        $blocks = $sample_obj->getUnicodeBlocks();
         unset($sample_obj);
         return $blocks;
     }
@@ -915,38 +908,37 @@ class Text_LanguageDetect
     /**
      * Returns the block name for a given unicode value
      *
-     * If passed a string, will assume it is being passed a UTF8-formatted 
+     * If passed a string, will assume it is being passed a UTF8-formatted
      * character and will automatically convert. Otherwise it will assume it
      * is being passed a numeric unicode value.
      *
      * Make sure input is of the correct type!
      *
-     * @access public
      * @param mixed $unicode unicode value or utf8 char
+     *
      * @return mixed the block name string or false if not found
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
      */
-    function unicodeBlockName($unicode) {
+    public function unicodeBlockName($unicode)
+    {
         if (is_string($unicode)) {
             // assume it is being passed a utf8 char, so convert it
-
-            // input check
-            if ($this->utf8strlen($unicode) > 1) {
-                throw new Exception('Pass this function only a single char');
+            if (self::utf8strlen($unicode) > 1) {
+                throw new Text_LanguageDetect_Exception(
+                    'Pass a single char only to this method',
+                    Text_LanguageDetect_Exception::PARAM_TYPE
+                );
             }
-
             $unicode = $this->_utf8char2unicode($unicode);
 
-            if ($unicode == -1) {
-                throw new Exception('Malformatted char');
-            }
-
-        // input check
         } elseif (!is_int($unicode)) {
-            throw new Exception('Input must be of type string or int.');
+            throw new Text_LanguageDetect_Exception(
+                'Input must be of type string or int.',
+                Text_LanguageDetect_Exception::PARAM_TYPE
+            );
         }
 
-        $blocks =& $this->_read_unicode_block_db();
+        $blocks = $this->_read_unicode_block_db();
 
         $result = $this->_unicode_block_name($unicode, $blocks);
 
@@ -964,14 +956,17 @@ class Text_LanguageDetect
      * the public interface for this function, which does input checks which
      * this function omits for speed.
      *
-     * @access  protected
-     * @param   int     $unicode the unicode value
-     * @param   array   &$blocks the block database
-     * @param   int     $block_count the number of defined blocks in the database
-     * @see     unicodeBlockName()
+     * @param int   $unicode     the unicode value
+     * @param array $blocks      the block database
+     * @param int   $block_count the number of defined blocks in the database
+     *
+     * @return mixed Matching entry of $blocks, -1 if no block was found
+     * @see    unicodeBlockName()
+     * @access protected
      */
-    function _unicode_block_name($unicode, &$blocks, $block_count = -1) {
-        // for a reference, see 
+    function _unicode_block_name($unicode, $blocks, $block_count = -1)
+    {
+        // for a reference, see
         // http://www.unicode.org/Public/UNIDATA/Blocks.txt
 
         // assume that ascii characters are the most common
@@ -994,35 +989,36 @@ class Text_LanguageDetect
         while ($low <= $high) {
             $mid = floor(($low + $high) / 2);
 
-            // if it's lower than the lower bound
             if ($unicode < $blocks[$mid][0]) {
+                // if it's lower than the lower bound
                 $high = $mid - 1;
 
-            // if it's higher than the upper bound
             } elseif ($unicode > $blocks[$mid][1]) {
+                // if it's higher than the upper bound
                 $low = $mid + 1;
 
-            // found it
             } else {
+                // found it
                 return $blocks[$mid];
             }
         }
 
-        // failed to find the block 
+        // failed to find the block
         return -1;
 
-        // todo: differentiate when it's out of range or when it falls 
+        // todo: differentiate when it's out of range or when it falls
         //       into an unassigned range?
     }
 
     /**
      * Brings up the unicode block database
      *
-     * @access protected
      * @return array the database of unicode block definitions
-     * @throws PEAR_Error
+     * @throws Text_LanguageDetect_Exception
+     * @access protected
      */
-    function &_read_unicode_block_db() {
+    function _read_unicode_block_db()
+    {
         // since the unicode definitions are always going to be the same,
         // might as well share the memory for the db with all other instances
         // of this class
@@ -1037,29 +1033,27 @@ class Text_LanguageDetect
 
     /**
      * Calculate the similarities between the language models
-     * 
+     *
      * Use this function to see how similar languages are to each other.
      *
      * If passed 2 language names, will return just those languages compared.
      * If passed 1 language name, will return that language compared to
      * all others.
-     * If passed none, will return an array of every language model compared 
+     * If passed none, will return an array of every language model compared
      * to every other one.
      *
-     * @access  public
-     * @param   string   $lang1   the name of the first language to be compared
-     * @param   string   $lang2   the name of the second language to be compared
-     * @return  array    scores of every language compared
-     *                   or the score of just the provided languages
-     *                   or null if one of the supplied languages does not exist
-     * @throws  PEAR_Error
+     * @param string $lang1 the name of the first language to be compared
+     * @param string $lang2 the name of the second language to be compared
+     *
+     * @return array scores of every language compared
+     *               or the score of just the provided languages
+     *               or null if one of the supplied languages does not exist
+     * @throws Text_LanguageDetect_Exception
      */
-    function languageSimilarity($lang1 = null, $lang2 = null)
+    public function languageSimilarity($lang1 = null, $lang2 = null)
     {
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
+        $lang1 = $this->_convertFromNameMode($lang1);
+        $lang2 = $this->_convertFromNameMode($lang2);
         if ($lang1 != null) {
             $lang1 = strtolower($lang1);
 
@@ -1069,12 +1063,8 @@ class Text_LanguageDetect
             }
 
             if ($lang2 != null) {
-
-                // can't only set the second param
-                if ($lang1 == null) {
-                    return null;
-                // check if language model exists
-                } elseif (!isset($this->_lang_db[$lang2])) {
+                if (!isset($this->_lang_db[$lang2])) {
+                    // check if language model exists
                     return null;
                 }
 
@@ -1088,14 +1078,15 @@ class Text_LanguageDetect
                     )
                 );
 
-
-            // compare just $lang1 to all languages
             } else {
+                // compare just $lang1 to all languages
                 $return_arr = array();
                 foreach ($this->_lang_db as $key => $value) {
-                    if ($key != $lang1) { // don't compare a language to itself
+                    if ($key != $lang1) {
+                        // don't compare a language to itself
                         $return_arr[$key] = $this->_normalize_score(
-                            $this->_distance($this->_lang_db[$lang1], $value));
+                            $this->_distance($this->_lang_db[$lang1], $value)
+                        );
                     }
                 }
                 asort($return_arr);
@@ -1104,30 +1095,27 @@ class Text_LanguageDetect
             }
 
 
-        // compare all languages to each other
         } else {
+            // compare all languages to each other
             $return_arr = array();
             foreach (array_keys($this->_lang_db) as $lang1) {
                 foreach (array_keys($this->_lang_db) as $lang2) {
-
                     // skip comparing languages to themselves
-                    if ($lang1 != $lang2) { 
-                    
-                        // don't re-calculate what's already been done
-                        if (isset($return_arr[$lang2][$lang1])) {
+                    if ($lang1 != $lang2) {
 
-                            $return_arr[$lang1][$lang2] =
-                                $return_arr[$lang2][$lang1];
+                        if (isset($return_arr[$lang2][$lang1])) {
+                            // don't re-calculate what's already been done
+                            $return_arr[$lang1][$lang2]
+                                = $return_arr[$lang2][$lang1];
 
-                        // calculate
                         } else {
-
-                            $return_arr[$lang1][$lang2] = 
-                                $this->_normalize_score(
-                                        $this->_distance(
-                                            $this->_lang_db[$lang1],
-                                            $this->_lang_db[$lang2]
-                                        )
+                            // calculate
+                            $return_arr[$lang1][$lang2]
+                                = $this->_normalize_score(
+                                    $this->_distance(
+                                        $this->_lang_db[$lang1],
+                                        $this->_lang_db[$lang2]
+                                    )
                                 );
 
                         }
@@ -1150,20 +1138,14 @@ class Text_LanguageDetect
      *
      * @access      public
      * @return      array language cluster data
-     * @throws      PEAR_Error
+     * @throws      Text_LanguageDetect_Exception
      * @see         languageSimilarity()
-     * @deprecated  this function will eventually be removed and placed into 
+     * @deprecated  this function will eventually be removed and placed into
      *              the model generation class
      */
     function clusterLanguages()
     {
         // todo: set the maximum number of clusters
-
-        // setup check
-        if (!$this->_setup_ok($err)) {
-            return $err;
-        }
-
         // return cached result, if any
         if (isset($this->_clusters)) {
             return $this->_clusters;
@@ -1177,7 +1159,10 @@ class Text_LanguageDetect
 
         foreach ($langs as $lang) {
             if (!isset($this->_lang_db[$lang])) {
-                throw new Exception("missing $lang!\n");
+                throw new Text_LanguageDetect_Exception(
+                    "missing $lang!",
+                    Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE
+                );
             }
         }
 
@@ -1186,7 +1171,9 @@ class Text_LanguageDetect
             $langs[$lang1] = $lang1;
             unset($langs[$old_key]);
         }
-        
+
+        $result_data = $really_map = array();
+
         $i = 0;
         while (count($langs) > 2 && $i++ < 200) {
             $highest_score = -1;
@@ -1194,18 +1181,22 @@ class Text_LanguageDetect
             $highest_key2 = '';
             foreach ($langs as $lang1) {
                 foreach ($langs as $lang2) {
-                    if (    $lang1 != $lang2 
-                            && $arr[$lang1][$lang2] > $highest_score) {
+                    if ($lang1 != $lang2
+                        && $arr[$lang1][$lang2] > $highest_score
+                    ) {
                         $highest_score = $arr[$lang1][$lang2];
                         $highest_key1 = $lang1;
                         $highest_key2 = $lang2;
                     }
                 }
             }
-            
+
             if (!$highest_key1) {
                 // should not ever happen
-                throw new Exception("no highest key? (step: $i)");
+                throw new Text_LanguageDetect_Exception(
+                    "no highest key? (step: $i)",
+                    Text_LanguageDetect_Exception::NO_HIGHEST_KEY
+                );
             }
 
             if ($highest_score == 0) {
@@ -1217,7 +1208,7 @@ class Text_LanguageDetect
             $sum1 = array_sum($arr[$highest_key1]);
             $sum2 = array_sum($arr[$highest_key2]);
 
-            // use the score for the one that is most similar to the rest of 
+            // use the score for the one that is most similar to the rest of
             // the field as the score for the group
             // todo: could try averaging or "centroid" method instead
             // seems like that might make more sense
@@ -1248,7 +1239,7 @@ class Text_LanguageDetect
             $really_lang = $replaceme;
             while (isset($really_map[$really_lang])) {
                 $really_lang = $really_map[$really_lang];
-            } 
+            }
             $really_map[$newkey] = $really_lang;
 
 
@@ -1259,8 +1250,8 @@ class Text_LanguageDetect
                         $arr[$key1][$newkey] = $arr[$key1][$key2];
                         unset($arr[$key1][$key2]);
                         // replacing $arr[$key1][$key2] with $arr[$key1][$newkey]
-                    } 
-                    
+                    }
+
                     if ($key1 == $replaceme) {
                         $arr[$newkey][$key2] = $arr[$key1][$key2];
                         unset($arr[$key1][$key2]);
@@ -1273,7 +1264,7 @@ class Text_LanguageDetect
                     }
                 }
             }
-                        
+
 
             unset($langs[$highest_key1]);
             unset($langs[$highest_key2]);
@@ -1293,7 +1284,7 @@ class Text_LanguageDetect
         }
 
         $return_val = array(
-                'open_forks' => $langs, 
+                'open_forks' => $langs,
                         // the top level of clusters
                         // clusters that are mutually exclusive
                         // or specified by a specific maximum
@@ -1323,11 +1314,11 @@ class Text_LanguageDetect
      * use, and it may disappear or its functionality may change in future
      * releases without notice.
      *
-     * This compares the sample text to top the top level of clusters. If the 
+     * This compares the sample text to the top level of clusters. If the
      * sample is similar to the cluster it will drop down and compare it to the
      * languages in the cluster, and so on until it hits a leaf node.
      *
-     * this should find the language in considerably fewer compares 
+     * this should find the language in considerably fewer compares
      * (the equivalent of a binary search), however clusterLanguages() is costly
      * and the loss of accuracy from this technique is significant.
      *
@@ -1337,15 +1328,14 @@ class Text_LanguageDetect
      * was very large, however in such cases some method of Bayesian inference
      * might be more helpful.
      *
-     * @see     clusterLanguages()
-     * @access  public
-     * @param   string $str input string
-     * @return  array language scores (only those compared)
-     * @throws  PEAR_Error
+     * @param string $str input string
+     *
+     * @return array language scores (only those compared)
+     * @throws Text_LanguageDetect_Exception
+     * @see    clusterLanguages()
      */
-    function clusteredSearch($str)
+    public function clusteredSearch($str)
     {
-
         // input check
         if (!Text_LanguageDetect_Parser::validateString($str)) {
             return array();
@@ -1359,7 +1349,7 @@ class Text_LanguageDetect
         $dendogram_data  = $result['fork_data'];
         $dendogram_alias = $result['name_map'];
 
-        $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename);
+        $sample_obj = new Text_LanguageDetect_Parser($str);
         $sample_obj->prepareTrigram();
         $sample_obj->setPadStart(!$this->_perl_compatible);
         $sample_obj->analyze();
@@ -1372,7 +1362,7 @@ class Text_LanguageDetect
         }
 
         $i = 0; // counts the number of steps
-        
+
         foreach ($dendogram_start as $lang) {
             if (isset($dendogram_alias[$lang])) {
                 $lang_key = $dendogram_alias[$lang];
@@ -1382,7 +1372,8 @@ class Text_LanguageDetect
 
             $scores[$lang] = $this->_normalize_score(
                 $this->_distance($this->_lang_db[$lang_key], $sample_result),
-                $sample_count);
+                $sample_count
+            );
 
             $i++;
         }
@@ -1411,7 +1402,8 @@ class Text_LanguageDetect
 
                 $scores[$lang] = $this->_normalize_score(
                     $this->_distance($this->_lang_db[$lang_key], $sample_result),
-                    $sample_count);
+                    $sample_count
+                );
 
                 //todo: does not need to do same comparison again
             }
@@ -1428,8 +1420,8 @@ class Text_LanguageDetect
 
             $diff = $scores[$cur_key] - $scores[$loser_key];
 
-            // $cur_key ({$dendogram_alias[$cur_key]}) wins 
-            // over $loser_key ({$dendogram_alias[$loser_key]}) 
+            // $cur_key ({$dendogram_alias[$cur_key]}) wins
+            // over $loser_key ({$dendogram_alias[$loser_key]})
             // with a difference of $diff
         }
 
@@ -1439,9 +1431,9 @@ class Text_LanguageDetect
         // which paths the algorithm decided to take along the tree
 
         // but sometimes the last item is only the second highest
-        if (   ($this->_perl_compatible  && (end($scores) > prev($scores)))
-            || (!$this->_perl_compatible && (end($scores) < prev($scores)))) {
-
+        if (($this->_perl_compatible  && (end($scores) > prev($scores)))
+            || (!$this->_perl_compatible && (end($scores) < prev($scores)))
+        ) {
             $real_last_score = current($scores);
             $real_last_key = key($scores);
 
@@ -1449,7 +1441,7 @@ class Text_LanguageDetect
             unset($scores[$real_last_key]);
             $scores[$real_last_key] = $real_last_score;
         }
-            
+
 
         if (!$this->_perl_compatible) {
             $scores = array_reverse($scores, true);
@@ -1464,12 +1456,11 @@ class Text_LanguageDetect
      *
      * Returns the numbers of characters (not bytes) in a utf8 string
      *
-     * @static
-     * @access  public
-     * @param   string $str string to get the length of
-     * @return  int         number of chars
+     * @param string $str string to get the length of
+     *
+     * @return int number of chars
      */
-    function utf8strlen($str)
+    public static function utf8strlen($str)
     {
         // utf8_decode() will convert unknown chars to '?', which is actually
         // ideal for counting.
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect
     /**
      * Returns the unicode value of a utf8 char
      *
-     * @access  protected
-     * @param   string $char a utf8 (possibly multi-byte) char
-     * @return  int          unicode value or -1 if malformatted
+     * @param string $char a utf8 (possibly multi-byte) char
+     *
+     * @return int|null unicode value, null if $char is not 1-4 bytes long
+     * @access protected
+     * @link   http://en.wikipedia.org/wiki/UTF-8
      */
-    function _utf8char2unicode($char) {
-
+    function _utf8char2unicode($char)
+    {
         // strlen() here will actually get the binary length of a single char
         switch (strlen($char)) {
-
-            // for a reference, see http://en.wikipedia.org/wiki/UTF-8
-
-            case 1:
-                // normal ASCII-7 byte
-                // 0xxxxxxx -->  0xxxxxxx
-                return ord($char{0});
-
-            case 2:
-                // 2 byte unicode
-                // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
-                $z = (ord($char{0}) & 0x000001F) << 6;
-                $x = (ord($char{1}) & 0x0000003F);
-
-                return ($z | $x);
-
-            case 3:
-                // 3 byte unicode
-                // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx 
-                $z =  (ord($char{0}) & 0x0000000F) << 12;
-                $x1 = (ord($char{1}) & 0x0000003F) << 6;
-                $x2 = (ord($char{2}) & 0x0000003F);
-
-                return ($z | $x1 | $x2);
-
-            case 4:
-                // 4 byte unicode
-                // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
-                // 000zzzzz xxxxxxxx xxxxxxxx
-                $z1 = (ord($char{0}) & 0x00000007) << 18;
-                $z2 = (ord($char{1}) & 0x0000003F) << 12;
-                $x1 = (ord($char{2}) & 0x0000003F) << 6;
-                $x2 = (ord($char{3}) & 0x0000003F);
-
-                return ($z1 | $z2 | $x1 | $x2);
-
-            default:
-                // error: malformatted char?
-                return -1;
+        case 1:
+            // normal ASCII-7 byte
+            // 0xxxxxxx -->  0xxxxxxx
+            return ord($char[0]);
+
+        case 2:
+            // 2 byte unicode
+            // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx
+            $z = (ord($char[0]) & 0x000001F) << 6;
+            $x = (ord($char[1]) & 0x0000003F);
+            return ($z | $x);
+
+        case 3:
+            // 3 byte unicode
+            // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx
+            $z =  (ord($char[0]) & 0x0000000F) << 12;
+            $x1 = (ord($char[1]) & 0x0000003F) << 6;
+            $x2 = (ord($char[2]) & 0x0000003F);
+            return ($z | $x1 | $x2);
+
+        case 4:
+            // 4 byte unicode
+            // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx -->
+            // 000zzzzz xxxxxxxx xxxxxxxx
+            $z1 = (ord($char[0]) & 0x00000007) << 18;
+            $z2 = (ord($char[1]) & 0x0000003F) << 12;
+            $x1 = (ord($char[2]) & 0x0000003F) << 6;
+            $x2 = (ord($char[3]) & 0x0000003F);
+            return ($z1 | $z2 | $x1 | $x2);
         }
     }
 
@@ -1536,18 +1519,18 @@ class Text_LanguageDetect
      * utf8-safe fast character iterator
      *
      * Will get the next character starting from $counter, which will then be
-     * incremented. If a multi-byte char the bytes will be concatenated and 
+     * incremented. If a multi-byte char the bytes will be concatenated and
      * $counter will be incremeted by the number of bytes in the char.
      *
-     * @access  private
-     * @param   string  &$str        the string being iterated over
-     * @param   int     &$counter    the iterator, will increment by reference
-     * @param   bool    $special_convert whether to do special conversions
-     * @return  char    the next (possibly multi-byte) char from $counter
+     * @param string $str             the string being iterated over
+     * @param int    &$counter        the iterator, will increment by reference
+     * @param bool   $special_convert whether to do special conversions
+     *
+     * @return char the next (possibly multi-byte) char from $counter
+     * @access private
      */
-    function _next_char(&$str, &$counter, $special_convert = false)
+    static function _next_char($str, &$counter, $special_convert = false)
     {
-
         $char = $str{$counter++};
         $ord = ord($char);
 
@@ -1556,7 +1539,6 @@ class Text_LanguageDetect
 
         // normal ascii one byte char
         if ($ord <= 127) {
-
             // special conversions needed for this package
             // (that only apply to regular ascii characters)
             // lower case, and convert all non-alphanumeric characters
@@ -1571,8 +1553,8 @@ class Text_LanguageDetect
 
             return $char;
 
-        // multi-byte chars
         } elseif ($ord >> 5 == 6) { // two-byte char
+            // multi-byte chars
             $nextchar = $str{$counter++}; // get next byte
 
             // lower-casing of non-ascii characters is still incomplete
@@ -1582,27 +1564,27 @@ class Text_LanguageDetect
                 if ($ord == 195) {
                     $nextord = ord($nextchar);
                     $nextord_adj = $nextord + 64;
-                    // for a reference, see 
+                    // for a reference, see
                     // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html
 
                     // &Agrave; - &THORN; but not &times;
-                    if (    $nextord_adj >= 192
-                            && $nextord_adj <= 222 
-                            && $nextord_adj != 215) {
-
-                        $nextchar = chr($nextord + 32); 
+                    if ($nextord_adj >= 192
+                        && $nextord_adj <= 222
+                        && $nextord_adj != 215
+                    ) {
+                        $nextchar = chr($nextord + 32);
                     }
 
-                // lower case cyrillic alphabet
                 } elseif ($ord == 208) {
+                    // lower case cyrillic alphabet
                     $nextord = ord($nextchar);
                     // if A - Pe
                     if ($nextord >= 144 && $nextord <= 159) {
                         // lower case
                         $nextchar = chr($nextord + 32);
 
-                    // if Er - Ya
                     } elseif ($nextord >= 160 && $nextord <= 175) {
+                        // if Er - Ya
                         // lower case
                         $char = chr(209); // == $ord++
                         $nextchar = chr($nextord - 32);
@@ -1611,12 +1593,11 @@ class Text_LanguageDetect
             }
 
             // tag on next byte
-            return $char . $nextchar; 
-
+            return $char . $nextchar;
         } elseif ($ord >> 4  == 14) { // three-byte char
-            
+
             // tag on next 2 bytes
-            return $char . $str{$counter++} . $str{$counter++}; 
+            return $char . $str{$counter++} . $str{$counter++};
 
         } elseif ($ord >> 3 == 30) { // four-byte char
 
@@ -1628,8 +1609,85 @@ class Text_LanguageDetect
         }
     }
 
-}
+    /**
+     * Converts a $lang input parameter from the configured name mode
+     * (0 = as is, 2 = code2ToName(), else code3ToName()) to the internal name.
+     *
+     * Works for strings and arrays.
+     *
+     * @param string|array $lang       A language description ("english"/"en"/"eng")
+     * @param boolean      $convertKey If $lang is an array, true converts the
+     *                                 keys rather than the values.
+     *
+     * @return string|array Language name(s); lookup results are cast to string
+     */
+    function _convertFromNameMode($lang, $convertKey = false)
+    {
+        if ($this->_name_mode == 0) {
+            return $lang;
+        }
+
+        if ($this->_name_mode == 2) {
+            $method = 'code2ToName';
+        } else {
+            $method = 'code3ToName';
+        }
+
+        if (is_string($lang)) {
+            return (string)Text_LanguageDetect_ISO639::$method($lang);
+        }
+
+        $newlang = array();
+        foreach ($lang as $key => $val) {
+            if ($convertKey) {
+                $newkey = (string)Text_LanguageDetect_ISO639::$method($key);
+                $newlang[$newkey] = $val;
+            } else {
+                $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val);
+            }
+        }
+        return $newlang;
+    }
 
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+    /**
+     * Converts a $lang output parameter from the language name that is
+     * used internally to the configured mode.
+     *
+     * Works for strings and arrays.
+     *
+     * @param string|array $lang       Language name(s) as used internally
+     * @param boolean      $convertKey If $lang is an array, true converts the
+     *                                 keys rather than the values.
+     *
+     * @return string|array $lang as is in mode 0, else nameToCode2/3() results
+     */
+    function _convertToNameMode($lang, $convertKey = false)
+    {
+        if ($this->_name_mode == 0) {
+            return $lang;
+        }
+
+        if ($this->_name_mode == 2) {
+            $method = 'nameToCode2';
+        } else {
+            $method = 'nameToCode3';
+        }
+
+        if (is_string($lang)) {
+            return Text_LanguageDetect_ISO639::$method($lang);
+        }
+
+        $newlang = array();
+        foreach ($lang as $key => $val) {
+            if ($convertKey) {
+                $newkey = Text_LanguageDetect_ISO639::$method($key);
+                $newlang[$newkey] = $val;
+            } else {
+                $newlang[$key] = Text_LanguageDetect_ISO639::$method($val);
+            }
+        }
+        return $newlang;
+    }
+}
 
-?>
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index 2e8991cc..d0f09d74 100644
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -1,1138 +1,1138 @@
-<?php
-/** 
-* Arc90's Readability ported to PHP for FiveFilters.org
-* Based on readability.js version 1.7.1 (without multi-page support)
-* Updated to allow HTML5 parsing with html5lib
-* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
-* ------------------------------------------------------
-* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
-* Arc90's project URL: http://lab.arc90.com/experiments/readability/
-* JS Source: http://code.google.com/p/arc90labs-readability
-* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
-* More information: http://fivefilters.org/content-only/
-* License: Apache License, Version 2.0
-* Requires: PHP5
-* Date: 2012-09-19
-* 
-* Differences between the PHP port and the original
-* ------------------------------------------------------
-* Arc90's Readability is designed to run in the browser. It works on the DOM 
-* tree (the parsed HTML) after the page's CSS styles have been applied and 
-* Javascript code executed. This PHP port does not run inside a browser. 
-* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 
-* rely on CSS or Javascript support. As such, the results will not always 
-* match Arc90's Readability. (For example, if a web page contains CSS style 
-* rules or Javascript code which hide certain HTML elements from display, 
-* Arc90's Readability will dismiss those from consideration but our PHP port, 
-* unable to understand CSS or Javascript, will not know any better.)
-* 
-* Another significant difference is that the aim of Arc90's Readability is 
-* to re-present the main content block of a given web page so users can 
-* read it more easily in their browsers. Correct identification, clean up, 
-* and separation of the content block is only a part of this process. 
-* This PHP port is only concerned with this part, it does not include code 
-* that relates to presentation in the browser - Arc90 already do 
-* that extremely well, and for PDF output there's FiveFilters.org's 
-* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
-* 
-* Finally, this class contains methods that might be useful for developers 
-* working on HTML document fragments. So without deviating too much from 
-* the original code (which I don't want to do because it makes debugging 
-* and updating more difficult), I've tried to make it a little more 
-* developer friendly. You should be able to use the methods here on 
-* existing DOMElement objects without passing an entire HTML document to 
-* be parsed.
-*/
-
-// This class allows us to do JavaScript like assignements to innerHTML
-require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
-
-// Alternative usage (for testing only!)
-// uncomment the lines below and call Readability.php in your browser 
-// passing it the URL of the page you'd like content from, e.g.:
-// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
-
-/*
-if (!isset($_GET['url']) || $_GET['url'] == '') {
-	die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
-}
-$url = $_GET['url'];
-if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
-$html = file_get_contents($url);
-$r = new Readability($html, $url);
-$r->init();
-echo $r->articleContent->innerHTML;
-*/
-
-class Readability
-{
-	public $version = '1.7.1-without-multi-page';
-	public $convertLinksToFootnotes = false;
-	public $revertForcedParagraphElements = true;
-	public $articleTitle;
-	public $articleContent;
-	public $dom;
-	public $url = null; // optional - URL where HTML was retrieved
-	public $debug = false;
-	public $lightClean = true; // preserves more content (experimental) added 2012-09-19
-	protected $body = null; // 
-	protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
-	protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
-	protected $success = false; // indicates whether we were able to extract or not
-	
-	/**
-	* All of the regular expressions in use within readability.
-	* Defined up here so we don't instantiate them repeatedly in loops.
-	**/
-	public $regexps = array(
-		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
-		'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-		'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
-		'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
-		'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
-		'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
-		'replaceFonts' => '/<(\/?)font[^>]*>/i',
-		// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
-		'normalize' => '/\s{2,}/',
-		'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
-		'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
-		'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
-	);	
-	
-	/* constants */
-	const FLAG_STRIP_UNLIKELYS = 1;
-	const FLAG_WEIGHT_CLASSES = 2;
-	const FLAG_CLEAN_CONDITIONALLY = 4;
-	
-	/**
-	* Create instance of Readability
-	* @param string UTF-8 encoded string
-	* @param string (optional) URL associated with HTML (used for footnotes)
-	* @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
-	*/	
-	function __construct($html, $url=null, $parser='libxml')
-	{
-		$this->url = $url;
-		/* Turn all double br's into p's */
-		$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
-		$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
-		$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
-		if (trim($html) == '') $html = '<html></html>';
-		if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
-			// all good
-		} else {
-			$this->dom = new DOMDocument();
-			$this->dom->preserveWhiteSpace = false;
-			@$this->dom->loadHTML($html);
-		}
-		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
-	}
-
-	/**
-	* Get article title element
-	* @return DOMElement
-	*/
-	public function getTitle() {
-		return $this->articleTitle;
-	}
-	
-	/**
-	* Get article content element
-	* @return DOMElement
-	*/
-	public function getContent() {
-		return $this->articleContent;
-	}	
-	
-	/**
-	* Runs readability.
-	* 
-	* Workflow:
-	*  1. Prep the document by removing script tags, css, etc.
-	*  2. Build readability's DOM tree.
-	*  3. Grab the article content from the current dom tree.
-	*  4. Replace the current DOM tree with the new one.
-	*  5. Read peacefully.
-	*
-	* @return boolean true if we found content, false otherwise
-	**/
-	public function init()
-	{
-		if (!isset($this->dom->documentElement)) return false;
-		$this->removeScripts($this->dom);
-		//die($this->getInnerHTML($this->dom->documentElement));
-		
-		// Assume successful outcome
-		$this->success = true;
-
-		$bodyElems = $this->dom->getElementsByTagName('body');
-		if ($bodyElems->length > 0) {
-			if ($this->bodyCache == null) {
-				$this->bodyCache = $bodyElems->item(0)->innerHTML;
-			}
-			if ($this->body == null) {
-				$this->body = $bodyElems->item(0);
-			}
-		}
-
-		$this->prepDocument();
-		
-		//die($this->dom->documentElement->parentNode->nodeType);
-		//$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
-		//die($this->getInnerHTML($this->dom->documentElement));
-
-		/* Build readability's DOM tree */
-		$overlay        = $this->dom->createElement('div');
-		$innerDiv       = $this->dom->createElement('div');
-		$articleTitle   = $this->getArticleTitle();
-		$articleContent = $this->grabArticle();
-
-		if (!$articleContent) {
-			$this->success = false;
-			$articleContent = $this->dom->createElement('div');
-			$articleContent->setAttribute('id', 'readability-content');
-			$articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';		
-		}
-		
-		$overlay->setAttribute('id', 'readOverlay');
-		$innerDiv->setAttribute('id', 'readInner');
-
-		/* Glue the structure of our document together. */
-		$innerDiv->appendChild($articleTitle);
-		$innerDiv->appendChild($articleContent);
-		$overlay->appendChild($innerDiv);
-		
-		/* Clear the old HTML, insert the new content. */
-		$this->body->innerHTML = '';
-		$this->body->appendChild($overlay);
-		//document.body.insertBefore(overlay, document.body.firstChild);
-		$this->body->removeAttribute('style');
-
-		$this->postProcessContent($articleContent);
-		
-		// Set title and content instance variables
-		$this->articleTitle = $articleTitle;
-		$this->articleContent = $articleContent;
-		
-		return $this->success;
-	}
-	
-	/**
-	* Debug
-	*/
-	protected function dbg($msg) {
-		if ($this->debug) echo '* ',$msg, "\n";
-	}
-	
-	/**
-	* Run any post-process modifications to article content as necessary.
-	*
-	* @param DOMElement
-	* @return void
-	*/
-	public function postProcessContent($articleContent) {
-		if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { 
-			$this->addFootnotes($articleContent);
-		}
-	}
-	
-	/**
-	* Get the article title as an H1.
-	*
-	* @return DOMElement
-	*/
-	protected function getArticleTitle() {
-		$curTitle = '';
-		$origTitle = '';
-
-		try {
-			$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
-		} catch(Exception $e) {}
-		
-		if (preg_match('/ [\|\-] /', $curTitle))
-		{
-			$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
-			
-			if (count(explode(' ', $curTitle)) < 3) {
-				$curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
-			}
-		}
-		else if (strpos($curTitle, ': ') !== false)
-		{
-			$curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
-
-			if (count(explode(' ', $curTitle)) < 3) {
-				$curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
-			}
-		}
-		else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
-		{
-			$hOnes = $this->dom->getElementsByTagName('h1');
-			if($hOnes->length == 1)
-			{
-				$curTitle = $this->getInnerText($hOnes->item(0));
-			}
-		}
-
-		$curTitle = trim($curTitle);
-
-		if (count(explode(' ', $curTitle)) <= 4) {
-			$curTitle = $origTitle;
-		}
-		
-		$articleTitle = $this->dom->createElement('h1');
-		$articleTitle->innerHTML = $curTitle;
-		
-		return $articleTitle;
-	}
-	
-	/**
-	* Prepare the HTML document for readability to scrape it.
-	* This includes things like stripping javascript, CSS, and handling terrible markup.
-	* 
-	* @return void
-	**/
-	protected function prepDocument() {
-		/**
-		* In some cases a body element can't be found (if the HTML is totally hosed for example)
-		* so we create a new body node and append it to the document.
-		*/
-		if ($this->body == null)
-		{
-			$this->body = $this->dom->createElement('body');
-			$this->dom->documentElement->appendChild($this->body);
-		}
-		$this->body->setAttribute('id', 'readabilityBody');
-
-		/* Remove all style tags in head */
-		$styleTags = $this->dom->getElementsByTagName('style');
-		for ($i = $styleTags->length-1; $i >= 0; $i--)
-		{
-			$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
-		}
-
-		/* Turn all double br's into p's */
-		/* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
-		//document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
-		// We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree.
-		// Manipulating innerHTML as it's done in JS is not possible in PHP.
-	}
-
-	/**
-	* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
-	* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
-	*
-	* @return void
-	**/
-	public function addFootnotes($articleContent) {
-		$footnotesWrapper = $this->dom->createElement('div');
-		$footnotesWrapper->setAttribute('id', 'readability-footnotes');
-		$footnotesWrapper->innerHTML = '<h3>References</h3>';
-		
-		$articleFootnotes = $this->dom->createElement('ol');
-		$articleFootnotes->setAttribute('id', 'readability-footnotes-list');
-		$footnotesWrapper->appendChild($articleFootnotes);
-		
-		$articleLinks = $articleContent->getElementsByTagName('a');
-		
-		$linkCount = 0;
-		for ($i = 0; $i < $articleLinks->length; $i++)
-		{
-			$articleLink  = $articleLinks->item($i);
-			$footnoteLink = $articleLink->cloneNode(true);
-			$refLink      = $this->dom->createElement('a');
-			$footnote     = $this->dom->createElement('li');
-			$linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
-			if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
-			//linkDomain   = footnoteLink.host ? footnoteLink.host : document.location.host,
-			$linkText     = $this->getInnerText($articleLink);
-			
-			if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
-				continue;
-			}
-			
-			$linkCount++;
-
-			/** Add a superscript reference after the article link */
-			$refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
-			$refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
-			$refLink->setAttribute('class', 'readability-DoNotFootnote');
-			$refLink->setAttribute('style', 'color: inherit;');
-			
-			//TODO: does this work or should we use DOMNode.isSameNode()?
-			if ($articleLink->parentNode->lastChild == $articleLink) {
-				$articleLink->parentNode->appendChild($refLink);
-			} else {
-				$articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
-			}
-
-			$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
-			$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
-
-			$footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
-
-			$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
-			$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
-			
-			$footnote->appendChild($footnoteLink);
-			if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
-			
-			$articleFootnotes->appendChild($footnote);
-		}
-
-		if ($linkCount > 0) {
-			$articleContent->appendChild($footnotesWrapper);           
-		}
-	}
-
-	/**
-	* Reverts P elements with class 'readability-styled'
-	* to text nodes - which is what they were before.
-	*
-	* @param DOMElement
-	* @return void
-	*/
-	function revertReadabilityStyledElements($articleContent) {
-		$xpath = new DOMXPath($articleContent->ownerDocument);
-		$elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent);
-		//$elems = $articleContent->getElementsByTagName('p');
-		for ($i = $elems->length-1; $i >= 0; $i--) {
-			$e = $elems->item($i);
-			$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
-			//if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') {
-			//	$e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e);
-			//}
-		}
-	}
-	
-	/**
-	* Prepare the article node for display. Clean out any inline styles,
-	* iframes, forms, strip extraneous <p> tags, etc.
-	*
-	* @param DOMElement
-	* @return void
-	*/
-	function prepArticle($articleContent) {
-		$this->cleanStyles($articleContent);
-		$this->killBreaks($articleContent);
-		if ($this->revertForcedParagraphElements) {
-			$this->revertReadabilityStyledElements($articleContent);
-		}
-
-		/* Clean out junk from the article content */
-		$this->cleanConditionally($articleContent, 'form');
-		$this->clean($articleContent, 'object');
-		$this->clean($articleContent, 'h1');
-
-		/**
-		* If there is only one h2, they are probably using it
-		* as a header and not a subheader, so remove it since we already have a header.
-		***/
-		if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
-			$this->clean($articleContent, 'h2'); 
-		}
-		$this->clean($articleContent, 'iframe');
-
-		$this->cleanHeaders($articleContent);
-
-		/* Do these last as the previous stuff may have removed junk that will affect these */
-		$this->cleanConditionally($articleContent, 'table');
-		$this->cleanConditionally($articleContent, 'ul');
-		$this->cleanConditionally($articleContent, 'div');
-
-		/* Remove extra paragraphs */
-		$articleParagraphs = $articleContent->getElementsByTagName('p');
-		for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
-		{
-			$imgCount    = $articleParagraphs->item($i)->getElementsByTagName('img')->length;
-			$embedCount  = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
-			$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
-			$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
-			
-			if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
-			{
-				$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
-			}
-		}
-
-		try {
-			$articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
-			//articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');      
-		}
-		catch (Exception $e) {
-			$this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
-		}
-	}
-	
-	/**
-	* Initialize a node with the readability object. Also checks the
-	* className/id for special names to add to its score.
-	*
-	* @param Element
-	* @return void
-	**/
-	protected function initializeNode($node) {
-		$readability = $this->dom->createAttribute('readability');
-		$readability->value = 0; // this is our contentScore
-		$node->setAttributeNode($readability);		         
-
-		switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
-			case 'DIV':
-				$readability->value += 5;
-				break;
-
-			case 'PRE':
-			case 'TD':
-			case 'BLOCKQUOTE':
-				$readability->value += 3;
-				break;
-				
-			case 'ADDRESS':
-			case 'OL':
-			case 'UL':
-			case 'DL':
-			case 'DD':
-			case 'DT':
-			case 'LI':
-			case 'FORM':
-				$readability->value -= 3;
-				break;
-
-			case 'H1':
-			case 'H2':
-			case 'H3':
-			case 'H4':
-			case 'H5':
-			case 'H6':
-			case 'TH':
-				$readability->value -= 5;
-				break;
-		}
-		$readability->value += $this->getClassWeight($node);
-	}
-	
-	/***
-	* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
-	*               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
-	*
-	* @return DOMElement
-	**/
-	protected function grabArticle($page=null) {
-		$stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
-		if (!$page) $page = $this->dom;
-		$allElements = $page->getElementsByTagName('*');
-		/**
-		* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
-		* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
-		*
-		* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
-		* TODO: Shouldn't this be a reverse traversal?
-		**/
-		$node = null;
-		$nodesToScore = array();
-		for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
-		//for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
-			//$node = $targetList->item($nodeIndex);
-			$tagName = strtoupper($node->tagName);
-			/* Remove unlikely candidates */
-			if ($stripUnlikelyCandidates) {
-				$unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
-				if (
-					preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
-					!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
-					$tagName != 'BODY'
-				)
-				{
-					$this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
-					//$nodesToRemove[] = $node;
-					$node->parentNode->removeChild($node);
-					$nodeIndex--;
-					continue;
-				}               
-			}
-
-			if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
-				$nodesToScore[] = $node;
-			}
-
-			/* Turn all divs that don't have children block level elements into p's */
-			if ($tagName == 'DIV') {
-				if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
-					//$this->dbg('Altering div to p');
-					$newNode = $this->dom->createElement('p');
-					try {
-						$newNode->innerHTML = $node->innerHTML;
-						//$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
-						$node->parentNode->replaceChild($newNode, $node);
-						$nodeIndex--;
-						$nodesToScore[] = $node; // or $newNode?
-					}
-					catch(Exception $e) {
-						$this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
-					}
-				}
-				else
-				{
-					/* EXPERIMENTAL */
-					// TODO: change these p elements back to text nodes after processing
-					for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
-						$childNode = $node->childNodes->item($i);
-						if ($childNode->nodeType == 3) { // XML_TEXT_NODE
-							//$this->dbg('replacing text node with a p tag with the same content.');
-							$p = $this->dom->createElement('p');
-							$p->innerHTML = $childNode->nodeValue;
-							$p->setAttribute('style', 'display: inline;');
-							$p->setAttribute('class', 'readability-styled');
-							$childNode->parentNode->replaceChild($p, $childNode);
-						}
-					}
-				}
-			}
-		}
-		
-		/**
-		* Loop through all paragraphs, and assign a score to them based on how content-y they look.
-		* Then add their score to their parent node.
-		*
-		* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
-		**/
-		$candidates = array();
-		for ($pt=0; $pt < count($nodesToScore); $pt++) {
-			$parentNode      = $nodesToScore[$pt]->parentNode;
-			// $grandParentNode = $parentNode ? $parentNode->parentNode : null;
-			$grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
-			$innerText       = $this->getInnerText($nodesToScore[$pt]);
-
-			if (!$parentNode || !isset($parentNode->tagName)) {
-				continue;
-			}
-
-			/* If this paragraph is less than 25 characters, don't even count it. */
-			if(strlen($innerText) < 25) {
-				continue;
-			}
-
-			/* Initialize readability data for the parent. */
-			if (!$parentNode->hasAttribute('readability')) 
-			{
-				$this->initializeNode($parentNode);
-				$candidates[] = $parentNode;
-			}
-
-			/* Initialize readability data for the grandparent. */
-			if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
-			{
-				$this->initializeNode($grandParentNode);
-				$candidates[] = $grandParentNode;
-			}
-
-			$contentScore = 0;
-
-			/* Add a point for the paragraph itself as a base. */
-			$contentScore++;
-
-			/* Add points for any commas within this paragraph */
-			$contentScore += count(explode(',', $innerText));
-			
-			/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
-			$contentScore += min(floor(strlen($innerText) / 100), 3);
-			
-			/* Add the score to the parent. The grandparent gets half. */
-			$parentNode->getAttributeNode('readability')->value += $contentScore;
-
-			if ($grandParentNode) {
-				$grandParentNode->getAttributeNode('readability')->value += $contentScore/2;             
-			}
-		}
-
-		/**
-		* After we've calculated scores, loop through all of the possible candidate nodes we found
-		* and find the one with the highest score.
-		**/
-		$topCandidate = null;
-		for ($c=0, $cl=count($candidates); $c < $cl; $c++)
-		{
-			/**
-			* Scale the final candidates score based on link density. Good content should have a
-			* relatively small link density (5% or less) and be mostly unaffected by this operation.
-			**/
-			$readability = $candidates[$c]->getAttributeNode('readability');
-			$readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
-
-			$this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
-
-			if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
-				$topCandidate = $candidates[$c];
-			}
-		}
-
-		/**
-		* If we still have no top candidate, just use the body as a last resort.
-		* We also have to copy the body node so it is something we can modify.
-		**/
-		if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
-		{
-			$topCandidate = $this->dom->createElement('div');
-			if ($page instanceof DOMDocument) {
-				if (!isset($page->documentElement)) {
-					// we don't have a body either? what a mess! :)
-				} else {
-					$topCandidate->innerHTML = $page->documentElement->innerHTML;
-					$page->documentElement->innerHTML = '';
-					$page->documentElement->appendChild($topCandidate);
-				}
-			} else {
-				$topCandidate->innerHTML = $page->innerHTML;
-				$page->innerHTML = '';
-				$page->appendChild($topCandidate);
-			}
-			$this->initializeNode($topCandidate);
-		}
-
-		/**
-		* Now that we have the top candidate, look through its siblings for content that might also be related.
-		* Things like preambles, content split by ads that we removed, etc.
-		**/
-		$articleContent        = $this->dom->createElement('div');
-		$articleContent->setAttribute('id', 'readability-content');
-		$siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
-		$siblingNodes          = $topCandidate->parentNode->childNodes;
-		if (!isset($siblingNodes)) {
-			$siblingNodes = new stdClass;
-			$siblingNodes->length = 0;
-		}
-
-		for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
-		{
-			$siblingNode = $siblingNodes->item($s);
-			$append      = false;
-
-			$this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
-
-			//dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
-
-			if ($siblingNode === $topCandidate)
-			// or if ($siblingNode->isSameNode($topCandidate))
-			{
-				$append = true;
-			}
-
-			$contentBonus = 0;
-			/* Give a bonus if sibling nodes and top candidates have the example same classname */
-			if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
-				$contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
-			}
-
-			if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
-			{
-				$append = true;
-			}
-			
-			if (strtoupper($siblingNode->nodeName) == 'P') {
-				$linkDensity = $this->getLinkDensity($siblingNode);
-				$nodeContent = $this->getInnerText($siblingNode);
-				$nodeLength  = strlen($nodeContent);
-				
-				if ($nodeLength > 80 && $linkDensity < 0.25)
-				{
-					$append = true;
-				}
-				else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
-				{
-					$append = true;
-				}
-			}
-
-			if ($append)
-			{
-				$this->dbg('Appending node: ' . $siblingNode->nodeName);
-
-				$nodeToAppend = null;
-				$sibNodeName = strtoupper($siblingNode->nodeName);
-				if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
-					/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
-					
-					$this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
-					$nodeToAppend = $this->dom->createElement('div');
-					try {
-						$nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
-						$nodeToAppend->innerHTML = $siblingNode->innerHTML;
-					}
-					catch(Exception $e)
-					{
-						$this->dbg('Could not alter siblingNode to div, reverting back to original.');
-						$nodeToAppend = $siblingNode;
-						$s--;
-						$sl--;
-					}
-				} else {
-					$nodeToAppend = $siblingNode;
-					$s--;
-					$sl--;
-				}
-				
-				/* To ensure a node does not interfere with readability styles, remove its classnames */
-				$nodeToAppend->removeAttribute('class');
-
-				/* Append sibling and subtract from our list because it removes the node when you append to another node */
-				$articleContent->appendChild($nodeToAppend);
-			}
-		}
-
-		/**
-		* So we have all of the content that we need. Now we clean it up for presentation.
-		**/
-		$this->prepArticle($articleContent);
-
-		/**
-		* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
-		* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
-		* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
-		* finding the -right- content.
-		**/
-		if (strlen($this->getInnerText($articleContent, false)) < 250)
-		{
-			// TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
-			// in the meantime, we check and create an empty element if it's not there.
-			if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
-			$this->body->innerHTML = $this->bodyCache;
-			
-			if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
-				$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
-				return $this->grabArticle($this->body);
-			}
-			else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
-				$this->removeFlag(self::FLAG_WEIGHT_CLASSES);
-				return $this->grabArticle($this->body);              
-			}
-			else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
-				$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
-				return $this->grabArticle($this->body);
-			}
-			else {
-				return false;
-			}
-		}
-		return $articleContent;
-	}
-	
-	/**
-	* Remove script tags from document
-	*
-	* @param DOMElement
-	* @return void
-	*/
-	public function removeScripts($doc) {
-		$scripts = $doc->getElementsByTagName('script');
-		for($i = $scripts->length-1; $i >= 0; $i--)
-		{
-			$scripts->item($i)->parentNode->removeChild($scripts->item($i));
-		}
-	}
-	
-	/**
-	* Get the inner text of a node.
-	* This also strips out any excess whitespace to be found.
-	*
-	* @param DOMElement $
-	* @param boolean $normalizeSpaces (default: true)
-	* @return string
-	**/
-	public function getInnerText($e, $normalizeSpaces=true) {
-		$textContent = '';
-
-		if (!isset($e->textContent) || $e->textContent == '') {
-			return '';
-		}
-
-		$textContent = trim($e->textContent);
-
-		if ($normalizeSpaces) {
-			return preg_replace($this->regexps['normalize'], ' ', $textContent);
-		} else {
-			return $textContent;
-		}
-	}
-
-	/**
-	* Get the number of times a string $s appears in the node $e.
-	*
-	* @param DOMElement $e
-	* @param string - what to count. Default is ","
-	* @return number (integer)
-	**/
-	public function getCharCount($e, $s=',') {
-		return substr_count($this->getInnerText($e), $s);
-	}
-
-	/**
-	* Remove the style attribute on every $e and under.
-	*
-	* @param DOMElement $e
-	* @return void
-	*/
-	public function cleanStyles($e) {
-		if (!is_object($e)) return;
-		$elems = $e->getElementsByTagName('*');
-		foreach ($elems as $elem) {
-			$elem->removeAttribute('style');
-		}
-	}
-	
-	/**
-	* Get the density of links as a percentage of the content
-	* This is the amount of text that is inside a link divided by the total text in the node.
-	* 
-	* @param DOMElement $e
-	* @return number (float)
-	*/
-	public function getLinkDensity($e) {
-		$links      = $e->getElementsByTagName('a');
-		$textLength = strlen($this->getInnerText($e));
-		$linkLength = 0;
-		for ($i=0, $il=$links->length; $i < $il; $i++)
-		{
-			$linkLength += strlen($this->getInnerText($links->item($i)));
-		}
-		if ($textLength > 0) {
-			return $linkLength / $textLength;
-		} else {
-			return 0;
-		}
-	}
-	
-	/**
-	* Get an elements class/id weight. Uses regular expressions to tell if this 
-	* element looks good or bad.
-	*
-	* @param DOMElement $e
-	* @return number (Integer)
-	*/
-	public function getClassWeight($e) {
-		if(!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
-			return 0;
-		}
-
-		$weight = 0;
-
-		/* Look for a special classname */
-		if ($e->hasAttribute('class') && $e->getAttribute('class') != '')
-		{
-			if (preg_match($this->regexps['negative'], $e->getAttribute('class'))) {
-				$weight -= 25;
-			}
-			if (preg_match($this->regexps['positive'], $e->getAttribute('class'))) {
-				$weight += 25;
-			}
-		}
-
-		/* Look for a special ID */
-		if ($e->hasAttribute('id') && $e->getAttribute('id') != '')
-		{
-			if (preg_match($this->regexps['negative'], $e->getAttribute('id'))) {
-				$weight -= 25;
-			}
-			if (preg_match($this->regexps['positive'], $e->getAttribute('id'))) {
-				$weight += 25;
-			}
-		}
-		return $weight;
-	}
-
-	/**
-	* Remove extraneous break tags from a node.
-	*
-	* @param DOMElement $node
-	* @return void
-	*/
-	public function killBreaks($node) {
-		$html = $node->innerHTML;
-		$html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
-		$node->innerHTML = $html;
-	}
-
-	/**
-	* Clean a node of all elements of type "tag".
-	* (Unless it's a youtube/vimeo video. People love movies.)
-	*
-	* Updated 2012-09-18 to preserve youtube/vimeo iframes
-	*
-	* @param DOMElement $e
-	* @param string $tag
-	* @return void
-	*/
-	public function clean($e, $tag) {
-		$targetList = $e->getElementsByTagName($tag);
-		$isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
-		
-		for ($y=$targetList->length-1; $y >= 0; $y--) {
-			/* Allow youtube and vimeo videos through as people usually want to see those. */
-			if ($isEmbed) {
-				$attributeValues = '';
-				for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) {
-					$attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test)
-				}
-				
-				/* First, check the elements attributes to see if any of them contain youtube or vimeo */
-				if (preg_match($this->regexps['video'], $attributeValues)) {
-					continue;
-				}
-
-				/* Then check the elements inside this element for the same. */
-				if (preg_match($this->regexps['video'], $targetList->item($y)->innerHTML)) {
-					continue;
-				}
-			}
-			$targetList->item($y)->parentNode->removeChild($targetList->item($y));
-		}
-	}
-	
-	/**
-	* Clean an element of all tags of type "tag" if they look fishy.
-	* "Fishy" is an algorithm based on content length, classnames, 
-	* link density, number of images & embeds, etc.
-	*
-	* @param DOMElement $e
-	* @param string $tag
-	* @return void
-	*/
-	public function cleanConditionally($e, $tag) {
-		if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
-			return;
-		}
-
-		$tagsList = $e->getElementsByTagName($tag);
-		$curTagsLength = $tagsList->length;
-
-		/**
-		* Gather counts for other typical elements embedded within.
-		* Traverse backwards so we can remove nodes at the same time without effecting the traversal.
-		*
-		* TODO: Consider taking into account original contentScore here.
-		*/
-		for ($i=$curTagsLength-1; $i >= 0; $i--) {
-			$weight = $this->getClassWeight($tagsList->item($i));
-			$contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;
-			
-			$this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));
-
-			if ($weight + $contentScore < 0) {
-				$tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
-			}
-			else if ( $this->getCharCount($tagsList->item($i), ',') < 10) {
-				/**
-				* If there are not very many commas, and the number of
-				* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
-				**/
-				$p      = $tagsList->item($i)->getElementsByTagName('p')->length;
-				$img    = $tagsList->item($i)->getElementsByTagName('img')->length;
-				$li     = $tagsList->item($i)->getElementsByTagName('li')->length-100;
-				$input  = $tagsList->item($i)->getElementsByTagName('input')->length;
-				$a 		= $tagsList->item($i)->getElementsByTagName('a')->length;
-
-				$embedCount = 0;
-				$embeds = $tagsList->item($i)->getElementsByTagName('embed');
-				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
-					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
-						$embedCount++; 
-					}
-				}
-				$embeds = $tagsList->item($i)->getElementsByTagName('iframe');
-				for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
-					if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
-						$embedCount++; 
-					}
-				}
-
-				$linkDensity   = $this->getLinkDensity($tagsList->item($i));
-				$contentLength = strlen($this->getInnerText($tagsList->item($i)));
-				$toRemove      = false;
-
-				if ($this->lightClean) {
-					$this->dbg('Light clean...');
-					if ( ($img > $p) && ($img > 4) ) {
-						$this->dbg(' more than 4 images and more image elements than paragraph elements');
-						$toRemove = true;
-					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
-						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
-						$toRemove = true;
-					} else if ( $input > floor($p/3) ) {
-						$this->dbg(' too many <input> elements');
-						$toRemove = true; 
-					} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
-						$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
-						$toRemove = true;
-					} else if($weight < 25 && $linkDensity > 0.2) {
-						$this->dbg(' weight smaller than 25 and link density above 0.2');
-						$toRemove = true;
-					} else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
-						$this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
-						$toRemove = true;
-					} else if($embedCount > 3) {
-						$this->dbg(' more than 3 embeds');
-						$toRemove = true;
-					}
-				} else {
-					$this->dbg('Standard clean...');
-					if ( $img > $p ) {
-						$this->dbg(' more image elements than paragraph elements');
-						$toRemove = true;
-					} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
-						$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
-						$toRemove = true;
-					} else if ( $input > floor($p/3) ) {
-						$this->dbg(' too many <input> elements');
-						$toRemove = true; 
-					} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
-						$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
-						$toRemove = true;
-					} else if($weight < 25 && $linkDensity > 0.2) {
-						$this->dbg(' weight smaller than 25 and link density above 0.2');
-						$toRemove = true;
-					} else if($weight >= 25 && $linkDensity > 0.5) {
-						$this->dbg(' weight above 25 but link density greater than 0.5');
-						$toRemove = true;
-					} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
-						$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
-						$toRemove = true;
-					}
-				}
-
-				if ($toRemove) {
-					//$this->dbg('Removing: '.$tagsList->item($i)->innerHTML);
-					$tagsList->item($i)->parentNode->removeChild($tagsList->item($i));
-				}
-			}
-		}
-	}
-
-	/**
-	* Clean out spurious headers from an Element. Checks things like classnames and link density.
-	*
-	* @param DOMElement $e
-	* @return void
-	*/
-	public function cleanHeaders($e) {
-		for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
-			$headers = $e->getElementsByTagName('h' . $headerIndex);
-			for ($i=$headers->length-1; $i >=0; $i--) {
-				if ($this->getClassWeight($headers->item($i)) < 0 || $this->getLinkDensity($headers->item($i)) > 0.33) {
-					$headers->item($i)->parentNode->removeChild($headers->item($i));
-				}
-			}
-		}
-	}
-
-	public function flagIsActive($flag) {
-		return ($this->flags & $flag) > 0;
-	}
-	
-	public function addFlag($flag) {
-		$this->flags = $this->flags | $flag;
-	}
-	
-	public function removeFlag($flag) {
-		$this->flags = $this->flags & ~$flag;
-	}
-}
+<?php
+/** 
+* Arc90's Readability ported to PHP for FiveFilters.org
+* Based on readability.js version 1.7.1 (without multi-page support)
+* Updated to allow HTML5 parsing with html5lib
+* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
+* ------------------------------------------------------
+* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
+* Arc90's project URL: http://lab.arc90.com/experiments/readability/
+* JS Source: http://code.google.com/p/arc90labs-readability
+* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
+* More information: http://fivefilters.org/content-only/
+* License: Apache License, Version 2.0
+* Requires: PHP5
+* Date: 2012-09-19
+* 
+* Differences between the PHP port and the original
+* ------------------------------------------------------
+* Arc90's Readability is designed to run in the browser. It works on the DOM 
+* tree (the parsed HTML) after the page's CSS styles have been applied and 
+* Javascript code executed. This PHP port does not run inside a browser. 
+* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 
+* rely on CSS or Javascript support. As such, the results will not always 
+* match Arc90's Readability. (For example, if a web page contains CSS style 
+* rules or Javascript code which hide certain HTML elements from display, 
+* Arc90's Readability will dismiss those from consideration but our PHP port, 
+* unable to understand CSS or Javascript, will not know any better.)
+* 
+* Another significant difference is that the aim of Arc90's Readability is 
+* to re-present the main content block of a given web page so users can 
+* read it more easily in their browsers. Correct identification, clean up, 
+* and separation of the content block is only a part of this process. 
+* This PHP port is only concerned with this part, it does not include code 
+* that relates to presentation in the browser - Arc90 already do 
+* that extremely well, and for PDF output there's FiveFilters.org's 
+* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
+* 
+* Finally, this class contains methods that might be useful for developers 
+* working on HTML document fragments. So without deviating too much from 
+* the original code (which I don't want to do because it makes debugging 
+* and updating more difficult), I've tried to make it a little more 
+* developer friendly. You should be able to use the methods here on 
+* existing DOMElement objects without passing an entire HTML document to 
+* be parsed.
+*/
+
+// This class allows us to do JavaScript-like assignments to innerHTML
+require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
+
+// Alternative usage (for testing only!)
+// uncomment the lines below and call Readability.php in your browser 
+// passing it the URL of the page you'd like content from, e.g.:
+// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
+
+/*
+if (!isset($_GET['url']) || $_GET['url'] == '') {
+	die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
+}
+$url = $_GET['url'];
+if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
+$html = file_get_contents($url);
+$r = new Readability($html, $url);
+$r->init();
+echo $r->articleContent->innerHTML;
+*/
+
+class Readability
+{
+	public $version = '1.7.1-without-multi-page';
+	public $convertLinksToFootnotes = false;
+	public $revertForcedParagraphElements = true;
+	public $articleTitle;
+	public $articleContent;
+	public $dom;
+	public $url = null; // optional - URL where HTML was retrieved
+	public $debug = false;
+	public $lightClean = true; // preserves more content (experimental) added 2012-09-19
+	protected $body = null; // 
+	protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
+	protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
+	protected $success = false; // indicates whether we were able to extract or not
+	
+	/**
+	* All of the regular expressions in use within readability.
+	* Defined up here so we don't instantiate them repeatedly in loops.
+	**/
+	public $regexps = array(
+		'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
+		'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+		'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+		'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+		'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
+		'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
+		'replaceFonts' => '/<(\/?)font[^>]*>/i',
+		// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
+		'normalize' => '/\s{2,}/',
+		'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
+		'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
+		'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+	);	
+	
+	/* constants */
+	const FLAG_STRIP_UNLIKELYS = 1;
+	const FLAG_WEIGHT_CLASSES = 2;
+	const FLAG_CLEAN_CONDITIONALLY = 4;
+	
+	/**
+	* Create an instance of Readability and parse the given HTML into $this->dom.
+	* @param string $html UTF-8 encoded HTML string
+	* @param string $url (optional) URL associated with HTML (used for footnotes)
+	* @param string $parser which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib'); libxml is also used when html5lib parsing returns a falsy result
+	*/	
+	function __construct($html, $url=null, $parser='libxml')
+	{
+		$this->url = $url;
+		/* Turn all double br's into p's and font tags into spans (done on the raw HTML, before it is parsed) */
+		$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
+		$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
+		$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); // re-encode the UTF-8 input as HTML entities for the parser
+		if (trim($html) == '') $html = '<html></html>'; // empty input: hand the parser a minimal document instead
+		if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
+			// all good - html5lib returned a document
+		} else {
+			$this->dom = new DOMDocument();
+			$this->dom->preserveWhiteSpace = false;
+			@$this->dom->loadHTML($html); // @ silences the warnings loadHTML() raises while parsing
+		}
+		$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); // JSLikeHTMLElement is what allows the ->innerHTML assignments used in this class
+	}
+
+	/**
+	* Get the article title element (the h1 stored in $this->articleTitle by init()).
+	* @return DOMElement|null null if init() has not stored a title yet
+	*/
+	public function getTitle() {
+		return $this->articleTitle;
+	}
+	
+	/**
+	* Get the article content element (stored in $this->articleContent by init()).
+	* @return DOMElement|null null if init() has not stored any content yet
+	*/
+	public function getContent() {
+		return $this->articleContent;
+	}	
+	
+	/**
+	* Runs readability.
+	* 
+	* Workflow:
+	*  1. Prep the document by removing script tags, css, etc.
+	*  2. Build readability's DOM tree.
+	*  3. Grab the article content from the current dom tree.
+	*  4. Replace the current DOM tree with the new one.
+	*  5. Read peacefully.
+	*
+	* @return boolean true if we found content, false otherwise (no document element, or grabArticle() found nothing and a placeholder message was inserted instead)
+	**/
+	public function init()
+	{
+		if (!isset($this->dom->documentElement)) return false; // nothing was parsed, so there is nothing to extract
+		$this->removeScripts($this->dom);
+		//die($this->getInnerHTML($this->dom->documentElement));
+		
+		// Assume successful outcome
+		$this->success = true;
+
+		$bodyElems = $this->dom->getElementsByTagName('body');
+		if ($bodyElems->length > 0) {
+			if ($this->bodyCache == null) { // keep the body HTML as it was before the document is modified below
+				$this->bodyCache = $bodyElems->item(0)->innerHTML;
+			}
+			if ($this->body == null) {
+				$this->body = $bodyElems->item(0);
+			}
+		}
+
+		$this->prepDocument(); // also creates $this->body when no body element was found above
+		
+		//die($this->dom->documentElement->parentNode->nodeType);
+		//$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
+		//die($this->getInnerHTML($this->dom->documentElement));
+
+		/* Build readability's DOM tree */
+		$overlay        = $this->dom->createElement('div');
+		$innerDiv       = $this->dom->createElement('div');
+		$articleTitle   = $this->getArticleTitle();
+		$articleContent = $this->grabArticle();
+
+		if (!$articleContent) { // extraction failed: record the failure and substitute a placeholder message
+			$this->success = false;
+			$articleContent = $this->dom->createElement('div');
+			$articleContent->setAttribute('id', 'readability-content');
+			$articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';		
+		}
+		
+		$overlay->setAttribute('id', 'readOverlay');
+		$innerDiv->setAttribute('id', 'readInner');
+
+		/* Glue the structure of our document together. */
+		$innerDiv->appendChild($articleTitle);
+		$innerDiv->appendChild($articleContent);
+		$overlay->appendChild($innerDiv);
+		
+		/* Clear the old HTML, insert the new content. */
+		$this->body->innerHTML = '';
+		$this->body->appendChild($overlay);
+		//document.body.insertBefore(overlay, document.body.firstChild);
+		$this->body->removeAttribute('style');
+
+		$this->postProcessContent($articleContent);
+		
+		// Set title and content instance variables
+		$this->articleTitle = $articleTitle;
+		$this->articleContent = $articleContent;
+		
+		return $this->success;
+	}
+	
+	/**
+	* Echo a debug message, prefixed with "* " and followed by a newline, when $this->debug is true.
+	*/
+	protected function dbg($msg) {
+		if ($this->debug) echo '* ',$msg, "\n";
+	}
+	
+	/**
+	* Run any post-process modifications to article content as necessary.
+	*
+	* @param DOMElement $articleContent
+	* @return void
+	*/
+	public function postProcessContent($articleContent) {
+		if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { // footnotes are opt-in and skipped when the URL contains wikipedia.org
+			$this->addFootnotes($articleContent);
+		}
+	}
+	
+	/**
+	* Get the article title as an H1, derived from the document's title element (or its only h1).
+	*
+	* @return DOMElement
+	*/
+	protected function getArticleTitle() {
+		$curTitle = '';
+		$origTitle = '';
+
+		try {
+			$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); // text of the first title element
+		} catch(Exception $e) {} // best effort: on failure both titles stay empty
+		
+		if (preg_match('/ [\|\-] /', $curTitle)) // title contains " | " or " - "
+		{
+			$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); // keep what precedes the last "|" or "-" that is followed by a space
+			
+			if (count(explode(' ', $curTitle)) < 3) { // too few words: take what follows the first "|" or "-" instead
+				$curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+			}
+		}
+		else if (strpos($curTitle, ': ') !== false) // title contains ": "
+		{
+			$curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); // keep what follows the last colon
+
+			if (count(explode(' ', $curTitle)) < 3) { // too few words: take what follows the first colon instead
+				$curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
+			}
+		}
+		else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) // suspiciously long or short title
+		{
+			$hOnes = $this->dom->getElementsByTagName('h1');
+			if($hOnes->length == 1) // only trust the h1 when there is exactly one
+			{
+				$curTitle = $this->getInnerText($hOnes->item(0));
+			}
+		}
+
+		$curTitle = trim($curTitle);
+
+		if (count(explode(' ', $curTitle)) <= 4) { // result has at most 4 space-separated parts: fall back to the original title
+			$curTitle = $origTitle;
+		}
+		
+		$articleTitle = $this->dom->createElement('h1');
+		$articleTitle->innerHTML = $curTitle;
+		
+		return $articleTitle;
+	}
+	
+	/**
+	* Prepare the HTML document for readability to scrape it.
+	* This ensures a body element exists, tags it with an id, and removes style elements.
+	* 
+	* @return void
+	**/
+	protected function prepDocument() {
+		/**
+		* In some cases a body element can't be found (if the HTML is totally hosed for example)
+		* so we create a new body node and append it to the document.
+		*/
+		if ($this->body == null)
+		{
+			$this->body = $this->dom->createElement('body');
+			$this->dom->documentElement->appendChild($this->body);
+		}
+		$this->body->setAttribute('id', 'readabilityBody');
+
+		/* Remove all style tags in the document (not only those in head) */
+		$styleTags = $this->dom->getElementsByTagName('style');
+		for ($i = $styleTags->length-1; $i >= 0; $i--) // walk backwards because nodes are removed while iterating
+		{
+			$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
+		}
+
+		/* Turn all double br's into p's */
+		/* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
+		//document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
+		// We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree.
+		// Manipulating innerHTML as it's done in JS is not possible in PHP.
+	}
+
+	/**
+	* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
+	* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
+	* @param DOMElement $articleContent element whose links are footnoted; the reference list is appended to it when at least one link was converted
+	* @return void
+	**/
+	public function addFootnotes($articleContent) {
+		$footnotesWrapper = $this->dom->createElement('div');
+		$footnotesWrapper->setAttribute('id', 'readability-footnotes');
+		$footnotesWrapper->innerHTML = '<h3>References</h3>';
+		
+		$articleFootnotes = $this->dom->createElement('ol');
+		$articleFootnotes->setAttribute('id', 'readability-footnotes-list');
+		$footnotesWrapper->appendChild($articleFootnotes);
+		
+		$articleLinks = $articleContent->getElementsByTagName('a');
+		
+		$linkCount = 0;
+		for ($i = 0; $i < $articleLinks->length; $i++) // length is re-read each pass: the [n] links inserted below join this list and are skipped via their class
+		{
+			$articleLink  = $articleLinks->item($i);
+			$footnoteLink = $articleLink->cloneNode(true);
+			$refLink      = $this->dom->createElement('a');
+			$footnote     = $this->dom->createElement('li');
+			$linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
+			if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); // link has no host of its own: use the host of the page URL
+			//linkDomain   = footnoteLink.host ? footnoteLink.host : document.location.host,
+			$linkText     = $this->getInnerText($articleLink);
+			
+			if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+				continue;
+			}
+			
+			$linkCount++;
+
+			/** Add a superscript reference after the article link */
+			$refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
+			$refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
+			$refLink->setAttribute('class', 'readability-DoNotFootnote');
+			$refLink->setAttribute('style', 'color: inherit;');
+			
+			// Test node identity with isSameNode(); == on two DOM objects does not tell us whether they are the same node.
+			if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) {
+				$articleLink->parentNode->appendChild($refLink);
+			} else {
+				$articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
+			}
+
+			$articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
+			$articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
+
+			$footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
+
+			$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); // prefer the link's title attribute as the footnote label
+			$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
+			
+			$footnote->appendChild($footnoteLink);
+			if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
+			
+			$articleFootnotes->appendChild($footnote);
+		}
+
+		if ($linkCount > 0) { // only add the References section when something was footnoted
+			$articleContent->appendChild($footnotesWrapper);
+		}
+	}
+
+	/**
+	* Reverts P elements with class 'readability-styled'
+	* to text nodes - which is what they were before.
+	*
+	* @param DOMElement $articleContent element whose descendant paragraphs are reverted
+	* @return void
+	*/
+	function revertReadabilityStyledElements($articleContent) {
+		$xpath = new DOMXPath($articleContent->ownerDocument);
+		$elems = $xpath->query('.//p[@class="readability-styled"]', $articleContent); // matches only when the class attribute is exactly "readability-styled"
+		//$elems = $articleContent->getElementsByTagName('p');
+		for ($i = $elems->length-1; $i >= 0; $i--) { // walk the matches from last to first
+			$e = $elems->item($i);
+			$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); // swap the paragraph for a text node holding its text content
+			//if ($e->hasAttribute('class') && $e->getAttribute('class') == 'readability-styled') {
+			//	$e->parentNode->replaceChild($this->dom->createTextNode($e->textContent), $e);
+			//}
+		}
+	}
+	
+	/**
+	* Prepare the article node for display. Clean out any inline styles,
+	* iframes, forms, strip extraneous <p> tags, etc.
+	*
+	* @param DOMElement
+	* @return void
+	*/
+	function prepArticle($articleContent) {
+		// Inline styles and runs of <br> go first, then (optionally) the forced <p> wrappers.
+		$this->cleanStyles($articleContent);
+		$this->killBreaks($articleContent);
+		if ($this->revertForcedParagraphElements) {
+			$this->revertReadabilityStyledElements($articleContent);
+		}
+
+		// Junk removal: forms only when they look fishy, then every object and h1.
+		$this->cleanConditionally($articleContent, 'form');
+		$this->clean($articleContent, 'object');
+		$this->clean($articleContent, 'h1');
+
+		// A lone h2 is most likely used as the header rather than a sub-header; a header is
+		// already known, so it is dropped (skipped in light-clean mode).
+		$h2Count = $articleContent->getElementsByTagName('h2')->length;
+		if (!$this->lightClean && ($h2Count == 1)) {
+			$this->clean($articleContent, 'h2');
+		}
+		$this->clean($articleContent, 'iframe');
+		$this->cleanHeaders($articleContent);
+
+		// These run last because the passes above may have removed junk that affects them.
+		foreach (array('table', 'ul', 'div') as $containerTag) {
+			$this->cleanConditionally($articleContent, $containerTag);
+		}
+
+		// Drop paragraphs that hold neither text nor any image/embed/object/iframe.
+		$mediaTags  = array('img', 'embed', 'object', 'iframe');
+		$paragraphs = $articleContent->getElementsByTagName('p');
+		for ($idx = $paragraphs->length - 1; $idx >= 0; $idx--) {
+			$paragraph = $paragraphs->item($idx);
+			$hasMedia  = false;
+			foreach ($mediaTags as $mediaTag) {
+				if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
+					$hasMedia = true;
+					break;
+				}
+			}
+			if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
+				$paragraph->parentNode->removeChild($paragraph);
+			}
+		}
+
+		// Strip a <br> that sits directly in front of a <p>.
+		try {
+			$articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
+		} catch (Exception $e) {
+			$this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
+		}
+	}
+	
+	/**
+	* Initialize a node with the readability object. Also checks the
+	* className/id for special names to add to its score.
+	*
+	* @param Element
+	* @return void
+	**/
+	protected function initializeNode($node) {
+		// Attach the 'readability' attribute that carries the node's content score.
+		$scoreAttr = $this->dom->createAttribute('readability');
+		$scoreAttr->value = 0;
+		$node->setAttributeNode($scoreAttr);
+
+		// Starting score by upper-cased tag name; tags that are not listed here keep a
+		// starting score of 0.
+		$baseScores = array(
+			'DIV'        => 5,
+			'PRE'        => 3,
+			'TD'         => 3,
+			'BLOCKQUOTE' => 3,
+
+			'ADDRESS'    => -3,
+			'OL'         => -3,
+			'UL'         => -3,
+			'DL'         => -3,
+			'DD'         => -3,
+			'DT'         => -3,
+			'LI'         => -3,
+			'FORM'       => -3,
+
+			'H1'         => -5,
+			'H2'         => -5,
+			'H3'         => -5,
+			'H4'         => -5,
+			'H5'         => -5,
+			'H6'         => -5,
+			'TH'         => -5,
+		);
+		$tag = strtoupper($node->tagName);
+		if (isset($baseScores[$tag])) {
+			$scoreAttr->value += $baseScores[$tag];
+		}
+
+		// Class and id names can push the score further up or down.
+		$scoreAttr->value += $this->getClassWeight($node);
+	}
+	
+	/***
+	* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
+	*               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+	*
+	* @return DOMElement
+	**/
+	protected function grabArticle($page=null) {
+		$stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
+		if (!$page) $page = $this->dom;
+		$allElements = $page->getElementsByTagName('*');
+		/**
+		* Returns a new div#readability-content built from the best-scoring element of $page (default:
+		* the whole document) and its related siblings, or false if every retry yields < 250 characters.
+		*
+		* Step 1, node prepping: trash nodes that look cruddy (class/id such as "comment") and turn divs
+		* that contain no block-level elements into P tags. TODO: Shouldn't this be a reverse traversal?
+		**/
+		$node = null;
+		$nodesToScore = array();
+		for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
+			// The loop relies on $allElements reflecting DOM changes: after a node is removed
+			// or replaced, $nodeIndex is stepped back so the element now in its slot is visited.
+			$tagName = strtoupper($node->tagName);
+			/* Remove unlikely candidates */
+			if ($stripUnlikelyCandidates) {
+				$unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
+				if (
+					preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+					!preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
+					$tagName != 'BODY'
+				)
+				{
+					$this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
+					// BODY is never removed, even when its class/id looks unlikely.
+					$node->parentNode->removeChild($node);
+					$nodeIndex--;
+					continue;
+				}
+			}
+
+			if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
+				$nodesToScore[] = $node;
+			}
+
+			/* Turn all divs that don't have children block level elements into p's */
+			if ($tagName == 'DIV') {
+				if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+					// No block-level children: build a <p> carrying the same markup.
+					$newNode = $this->dom->createElement('p');
+					try {
+						$newNode->innerHTML = $node->innerHTML;
+						// Swap in the <p> and step the index back: the loop then revisits this slot, now
+						// held by the <p>, and queues it for scoring (the detached div is not queued).
+						$node->parentNode->replaceChild($newNode, $node);
+						$nodeIndex--;
+					}
+					catch(Exception $e) {
+						$this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
+					}
+				}
+				else
+				{
+					/* EXPERIMENTAL */
+					// TODO: change these p elements back to text nodes after processing
+					for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
+						$childNode = $node->childNodes->item($i);
+						if ($childNode->nodeType == 3) { // XML_TEXT_NODE
+							// Wrap the bare text in an inline <p> flagged 'readability-styled'.
+							$p = $this->dom->createElement('p');
+							$p->innerHTML = $childNode->nodeValue;
+							$p->setAttribute('style', 'display: inline;');
+							$p->setAttribute('class', 'readability-styled');
+							$childNode->parentNode->replaceChild($p, $childNode);
+						}
+					}
+				}
+			}
+		}
+
+		/**
+		* Loop through all paragraphs, and assign a score to them based on how content-y they look.
+		* Then add their score to their parent node.
+		*
+		* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+		**/
+		$candidates = array();
+		for ($pt=0; $pt < count($nodesToScore); $pt++) {
+			$parentNode      = $nodesToScore[$pt]->parentNode;
+			// Only a DOMElement is accepted as grandparent; anything else is treated as absent.
+			$grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
+			$innerText       = $this->getInnerText($nodesToScore[$pt]);
+
+			if (!$parentNode || !isset($parentNode->tagName)) {
+				continue;
+			}
+
+			/* If this paragraph is less than 25 characters, don't even count it. */
+			if(strlen($innerText) < 25) {
+				continue;
+			}
+
+			/* Initialize readability data for the parent. */
+			if (!$parentNode->hasAttribute('readability'))
+			{
+				$this->initializeNode($parentNode);
+				$candidates[] = $parentNode;
+			}
+
+			/* Initialize readability data for the grandparent. */
+			if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
+			{
+				$this->initializeNode($grandParentNode);
+				$candidates[] = $grandParentNode;
+			}
+
+			$contentScore = 0;
+
+			/* Add a point for the paragraph itself as a base. */
+			$contentScore++;
+
+			/* Add one point per comma-separated piece of the paragraph (number of commas + 1) */
+			$contentScore += count(explode(',', $innerText));
+
+			/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
+			$contentScore += min(floor(strlen($innerText) / 100), 3);
+
+			/* Add the score to the parent. The grandparent gets half. */
+			$parentNode->getAttributeNode('readability')->value += $contentScore;
+
+			if ($grandParentNode) {
+				$grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
+			}
+		}
+
+		/**
+		* After we've calculated scores, loop through all of the possible candidate nodes we found
+		* and find the one with the highest score.
+		**/
+		$topCandidate = null;
+		for ($c=0, $cl=count($candidates); $c < $cl; $c++)
+		{
+			/**
+			* Scale the final candidates score based on link density. Good content should have a
+			* relatively small link density (5% or less) and be mostly unaffected by this operation.
+			**/
+			$readability = $candidates[$c]->getAttributeNode('readability');
+			$readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
+
+			$this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
+			// Compare as floats: scores are fractional, and an (int) cast would let a lower score win.
+			if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
+				$topCandidate = $candidates[$c];
+			}
+		}
+
+		/**
+		* If we still have no top candidate, just use the body as a last resort.
+		* We also have to copy the body node so it is something we can modify.
+		**/
+		if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
+		{
+			$topCandidate = $this->dom->createElement('div');
+			if ($page instanceof DOMDocument) {
+				if (!isset($page->documentElement)) {
+					// we don't have a body either? what a mess! :)
+				} else {
+					$topCandidate->innerHTML = $page->documentElement->innerHTML;
+					$page->documentElement->innerHTML = '';
+					$page->documentElement->appendChild($topCandidate);
+				}
+			} else {
+				$topCandidate->innerHTML = $page->innerHTML;
+				$page->innerHTML = '';
+				$page->appendChild($topCandidate);
+			}
+			$this->initializeNode($topCandidate);
+		}
+
+		/**
+		* Now that we have the top candidate, look through its siblings for content that might also be related.
+		* Things like preambles, content split by ads that we removed, etc.
+		**/
+		$articleContent        = $this->dom->createElement('div');
+		$articleContent->setAttribute('id', 'readability-content');
+		$siblingScoreThreshold = max(10, ((float)$topCandidate->getAttribute('readability')) * 0.2);
+		$siblingNodes          = $topCandidate->parentNode ? $topCandidate->parentNode->childNodes : null;
+		if (!isset($siblingNodes)) {
+			$siblingNodes = new stdClass;
+			$siblingNodes->length = 0;
+		}
+
+		for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
+		{
+			$siblingNode = $siblingNodes->item($s);
+			$append      = false;
+
+			$this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+
+			// Stored scores are read back as floats below so their fractional part is kept.
+
+			if ($siblingNode === $topCandidate)
+			// or if ($siblingNode->isSameNode($topCandidate))
+			{
+				$append = true;
+			}
+
+			$contentBonus = 0;
+			/* Give a bonus if sibling nodes and top candidates have the exact same classname */
+			if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
+				$contentBonus += ((float)$topCandidate->getAttribute('readability')) * 0.2;
+			}
+
+			if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((float)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
+			{
+				$append = true;
+			}
+
+			if (strtoupper($siblingNode->nodeName) == 'P') {
+				$linkDensity = $this->getLinkDensity($siblingNode);
+				$nodeContent = $this->getInnerText($siblingNode);
+				$nodeLength  = strlen($nodeContent);
+
+				if ($nodeLength > 80 && $linkDensity < 0.25)
+				{
+					$append = true;
+				}
+				else if ($nodeLength < 80 && $linkDensity == 0 && preg_match('/\.( |$)/', $nodeContent))
+				{
+					$append = true;
+				}
+			}
+
+			if ($append)
+			{
+				$this->dbg('Appending node: ' . $siblingNode->nodeName);
+
+				$nodeToAppend = null;
+				$sibNodeName = strtoupper($siblingNode->nodeName);
+				if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
+					/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
+
+					$this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
+					$nodeToAppend = $this->dom->createElement('div');
+					try {
+						$nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
+						$nodeToAppend->innerHTML = $siblingNode->innerHTML;
+					}
+					catch(Exception $e)
+					{
+						$this->dbg('Could not alter siblingNode to div, reverting back to original.');
+						$nodeToAppend = $siblingNode;
+						$s--;
+						$sl--;
+					}
+				} else {
+					$nodeToAppend = $siblingNode;
+					$s--;
+					$sl--;
+				}
+
+				/* To ensure a node does not interfere with readability styles, remove its classnames */
+				$nodeToAppend->removeAttribute('class');
+
+				/* Append sibling and subtract from our list because it removes the node when you append to another node */
+				$articleContent->appendChild($nodeToAppend);
+			}
+		}
+
+		/**
+		* So we have all of the content that we need. Now we clean it up for presentation.
+		**/
+		$this->prepArticle($articleContent);
+
+		/**
+		* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+		* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+		* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+		* finding the -right- content.
+		**/
+		if (strlen($this->getInnerText($articleContent, false)) < 250)
+		{
+			// TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+			// in the meantime, we check and create an empty element if it's not there.
+			if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
+			$this->body->innerHTML = $this->bodyCache;
+
+			if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+				$this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+				return $this->grabArticle($this->body);
+			}
+			else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+				$this->removeFlag(self::FLAG_WEIGHT_CLASSES);
+				return $this->grabArticle($this->body);
+			}
+			else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+				$this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+				return $this->grabArticle($this->body);
+			}
+			else {
+				return false; // all three flags are already off: nothing left to relax
+			}
+		}
+		return $articleContent;
+	}
+	
+	/**
+	* Remove script tags from document
+	*
+	* @param DOMElement
+	* @return void
+	*/
+	public function removeScripts($doc) {
+		// Walk the <script> list from its last entry to its first, removing each node on the way.
+		$scriptNodes = $doc->getElementsByTagName('script');
+		for ($pos = $scriptNodes->length; $pos-- > 0; ) {
+			$scriptNodes->item($pos)->parentNode->removeChild($scriptNodes->item($pos));
+		}
+	}
+	
+	/**
+	* Get the inner text of a node.
+	* This also strips out any excess whitespace to be found.
+	*
+	* @param DOMElement $
+	* @param boolean $normalizeSpaces (default: true)
+	* @return string
+	**/
+	public function getInnerText($e, $normalizeSpaces=true) {
+		// Values that expose no textContent, or an empty one, yield ''.
+		$hasText = isset($e->textContent) && $e->textContent != '';
+		if (!$hasText) {
+			return '';
+		}
+
+		// Outer whitespace is always trimmed ...
+		$trimmed = trim($e->textContent);
+		if (!$normalizeSpaces) {
+			return $trimmed;
+		}
+		// ... and, on request, whatever the 'normalize' pattern matches becomes a single space.
+		return preg_replace($this->regexps['normalize'], ' ', $trimmed);
+	}
+
+	/**
+	* Get the number of times a string $s appears in the node $e.
+	*
+	* @param DOMElement $e
+	* @param string - what to count. Default is ","
+	* @return number (integer)
+	**/
+	public function getCharCount($e, $s=',') {
+		return substr_count($this->getInnerText($e, true), $s); // counted on the trimmed, space-normalized text
+	}
+
+	/**
+	* Remove the style attribute on every $e and under.
+	*
+	* @param DOMElement $e
+	* @return void
+	*/
+	public function cleanStyles($e) {
+		if (is_object($e)) { // anything that is not an object is silently ignored
+			foreach ($e->getElementsByTagName('*') as $descendant) { // the elements found below $e
+				$descendant->removeAttribute('style');
+			}
+		}
+	}
+	
+	/**
+	* Get the density of links as a percentage of the content
+	* This is the amount of text that is inside a link divided by the total text in the node.
+	* 
+	* @param DOMElement $e
+	* @return number (float)
+	*/
+	public function getLinkDensity($e) {
+		// Byte length of the node's whole (normalized) text ...
+		$totalBytes = strlen($this->getInnerText($e));
+
+		// ... and of the text found inside each <a> below it.
+		$anchorBytes = 0;
+		foreach ($e->getElementsByTagName('a') as $anchor) {
+			$anchorBytes += strlen($this->getInnerText($anchor));
+		}
+
+		// A node without any text has density 0; this also avoids dividing by zero.
+		if ($totalBytes <= 0) return 0;
+		return $anchorBytes / $totalBytes;
+	}
+	
+	/**
+	* Get an elements class/id weight. Uses regular expressions to tell if this 
+	* element looks good or bad.
+	*
+	* @param DOMElement $e
+	* @return number (Integer)
+	*/
+	public function getClassWeight($e) {
+		// With class weighting switched off every element is neutral.
+		if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+			return 0;
+		}
+
+		// The class and id attributes are judged separately and in the same way: a match on
+		// the 'negative' pattern costs 25 points, a match on the 'positive' pattern earns 25.
+		$total = 0;
+		foreach (array('class', 'id') as $attrName) {
+			// Missing or empty attributes contribute nothing.
+			if (!$e->hasAttribute($attrName)) {
+				continue;
+			}
+			$attrValue = $e->getAttribute($attrName);
+			if ($attrValue == '') {
+				continue;
+			}
+
+			if (preg_match($this->regexps['negative'], $attrValue)) {
+				$total -= 25;
+			}
+			if (preg_match($this->regexps['positive'], $attrValue)) {
+				$total += 25;
+			}
+		}
+
+		// Both patterns may match the same value, in which case the two adjustments cancel out.
+		return $total;
+	}
+
+	/**
+	* Remove extraneous break tags from a node.
+	*
+	* @param DOMElement $node
+	* @return void
+	*/
+	public function killBreaks($node) {
+		// Whatever the 'killBreaks' pattern matches in the node's markup is replaced by
+		// one '<br />'; the result is written straight back into the node.
+		$node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
+	}
+
+	/**
+	* Clean a node of all elements of type "tag".
+	* (Unless it's a youtube/vimeo video. People love movies.)
+	*
+	* Updated 2012-09-18 to preserve youtube/vimeo iframes
+	*
+	* @param DOMElement $e
+	* @param string $tag
+	* @return void
+	*/
+	public function clean($e, $tag) {
+		// Only these three tags get the video exemption below.
+		$isEmbed = in_array($tag, array('iframe', 'object', 'embed'));
+		$matches = $e->getElementsByTagName($tag);
+
+		// Last to first, removing nodes on the way.
+		for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
+			$target = $matches->item($pos);
+			if ($isEmbed) {
+				// Join all attribute values with '|' and test them against the 'video' pattern ...
+				$joined = '';
+				foreach ($target->attributes as $attribute) {
+					$joined .= $attribute->value . '|';
+				}
+				if (preg_match($this->regexps['video'], $joined)) {
+					continue;
+				}
+				// ... then, failing that, the element's own inner markup.
+				if (preg_match($this->regexps['video'], $target->innerHTML)) {
+					continue;
+				}
+			}
+			$target->parentNode->removeChild($target);
+		}
+	}
+	
+	/**
+	* Clean an element of all tags of type "tag" if they look fishy.
+	* "Fishy" is an algorithm based on content length, classnames, 
+	* link density, number of images & embeds, etc.
+	*
+	* @param DOMElement $e
+	* @param string $tag
+	* @return void
+	*/
+	public function cleanConditionally($e, $tag) {
+		// Nothing is removed unless conditional cleaning is switched on.
+		if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+			return;
+		}
+
+		/**
+		* Every <$tag> below $e is judged on its class weight, its stored content score and the
+		* kinds of elements embedded in it. The list is walked backwards so that elements can be
+		* removed during the walk without affecting it.
+		*
+		* TODO: Consider taking into account original contentScore here.
+		*/
+		$matches = $e->getElementsByTagName($tag);
+		for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
+			$node         = $matches->item($pos);
+			$hasScore     = $node->hasAttribute('readability');
+			$weight       = $this->getClassWeight($node);
+			$contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
+
+			$this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));
+
+			if ($weight + $contentScore < 0) {
+				$node->parentNode->removeChild($node);
+				continue;
+			}
+			if ($this->getCharCount($node, ',') >= 10) {
+				continue;
+			}
+
+			/**
+			* Fewer than 10 commas: remove the element when non-paragraph elements outnumber
+			* the paragraphs or another of the ominous signs below applies.
+			**/
+			$p     = $node->getElementsByTagName('p')->length;
+			$img   = $node->getElementsByTagName('img')->length;
+			$li    = $node->getElementsByTagName('li')->length-100;
+			$input = $node->getElementsByTagName('input')->length;
+			$a     = $node->getElementsByTagName('a')->length;
+
+			// Embeds and iframes are only counted when their src matches the 'video' pattern.
+			$embedCount = 0;
+			foreach (array('embed', 'iframe') as $embedTag) {
+				foreach ($node->getElementsByTagName($embedTag) as $embed) {
+					if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
+						$embedCount++;
+					}
+				}
+			}
+
+			$linkDensity   = $this->getLinkDensity($node);
+			$contentLength = strlen($this->getInnerText($node));
+			$toRemove      = false;
+
+			if ($this->lightClean) {
+				$this->dbg('Light clean...');
+				if ( ($img > $p) && ($img > 4) ) {
+					$this->dbg(' more than 4 images and more image elements than paragraph elements');
+					$toRemove = true;
+				} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+					$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+					$toRemove = true;
+				} else if ( $input > floor($p/3) ) {
+					$this->dbg(' too many <input> elements');
+					$toRemove = true;
+				} else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+					$this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images');
+					$toRemove = true;
+				} else if($weight < 25 && $linkDensity > 0.2) {
+					$this->dbg(' weight smaller than 25 and link density above 0.2');
+					$toRemove = true;
+				} else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+					$this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
+					$toRemove = true;
+				} else if($embedCount > 3) {
+					$this->dbg(' more than 3 embeds');
+					$toRemove = true;
+				}
+			} else {
+				$this->dbg('Standard clean...');
+				if ( $img > $p ) {
+					$this->dbg(' more image elements than paragraph elements');
+					$toRemove = true;
+				} else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+					$this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+					$toRemove = true;
+				} else if ( $input > floor($p/3) ) {
+					$this->dbg(' too many <input> elements');
+					$toRemove = true;
+				} else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
+					$this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
+					$toRemove = true;
+				} else if($weight < 25 && $linkDensity > 0.2) {
+					$this->dbg(' weight smaller than 25 and link density above 0.2');
+					$toRemove = true;
+				} else if($weight >= 25 && $linkDensity > 0.5) {
+					$this->dbg(' weight above 25 but link density greater than 0.5');
+					$toRemove = true;
+				} else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+					$this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
+					$toRemove = true;
+				}
+			}
+
+			if ($toRemove) {
+				$node->parentNode->removeChild($node);
+			}
+		}
+	}
+
+	/**
+	* Clean out spurious headers from an Element. Checks things like classnames and link density.
+	*
+	* @param DOMElement $e
+	* @return void
+	*/
+	public function cleanHeaders($e) {
+		foreach (array('h1', 'h2') as $headingTag) { // h3 and below are left alone
+			$headings = $e->getElementsByTagName($headingTag);
+			for ($pos = $headings->length - 1; $pos >= 0; $pos--) {
+				$heading  = $headings->item($pos);
+				$spurious = $this->getClassWeight($heading) < 0 || $this->getLinkDensity($heading) > 0.33;
+				if ($spurious) $heading->parentNode->removeChild($heading);
+			}
+		}
+	}
+
+	public function flagIsActive($flag) {
+		return 0 < ($this->flags & $flag); // true when the bitwise AND of the two is positive
+	}
+	
+	public function addFlag($flag) {
+		$this->flags |= $flag; // switch the given bit(s) on, leaving the others untouched
+	}
+	
+	public function removeFlag($flag) {
+		$this->flags &= ~$flag; // switch the given bit(s) off, leaving the others untouched
+	}
+}
 ?>
\ No newline at end of file
-- 
cgit v1.2.3


From d18ff7d9565f982bc15c5930123992d44614e1e2 Mon Sep 17 00:00:00 2001
From: Maryana Rozhankivska <mariroz@mr.lviv.ua>
Date: Fri, 23 May 2014 19:25:48 +0300
Subject: two small unimportant forgotten changes to 3.2 version of
 full-text-rss, issue #694

---
 inc/3rdparty/libraries/language-detect/Parser.php | 354 ----------------------
 1 file changed, 354 deletions(-)
 delete mode 100644 inc/3rdparty/libraries/language-detect/Parser.php

(limited to 'inc/3rdparty/libraries')

diff --git a/inc/3rdparty/libraries/language-detect/Parser.php b/inc/3rdparty/libraries/language-detect/Parser.php
deleted file mode 100644
index 7f15fa98..00000000
--- a/inc/3rdparty/libraries/language-detect/Parser.php
+++ /dev/null
@@ -1,354 +0,0 @@
-<?php
-
-/**
- * This class represents a text sample to be parsed.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
- * @link        http://pear.php.net/package/Text_LanguageDetect/
- * @link        http://langdetect.blogspot.com/
- */
-
-/**
- * This class represents a text sample to be parsed.
- *
- * This separates the analysis of a text sample from the primary LanguageDetect
- * class. After a new profile has been built, the data can be retrieved using
- * the accessor functions.
- *
- * This class is intended to be used by the Text_LanguageDetect class, not 
- * end-users.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     release: 0.2.3
- */
-class Text_LanguageDetect_Parser extends Text_LanguageDetect
-{
-    /**
-     * the piece of text being parsed
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_string;
-
-    /**
-     * stores the trigram frequencies of the sample
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_trigrams = array();
-
-    /**
-     * stores the trigram ranks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_trigram_ranks = array();
-
-    /**
-     * stores the unicode blocks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_unicode_blocks = array();
-    
-    /**
-     * Whether the parser should compile the unicode ranges
-     * 
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_unicode = false;
-
-    /**
-     * Whether the parser should compile trigrams
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_trigram = false;
-
-    /**
-     * Whether the trigram parser should pad the beginning of the string
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_trigram_pad_start = false;
-
-    /**
-     * Whether the unicode parser should skip non-alphabetical ascii chars
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_unicode_skip_symbols = true;
-
-    /**
-     * Constructor
-     *
-     * @access  private
-     * @param   string  $string     string to be parsed
-     */
-    function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) {
-		if (isset($db)) $this->_db_filename = $db;
-		if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;	
-        $this->_string = $string;
-    }
-
-    /**
-     * Returns true if a string is suitable for parsing
-     *
-     * @static
-     * @access  public
-     * @param   string  $str    input string to test
-     * @return  bool            true if acceptable, false if not
-     */
-    function validateString($str) {
-        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    /**
-     * turn on/off trigram counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareTrigram($bool = true)
-    {
-        $this->_compile_trigram = $bool;
-    }
-
-    /**
-     * turn on/off unicode block counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareUnicode($bool = true)
-    {
-        $this->_compile_unicode = $bool;
-    }
-
-    /**
-     * turn on/off padding the beginning of the sample string
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setPadStart($bool = true)
-    {
-        $this->_trigram_pad_start = $bool;
-    }
-
-    /**
-     * Should the unicode block counter skip non-alphabetical ascii chars?
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setUnicodeSkipSymbols($bool = true)
-    {
-        $this->_unicode_skip_symbols = $bool;
-    }
-
-    /**
-     * Returns the trigram ranks for the text sample
-     *
-     * @access  public
-     * @return  array    trigram ranks in the text sample
-     */
-    function &getTrigramRanks()
-    {
-        return $this->_trigram_ranks;
-    }
-
-    /**
-     * Return the trigram freqency table
-     *
-     * only used in testing to make sure the parser is working
-     *
-     * @access  public
-     * @return  array    trigram freqencies in the text sample
-     */
-    function &getTrigramFreqs()
-    {
-        return $this->_trigram;
-    }
-
-    /**
-     * returns the array of unicode blocks
-     *
-     * @access  public
-     * @return  array   unicode blocks in the text sample
-     */
-    function &getUnicodeBlocks()
-    {
-        return $this->_unicode_blocks;
-    }
-
-    /**
-     * Executes the parsing operation
-     * 
-     * Be sure to call the set*() functions to set options and the 
-     * prepare*() functions first to tell it what kind of data to compute
-     *
-     * Afterwards the get*() functions can be used to access the compiled
-     * information.
-     *
-     * @access public
-     */
-    function analyze()
-    {
-        $len = strlen($this->_string);
-        $byte_counter = 0;
-
-
-        // unicode startup
-        if ($this->_compile_unicode) {
-            $blocks =& $this->_read_unicode_block_db();
-
-            $block_count = count($blocks);
-
-            $skipped_count = 0;
-            $unicode_chars = array();
-        }
-
-        // trigram startup
-        if ($this->_compile_trigram) {
-            // initialize them as blank so the parser will skip the first two
-            // (since it skips trigrams with more than  2 contiguous spaces)
-            $a = ' ';
-            $b = ' ';
-
-            // kludge
-            // if it finds a valid trigram to start and the start pad option is
-            // off, then set a variable that will be used to reduce this
-            // trigram after parsing has finished
-            if (!$this->_trigram_pad_start) {
-                $a = $this->_next_char($this->_string, $byte_counter, true);
-
-                if ($a != ' ') {
-                    $b = $this->_next_char($this->_string, $byte_counter, true);
-                    $dropone = " $a$b";
-                }
-
-                $byte_counter = 0;
-                $a = ' ';
-                $b = ' ';
-            }
-        }
-
-        while ($byte_counter < $len) {
-            $char = $this->_next_char($this->_string, $byte_counter, true);
-
-
-            // language trigram detection
-            if ($this->_compile_trigram) {
-                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
-                    if (!isset($this->_trigram[$a . $b . $char])) {
-                       $this->_trigram[$a . $b . $char] = 1;
-                    } else {
-                       $this->_trigram[$a . $b . $char]++;
-                    }
-                }
-
-                $a = $b;
-                $b = $char;
-            }
-
-            // unicode block detection
-            if ($this->_compile_unicode) {
-                if ($this->_unicode_skip_symbols
-                        && strlen($char) == 1
-                        && ($char < 'A' || $char > 'z'
-                        || ($char > 'Z' && $char < 'a'))
-                        && $char != "'") {  // does not skip the apostrophe
-                                            // since it's included in the language
-                                            // models
-
-                    $skipped_count++;
-                    continue;
-                }
-
-                // build an array of all the characters
-                if (isset($unicode_chars[$char])) {
-                    $unicode_chars[$char]++;
-                } else {
-                    $unicode_chars[$char] = 1;
-                }
-            }
-
-            // todo: add byte detection here
-        }
-
-        // unicode cleanup
-        if ($this->_compile_unicode) {
-            foreach ($unicode_chars as $utf8_char => $count) {
-                $search_result = $this->_unicode_block_name(
-                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
-
-                if ($search_result != -1) {
-                    $block_name = $search_result[2];
-                } else {
-                    $block_name = '[Malformatted]';
-                }
-
-                if (isset($this->_unicode_blocks[$block_name])) {
-                    $this->_unicode_blocks[$block_name] += $count;
-                } else {
-                    $this->_unicode_blocks[$block_name] = $count;
-                }
-            }
-        }
-
-
-        // trigram cleanup
-        if ($this->_compile_trigram) {
-            // pad the end
-            if ($b != ' ') {
-                if (!isset($this->_trigram["$a$b "])) {
-                    $this->_trigram["$a$b "] = 1;
-                } else {
-                    $this->_trigram["$a$b "]++;
-                }
-            }
-
-            // perl compatibility; Language::Guess does not pad the beginning
-            // kludge
-            if (isset($dropone)) {
-                if ($this->_trigram[$dropone] == 1) {
-                    unset($this->_trigram[$dropone]);
-                } else {
-                    $this->_trigram[$dropone]--;
-                }
-            }
-
-            if (!empty($this->_trigram)) {
-                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
-            } else {
-                $this->_trigram_ranks = array();
-            }
-        }
-    }
-}
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
-
-?>
-- 
cgit v1.2.3


From a50583fb97615f4c26cc84ee95d62f867a84b4e6 Mon Sep 17 00:00:00 2001
From: Maryana Rozhankivska <mariroz@mr.lviv.ua>
Date: Fri, 23 May 2014 19:27:17 +0300
Subject: last 3 important changes to 3.2 version of full-text-rss, issue #694

---
 .../language-detect/LanguageDetect/Exception.php   |  57 ++++
 .../language-detect/LanguageDetect/ISO639.php      | 339 ++++++++++++++++++++
 .../language-detect/LanguageDetect/Parser.php      | 347 +++++++++++++++++++++
 3 files changed, 743 insertions(+)
 create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
 create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
 create mode 100644 inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php

(limited to 'inc/3rdparty/libraries')

diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
new file mode 100644
index 00000000..196d994f
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
+<?php
+/**
+ * Exception class of the Text_LanguageDetect package. Its integer constants
+ * name the individual error conditions, grouped by tens (10s, 20s, 30, 40, 50).
+ */
+class Text_LanguageDetect_Exception extends Exception
+{
+    /**
+     * The database file does not exist
+     */
+    const DB_NOT_FOUND = 10;
+
+    /**
+     * The database file exists but cannot be read
+     */
+    const DB_NOT_READABLE = 11;
+
+    /**
+     * The database file has no content
+     */
+    const DB_EMPTY = 12;
+
+    /**
+     * The database content is not a PHP array
+     */
+    const DB_NOT_ARRAY = 13;
+
+    /**
+     * Magic quotes are switched on
+     */
+    const MAGIC_QUOTES = 14;
+
+    /**
+     * A method received a parameter of an invalid type
+     */
+    const PARAM_TYPE = 20;
+
+    /**
+     * A parameter contains an invalid character
+     */
+    const INVALID_CHAR = 21;
+
+    /**
+     * The language is missing from the database
+     */
+    const UNKNOWN_LANGUAGE = 30;
+
+    /**
+     * Block detection failed
+     */
+    const BLOCK_DETECTION = 40;
+
+    /**
+     * Clustering of languages failed
+     */
+    const NO_HIGHEST_KEY = 50;
+}
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
new file mode 100644
index 00000000..05b0590d
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/ISO639.php
@@ -0,0 +1,339 @@
+<?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   SVN: $Id$
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ */
+
+/**
+ * Provides a mapping between the languages from lang.dat and the
+ * ISO 639-1 and ISO-639-2 codes.
+ *
+ * Note that this class contains only languages that exist in lang.dat.
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
+ */
+class Text_LanguageDetect_ISO639
+{
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-1 2-letter language code. Keys are lowercase names.
+     *
+     * NULL indicates that there is no 2-letter code.
+     *
+     * @var array
+     */
+    public static $nameToCode2 = array(
+        'albanian'   => 'sq',
+        'arabic'     => 'ar',
+        'azeri'      => 'az',
+        'bengali'    => 'bn',
+        'bulgarian'  => 'bg',
+        'cebuano'    => null,
+        'croatian'   => 'hr',
+        'czech'      => 'cs',
+        'danish'     => 'da',
+        'dutch'      => 'nl',
+        'english'    => 'en',
+        'estonian'   => 'et',
+        'farsi'      => 'fa',
+        'finnish'    => 'fi',
+        'french'     => 'fr',
+        'german'     => 'de',
+        'hausa'      => 'ha',
+        'hawaiian'   => null,
+        'hindi'      => 'hi',
+        'hungarian'  => 'hu',
+        'icelandic'  => 'is',
+        'indonesian' => 'id',
+        'italian'    => 'it',
+        'kazakh'     => 'kk',
+        'kyrgyz'     => 'ky',
+        'latin'      => 'la',
+        'latvian'    => 'lv',
+        'lithuanian' => 'lt',
+        'macedonian' => 'mk',
+        'mongolian'  => 'mn',
+        'nepali'     => 'ne',
+        'norwegian'  => 'no',
+        'pashto'     => 'ps',
+        'pidgin'     => null,
+        'polish'     => 'pl',
+        'portuguese' => 'pt',
+        'romanian'   => 'ro',
+        'russian'    => 'ru',
+        'serbian'    => 'sr',
+        'slovak'     => 'sk',
+        'slovene'    => 'sl',
+        'somali'     => 'so',
+        'spanish'    => 'es',
+        'swahili'    => 'sw',
+        'swedish'    => 'sv',
+        'tagalog'    => 'tl',
+        'turkish'    => 'tr',
+        'ukrainian'  => 'uk',
+        'urdu'       => 'ur',
+        'uzbek'      => 'uz',
+        'vietnamese' => 'vi',
+        'welsh'      => 'cy',
+    );
+
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-2 3-letter language code. Keys are lowercase names.
+     *
+     * @var array
+     */
+    public static $nameToCode3 = array(
+        'albanian'   => 'sqi',
+        'arabic'     => 'ara',
+        'azeri'      => 'aze',
+        'bengali'    => 'ben',
+        'bulgarian'  => 'bul',
+        'cebuano'    => 'ceb',
+        'croatian'   => 'hrv',
+        'czech'      => 'ces',
+        'danish'     => 'dan',
+        'dutch'      => 'nld',
+        'english'    => 'eng',
+        'estonian'   => 'est',
+        'farsi'      => 'fas',
+        'finnish'    => 'fin',
+        'french'     => 'fra',
+        'german'     => 'deu',
+        'hausa'      => 'hau',
+        'hawaiian'   => 'haw',
+        'hindi'      => 'hin',
+        'hungarian'  => 'hun',
+        'icelandic'  => 'isl',
+        'indonesian' => 'ind',
+        'italian'    => 'ita',
+        'kazakh'     => 'kaz',
+        'kyrgyz'     => 'kir',
+        'latin'      => 'lat',
+        'latvian'    => 'lav',
+        'lithuanian' => 'lit',
+        'macedonian' => 'mkd',
+        'mongolian'  => 'mon',
+        'nepali'     => 'nep',
+        'norwegian'  => 'nor',
+        'pashto'     => 'pus',
+        'pidgin'     => 'crp',
+        'polish'     => 'pol',
+        'portuguese' => 'por',
+        'romanian'   => 'ron',
+        'russian'    => 'rus',
+        'serbian'    => 'srp',
+        'slovak'     => 'slk',
+        'slovene'    => 'slv',
+        'somali'     => 'som',
+        'spanish'    => 'spa',
+        'swahili'    => 'swa',
+        'swedish'    => 'swe',
+        'tagalog'    => 'tgl',
+        'turkish'    => 'tur',
+        'ukrainian'  => 'ukr',
+        'urdu'       => 'urd',
+        'uzbek'      => 'uzb',
+        'vietnamese' => 'vie',
+        'welsh'      => 'cym',
+    );
+
+    /**
+     * Maps ISO 639-1 2-letter language codes to the language names
+     * in the language database. Keys are lowercase codes.
+     *
+     * Not all languages have a 2 letter code, so some are missing
+     *
+     * @var array
+     */
+    public static $code2ToName = array(
+        'ar' => 'arabic',
+        'az' => 'azeri',
+        'bg' => 'bulgarian',
+        'bn' => 'bengali',
+        'cs' => 'czech',
+        'cy' => 'welsh',
+        'da' => 'danish',
+        'de' => 'german',
+        'en' => 'english',
+        'es' => 'spanish',
+        'et' => 'estonian',
+        'fa' => 'farsi',
+        'fi' => 'finnish',
+        'fr' => 'french',
+        'ha' => 'hausa',
+        'hi' => 'hindi',
+        'hr' => 'croatian',
+        'hu' => 'hungarian',
+        'id' => 'indonesian',
+        'is' => 'icelandic',
+        'it' => 'italian',
+        'kk' => 'kazakh',
+        'ky' => 'kyrgyz',
+        'la' => 'latin',
+        'lt' => 'lithuanian',
+        'lv' => 'latvian',
+        'mk' => 'macedonian',
+        'mn' => 'mongolian',
+        'ne' => 'nepali',
+        'nl' => 'dutch',
+        'no' => 'norwegian',
+        'pl' => 'polish',
+        'ps' => 'pashto',
+        'pt' => 'portuguese',
+        'ro' => 'romanian',
+        'ru' => 'russian',
+        'sk' => 'slovak',
+        'sl' => 'slovene',
+        'so' => 'somali',
+        'sq' => 'albanian',
+        'sr' => 'serbian',
+        'sv' => 'swedish',
+        'sw' => 'swahili',
+        'tl' => 'tagalog',
+        'tr' => 'turkish',
+        'uk' => 'ukrainian',
+        'ur' => 'urdu',
+        'uz' => 'uzbek',
+        'vi' => 'vietnamese',
+    );
+
+    /**
+     * Maps ISO 639-2 3-letter language codes to the language names
+     * in the language database; the exact inverse of $nameToCode3.
+     *
+     * @var array
+     */
+    public static $code3ToName = array(
+        'ara' => 'arabic',
+        'aze' => 'azeri',
+        'ben' => 'bengali',
+        'bul' => 'bulgarian',
+        'ceb' => 'cebuano',
+        'ces' => 'czech',
+        'crp' => 'pidgin',
+        'cym' => 'welsh',
+        'dan' => 'danish',
+        'deu' => 'german',
+        'eng' => 'english',
+        'est' => 'estonian',
+        'fas' => 'farsi',
+        'fin' => 'finnish',
+        'fra' => 'french',
+        'hau' => 'hausa',
+        'haw' => 'hawaiian',
+        'hin' => 'hindi',
+        'hrv' => 'croatian',
+        'hun' => 'hungarian',
+        'ind' => 'indonesian',
+        'isl' => 'icelandic',
+        'ita' => 'italian',
+        'kaz' => 'kazakh',
+        'kir' => 'kyrgyz',
+        'lat' => 'latin',
+        'lav' => 'latvian',
+        'lit' => 'lithuanian',
+        'mkd' => 'macedonian',
+        'mon' => 'mongolian',
+        'nep' => 'nepali',
+        'nld' => 'dutch',
+        'nor' => 'norwegian',
+        'pol' => 'polish',
+        'por' => 'portuguese',
+        'pus' => 'pashto',
+        'ron' => 'romanian',
+        'rus' => 'russian',
+        'slk' => 'slovak',
+        'slv' => 'slovene',
+        'som' => 'somali',
+        'spa' => 'spanish',
+        'sqi' => 'albanian',
+        'srp' => 'serbian',
+        'swa' => 'swahili',
+        'swe' => 'swedish',
+        'tgl' => 'tagalog',
+        'tur' => 'turkish',
+        'ukr' => 'ukrainian',
+        'urd' => 'urdu',
+        'uzb' => 'uzbek',
+        'vie' => 'vietnamese',
+    );
+
+    /**
+     * Returns the 2-letter ISO 639-1 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish", any letter case
+     *
+     * @return string Two-letter language code (e.g. "sv") or NULL if not found
+     */
+    public static function nameToCode2($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode2[$lang])) {
+            return null;
+        }
+        return self::$nameToCode2[$lang];
+    }
+
+    /**
+     * Returns the 3-letter ISO 639-2 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish", any letter case
+     *
+     * @return string Three-letter language code (e.g. "swe") or NULL if not found
+     */
+    public static function nameToCode3($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode3[$lang])) {
+            return null;
+        }
+        return self::$nameToCode3[$lang];
+    }
+
+    /**
+     * Returns the language name for the given 2-letter ISO 639-1 code.
+     *
+     * @param string $code Two-letter language code (e.g. "sv"), any letter case
+     *
+     * @return string English language name like "swedish" or NULL if not found
+     */
+    public static function code2ToName($code)
+    {
+        $code = strtolower($code); // lookup-table keys are lowercase
+        if (!isset(self::$code2ToName[$code])) {
+            return null;
+        }
+        return self::$code2ToName[$code];
+    }
+
+    /**
+     * Returns the language name for the given 3-letter ISO 639-2 code.
+     *
+     * @param string $code Three-letter language code (e.g. "swe"), any letter case
+     *
+     * @return string English language name like "swedish" or NULL if not found
+     */
+    public static function code3ToName($code)
+    {
+        $code = strtolower($code); // lookup-table keys are lowercase
+        if (!isset(self::$code3ToName[$code])) {
+            return null;
+        }
+        return self::$code3ToName[$code];
+    }
+}
\ No newline at end of file
diff --git a/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
new file mode 100644
index 00000000..fb0e1e20
--- /dev/null
+++ b/inc/3rdparty/libraries/language-detect/LanguageDetect/Parser.php
@@ -0,0 +1,347 @@
+<?php
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
+ * @link        http://pear.php.net/package/Text_LanguageDetect/
+ * @link        http://langdetect.blogspot.com/
+ */
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not 
+ * end-users.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     release: 0.3.0
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+    /**
+     * the piece of text being parsed
+     *
+     * @access  private
+     * @var     string
+     */
+    public $_string;
+
+    /**
+     * stores the trigram frequencies of the sample (trigram => count)
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_trigram = array();
+
+    /**
+     * stores the trigram ranks of the sample
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_trigram_ranks = array();
+
+    /**
+     * stores the unicode blocks of the sample (block name => char count)
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_unicode_blocks = array();
+
+    /**
+     * Whether the parser should compile the unicode ranges
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_compile_unicode = false;
+
+    /**
+     * Whether the parser should compile trigrams
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_compile_trigram = false;
+
+    /**
+     * Whether the trigram parser should pad the beginning of the string
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_trigram_pad_start = false;
+
+    /**
+     * Whether the unicode parser should skip non-alphabetical ascii chars
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_unicode_skip_symbols = true;
+
+    /**
+     * Constructor; stores the text sample for a later analyze() call.
+     * Named __construct: same-name (PHP 4 style) constructors are gone in PHP 8.
+     * @access  private
+     * @param   string  $string     string to be parsed
+     */
+    public function __construct($string) {
+        $this->_string = $string;
+    }
+
+    /**
+     * Returns true if a string is suitable for parsing: it must be longer
+     * than 3 bytes and contain at least one non-whitespace character
+     * @param   string  $str    input string to test
+     * @return  bool            true if acceptable, false if not
+     */
+    public static function validateString($str) {
+        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * turn on/off trigram counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function prepareTrigram($bool = true)
+    {
+        $this->_compile_trigram = $bool;
+    }
+
+    /**
+     * turn on/off unicode block counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function prepareUnicode($bool = true)
+    {
+        $this->_compile_unicode = $bool;
+    }
+
+    /**
+     * turn on/off padding the beginning of the sample string
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function setPadStart($bool = true)
+    {
+        $this->_trigram_pad_start = $bool;
+    }
+
+    /**
+     * Should the unicode block counter skip non-alphabetical ascii chars?
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function setUnicodeSkipSymbols($bool = true)
+    {
+        $this->_unicode_skip_symbols = $bool;
+    }
+
+    /**
+     * Returns (by reference) the trigram ranks for the text sample
+     *
+     * @access  public
+     * @return  array    trigram ranks in the text sample
+     */
+    public function &getTrigramRanks()
+    {
+        return $this->_trigram_ranks;
+    }
+
+    /**
+     * Return (by reference) the trigram frequency table
+     *
+     * only used in testing to make sure the parser is working
+     *
+     * @access  public
+     * @return  array    trigram frequencies in the text sample
+     */
+    public function &getTrigramFreqs()
+    {
+        return $this->_trigram;
+    }
+
+    /**
+     * returns (by reference) the array of unicode blocks
+     *
+     * @access  public
+     * @return  array   unicode blocks in the text sample
+     */
+    public function &getUnicodeBlocks()
+    {
+        return $this->_unicode_blocks;
+    }
+
+    /**
+     * Executes the parsing operation
+     *
+     * Be sure to call the set*() functions to set options and the
+     * prepare*() functions first to tell it what kind of data to compute
+     *
+     * Afterwards the get*() functions can be used to access the compiled
+     * information.
+     *
+     * @access public
+     */
+    public function analyze()
+    {
+        $len = strlen($this->_string);
+        $byte_counter = 0;
+
+
+        // unicode startup
+        if ($this->_compile_unicode) {
+            $blocks = $this->_read_unicode_block_db();
+            $block_count = count($blocks);
+
+            $skipped_count = 0;
+            $unicode_chars = array();
+        }
+
+        // trigram startup
+        if ($this->_compile_trigram) {
+            // initialize them as blank so the parser will skip the first two
+            // (since it skips trigrams with more than  2 contiguous spaces)
+            $a = ' ';
+            $b = ' ';
+
+            // kludge
+            // if it finds a valid trigram to start and the start pad option is
+            // off, then set a variable that will be used to reduce this
+            // trigram after parsing has finished
+            if (!$this->_trigram_pad_start) {
+                $a = $this->_next_char($this->_string, $byte_counter, true);
+
+                if ($a != ' ') {
+                    $b = $this->_next_char($this->_string, $byte_counter, true);
+                    $dropone = " $a$b";
+                }
+
+                $byte_counter = 0;
+                $a = ' ';
+                $b = ' ';
+            }
+        }
+
+        while ($byte_counter < $len) {
+            $char = $this->_next_char($this->_string, $byte_counter, true);
+
+
+            // language trigram detection
+            if ($this->_compile_trigram) {
+                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+                    if (!isset($this->_trigram[$a . $b . $char])) {
+                       $this->_trigram[$a . $b . $char] = 1;
+                    } else {
+                       $this->_trigram[$a . $b . $char]++;
+                    }
+                }
+
+                $a = $b;
+                $b = $char;
+            }
+
+            // unicode block detection
+            if ($this->_compile_unicode) {
+                if ($this->_unicode_skip_symbols
+                        && strlen($char) == 1
+                        && ($char < 'A' || $char > 'z'
+                        || ($char > 'Z' && $char < 'a'))
+                        && $char != "'") {  // does not skip the apostrophe
+                                            // since it's included in the language
+                                            // models
+
+                    $skipped_count++;
+                    continue;
+                }
+
+                // build an array of all the characters
+                if (isset($unicode_chars[$char])) {
+                    $unicode_chars[$char]++;
+                } else {
+                    $unicode_chars[$char] = 1;
+                }
+            }
+
+            // todo: add byte detection here
+        }
+
+        // unicode cleanup
+        if ($this->_compile_unicode) {
+            foreach ($unicode_chars as $utf8_char => $count) {
+                $search_result = $this->_unicode_block_name(
+                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+
+                if ($search_result != -1) {
+                    $block_name = $search_result[2];
+                } else {
+                    $block_name = '[Malformatted]';
+                }
+
+                if (isset($this->_unicode_blocks[$block_name])) {
+                    $this->_unicode_blocks[$block_name] += $count;
+                } else {
+                    $this->_unicode_blocks[$block_name] = $count;
+                }
+            }
+        }
+
+
+        // trigram cleanup
+        if ($this->_compile_trigram) {
+            // pad the end
+            if ($b != ' ') {
+                if (!isset($this->_trigram["$a$b "])) {
+                    $this->_trigram["$a$b "] = 1;
+                } else {
+                    $this->_trigram["$a$b "]++;
+                }
+            }
+
+            // perl compatibility; Language::Guess does not pad the beginning
+            // kludge
+            if (isset($dropone)) {
+                if ($this->_trigram[$dropone] == 1) {
+                    unset($this->_trigram[$dropone]);
+                } else {
+                    $this->_trigram[$dropone]--;
+                }
+            }
+
+            if (!empty($this->_trigram)) {
+                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+            } else {
+                $this->_trigram_ranks = array();
+            }
+        }
+    }
+}
+
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
\ No newline at end of file
-- 
cgit v1.2.3