From 3db95a85de1297908e780742cd0b7d779c5f522e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20L=C5=93uillet?=
Date: Wed, 31 Jul 2013 19:09:06 +0200
Subject: update external libs
---
inc/JSLikeHTMLElement.php | 12 +--
inc/Readability.php | 198 +++++++++++++++++++++++-----------------------
2 files changed, 105 insertions(+), 105 deletions(-)
(limited to 'inc')
diff --git a/inc/JSLikeHTMLElement.php b/inc/JSLikeHTMLElement.php
index dfcc1be5..238ba8a8 100644
--- a/inc/JSLikeHTMLElement.php
+++ b/inc/JSLikeHTMLElement.php
@@ -4,7 +4,7 @@
*
* This class extends PHP's DOMElement to allow
* users to get and set the innerHTML property of
-* HTML elements in the same way it's done in
+* HTML elements in the same way it's done in
* JavaScript.
*
* Example usage:
@@ -15,16 +15,16 @@
* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
* $doc->loadHTML('
'
* echo "\n\n";
-*
+*
* // set innerHTML
* $elem->innerHTML = 'FiveFilters.org';
* echo $elem->innerHTML; // prints 'FiveFilters.org'
* echo "\n\n";
-*
+*
* // print document (with our changes)
* echo $doc->saveXML();
* @endcode
@@ -59,7 +59,7 @@ class JSLikeHTMLElement extends DOMElement
$value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
// Using will generate a warning, but so will bad HTML
// (and by this point, bad HTML is what we've got).
- // We use it (and suppress the warning) because an HTML fragment will
+ // We use it (and suppress the warning) because an HTML fragment will
// be wrapped around tags which we don't really want to keep.
// Note: despite the warning, if loadHTML succeeds it will return true.
$result = @$f->loadHTML(''.$value.'');
@@ -86,7 +86,7 @@ class JSLikeHTMLElement extends DOMElement
* @code
* $string = $div->innerHTML;
* @endcode
- */
+ */
public function __get($name)
{
if ($name == 'innerHTML') {
diff --git a/inc/Readability.php b/inc/Readability.php
index d28d28f9..e1e8738b 100644
--- a/inc/Readability.php
+++ b/inc/Readability.php
@@ -1,5 +1,5 @@
'/( (\s| ?)*){1,}/',
'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
- );
-
+ );
+
/* constants */
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_CLASSES = 2;
const FLAG_CLEAN_CONDITIONALLY = 4;
-
+
/**
* Create instance of Readability
* @param string UTF-8 encoded string
* @param string (optional) URL associated with HTML (used for footnotes)
* @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
- */
+ */
function __construct($html, $url=null, $parser='libxml')
{
$this->url = $url;
@@ -135,18 +135,18 @@ class Readability
public function getTitle() {
return $this->articleTitle;
}
-
+
/**
* Get article content element
* @return DOMElement
*/
public function getContent() {
return $this->articleContent;
- }
-
+ }
+
/**
* Runs readability.
- *
+ *
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
@@ -161,7 +161,7 @@ class Readability
if (!isset($this->dom->documentElement)) return false;
$this->removeScripts($this->dom);
//die($this->getInnerHTML($this->dom->documentElement));
-
+
// Assume successful outcome
$this->success = true;
@@ -176,7 +176,7 @@ class Readability
}
$this->prepDocument();
-
+
//die($this->dom->documentElement->parentNode->nodeType);
//$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
//die($this->getInnerHTML($this->dom->documentElement));
@@ -191,9 +191,9 @@ class Readability
$this->success = false;
$articleContent = $this->dom->createElement('div');
$articleContent->setAttribute('id', 'readability-content');
- $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
';
+ $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
';
}
-
+
$overlay->setAttribute('id', 'readOverlay');
$innerDiv->setAttribute('id', 'readInner');
@@ -201,7 +201,7 @@ class Readability
$innerDiv->appendChild($articleTitle);
$innerDiv->appendChild($articleContent);
$overlay->appendChild($innerDiv);
-
+
/* Clear the old HTML, insert the new content. */
$this->body->innerHTML = '';
$this->body->appendChild($overlay);
@@ -209,21 +209,21 @@ class Readability
$this->body->removeAttribute('style');
$this->postProcessContent($articleContent);
-
+
// Set title and content instance variables
$this->articleTitle = $articleTitle;
$this->articleContent = $articleContent;
-
+
return $this->success;
}
-
+
/**
* Debug
*/
protected function dbg($msg) {
if ($this->debug) echo '* ',$msg, "\n";
}
-
+
/**
* Run any post-process modifications to article content as necessary.
*
@@ -231,11 +231,11 @@ class Readability
* @return void
*/
public function postProcessContent($articleContent) {
- if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) {
+ if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) {
$this->addFootnotes($articleContent);
}
}
-
+
/**
* Get the article title as an H1.
*
@@ -248,11 +248,11 @@ class Readability
try {
$curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
} catch(Exception $e) {}
-
+
if (preg_match('/ [\|\-] /', $curTitle))
{
$curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
-
+
if (count(explode(' ', $curTitle)) < 3) {
$curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
}
@@ -279,17 +279,17 @@ class Readability
if (count(explode(' ', $curTitle)) <= 4) {
$curTitle = $origTitle;
}
-
+
$articleTitle = $this->dom->createElement('h1');
$articleTitle->innerHTML = $curTitle;
-
+
return $articleTitle;
}
-
+
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
- *
+ *
* @return void
**/
protected function prepDocument() {
@@ -328,13 +328,13 @@ class Readability
$footnotesWrapper = $this->dom->createElement('div');
$footnotesWrapper->setAttribute('id', 'readability-footnotes');
$footnotesWrapper->innerHTML = '
References
';
-
+
$articleFootnotes = $this->dom->createElement('ol');
$articleFootnotes->setAttribute('id', 'readability-footnotes-list');
$footnotesWrapper->appendChild($articleFootnotes);
-
+
$articleLinks = $articleContent->getElementsByTagName('a');
-
+
$linkCount = 0;
for ($i = 0; $i < $articleLinks->length; $i++)
{
@@ -346,11 +346,11 @@ class Readability
if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
//linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
$linkText = $this->getInnerText($articleLink);
-
+
if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
continue;
}
-
+
$linkCount++;
/** Add a superscript reference after the article link */
@@ -358,7 +358,7 @@ class Readability
$refLink->innerHTML = '[' . $linkCount . ']';
$refLink->setAttribute('class', 'readability-DoNotFootnote');
$refLink->setAttribute('style', 'color: inherit;');
-
+
//TODO: does this work or should we use DOMNode.isSameNode()?
if ($articleLink->parentNode->lastChild == $articleLink) {
$articleLink->parentNode->appendChild($refLink);
@@ -373,15 +373,15 @@ class Readability
$footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
$footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
-
+
$footnote->appendChild($footnoteLink);
if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')';
-
+
$articleFootnotes->appendChild($footnote);
}
if ($linkCount > 0) {
- $articleContent->appendChild($footnotesWrapper);
+ $articleContent->appendChild($footnotesWrapper);
}
}
@@ -404,7 +404,7 @@ class Readability
//}
}
}
-
+
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous
tags, etc.
@@ -429,7 +429,7 @@ class Readability
* as a header and not a subheader, so remove it since we already have a header.
***/
if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
- $this->clean($articleContent, 'h2');
+ $this->clean($articleContent, 'h2');
}
$this->clean($articleContent, 'iframe');
@@ -448,7 +448,7 @@ class Readability
$embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
$objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
$iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
-
+
if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
{
$articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
@@ -457,13 +457,13 @@ class Readability
try {
$articleContent->innerHTML = preg_replace('/ ]*>\s*
'
+* echo "\n\n";
+*
+* // set innerHTML
+* $elem->innerHTML = 'FiveFilters.org';
+* echo $elem->innerHTML; // prints 'FiveFilters.org'
+* echo "\n\n";
+*
+* // print document (with our changes)
+* echo $doc->saveXML();
+* @endcode
+*
+* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
+* @see http://fivefilters.org (the project this was written for)
+*/
+class JSLikeHTMLElement extends DOMElement
+{
+    /**
+     * Magic setter that emulates JavaScript's writable innerHTML property.
+     *
+     * All existing children are removed first. The new markup is then tried as
+     * well-formed XML through a document fragment; when appendXML() rejects it,
+     * the markup is parsed leniently with DOMDocument::loadHTML() inside a
+     * temporary <htmlfragment> wrapper and its children are imported one by one.
+     * If both attempts fail the element is simply left empty.
+     *
+     * @code
+     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
+     * @endcode
+     *
+     * @param string $name  property being written; only 'innerHTML' is supported,
+     *                      any other name raises an E_USER_NOTICE
+     * @param string $value markup to install as the element's content
+     * @return void
+     */
+    public function __set($name, $value) {
+        if ($name == 'innerHTML') {
+            // first, empty the element (walk backwards because each removal shortens the list)
+            for ($x=$this->childNodes->length-1; $x>=0; $x--) {
+                $this->removeChild($this->childNodes->item($x));
+            }
+            // $value holds our new inner HTML
+            if ($value != '') {
+                $f = $this->ownerDocument->createDocumentFragment();
+                // appendXML() expects well-formed markup (XHTML)
+                $result = @$f->appendXML($value); // @ to suppress PHP warnings
+                if ($result) {
+                    if ($f->hasChildNodes()) $this->appendChild($f);
+                } else {
+                    // $value is probably ill-formed
+                    $f = new DOMDocument();
+                    $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
+                    // Using <htmlfragment> will generate a warning, but so will bad HTML
+                    // (and by this point, bad HTML is what we've got).
+                    // We use it (and suppress the warning) because an HTML fragment will
+                    // be wrapped around <html><body> tags which we don't really want to keep.
+                    // Note: despite the warning, if loadHTML succeeds it will return true.
+                    $result = @$f->loadHTML('<htmlfragment>'.$value.'</htmlfragment>');
+                    if ($result) {
+                        $import = $f->getElementsByTagName('htmlfragment')->item(0);
+                        // item(0) is null if the parser did not keep the wrapper element
+                        if ($import !== null) {
+                            foreach ($import->childNodes as $child) {
+                                // nodes belong to the temporary document; copy them into ours
+                                $importedNode = $this->ownerDocument->importNode($child, true);
+                                $this->appendChild($importedNode);
+                            }
+                        }
+                    } else {
+                        // oh well, we tried, we really did. :(
+                        // this element is now empty
+                    }
+                }
+            }
+        } else {
+            $trace = debug_backtrace();
+            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
+        }
+    }
+
+    /**
+     * Magic getter that emulates JavaScript's readable innerHTML property.
+     *
+     * Serialises every child node with the owner document's saveXML() and
+     * concatenates the results.
+     *
+     * @code
+     * $string = $div->innerHTML;
+     * @endcode
+     *
+     * @param string $name property being read; only 'innerHTML' is supported
+     * @return string|null serialised children, or null (after an E_USER_NOTICE)
+     *                     for any other property name
+     */
+    public function __get($name)
+    {
+        if ($name == 'innerHTML') {
+            $inner = '';
+            foreach ($this->childNodes as $child) {
+                $inner .= $this->ownerDocument->saveXML($child);
+            }
+            return $inner;
+        }
+
+        $trace = debug_backtrace();
+        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
+        return null;
+    }
+
+    /**
+     * String form of the element: its tag name in square brackets, e.g. "[div]".
+     *
+     * @return string
+     */
+    public function __toString()
+    {
+        return '['.$this->tagName.']';
+    }
+}
\ No newline at end of file
diff --git a/inc/3rdparty/Readability.php b/inc/3rdparty/Readability.php
new file mode 100644
index 00000000..e1e8738b
--- /dev/null
+++ b/inc/3rdparty/Readability.php
@@ -0,0 +1,1137 @@
+init();
+echo $r->articleContent->innerHTML;
+*/
+
+class Readability
+{
+ public $version = '1.7.1-without-multi-page';
+ public $convertLinksToFootnotes = false;
+ public $revertForcedParagraphElements = true;
+ public $articleTitle;
+ public $articleContent;
+ public $dom;
+ public $url = null; // optional - URL where HTML was retrieved
+ public $debug = false;
+ public $lightClean = true; // preserves more content (experimental) added 2012-09-19
+ protected $body = null; //
+ protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
+ protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
+ protected $success = false; // indicates whether we were able to extract or not
+
+ /**
+ * All of the regular expressions in use within readability.
+ * Defined up here so we don't instantiate them repeatedly in loops.
+ **/
+ public $regexps = array(
+     // class/id fragments that mark a node as unlikely to hold article content
+     'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
+     // ...unless the same class/id string also matches one of these
+     'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+     'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+     'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+     // opening tags that stop a div from being converted into a p
+     'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
+     // two or more consecutive <br> tags, optionally separated by whitespace
+     'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
+     // opening or closing <font> tags; group 1 captures the optional slash
+     'replaceFonts' => '/<(\/?)font[^>]*>/i',
+     // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
+     'normalize' => '/\s{2,}/',
+     // one or more <br> tags, each followed by optional whitespace or &nbsp;
+     'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
+     'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
+     // link texts that never get a footnote (short markers such as "[1]", "^", "edit", "citation needed")
+     'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+ );
+
+ /* constants */
+ const FLAG_STRIP_UNLIKELYS = 1;
+ const FLAG_WEIGHT_CLASSES = 2;
+ const FLAG_CLEAN_CONDITIONALLY = 4;
+
+ /**
+ * Create instance of Readability
+ * @param string UTF-8 encoded string
+ * @param string (optional) URL associated with HTML (used for footnotes)
+ * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
+ */
+ function __construct($html, $url=null, $parser='libxml')
+ {
+     // Builds $this->dom from raw HTML: the markup is rewritten as text first
+     // (double <br> and <font> handling), then parsed with the requested parser.
+     $this->url = $url;
+     /* Turn all double br's into p's */
+     $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
+     /* Turn font tags into spans; $1 carries the slash of a closing tag */
+     $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
+     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
+     // loadHTML() must never be handed an empty string, so substitute a minimal document
+     if (trim($html) == '') $html = '<html></html>';
+     if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
+         // all good
+     } else {
+         // default parser, and the fallback when html5lib returns nothing usable
+         $this->dom = new DOMDocument();
+         $this->dom->preserveWhiteSpace = false;
+         @$this->dom->loadHTML($html);
+     }
+     // elements handed out by the document from now on expose innerHTML
+     $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+ }
+
+ /**
+ * Get article title element
+ * @return DOMElement
+ */
+ public function getTitle()
+ {
+     // init() stores the generated <h1> element in this property
+     $titleElement = $this->articleTitle;
+
+     return $titleElement;
+ }
+
+ /**
+ * Get article content element
+ * @return DOMElement
+ */
+ public function getContent()
+ {
+     // init() stores the extracted content element in this property
+     $contentElement = $this->articleContent;
+
+     return $contentElement;
+ }
+
+ /**
+ * Runs readability.
+ *
+ * Workflow:
+ * 1. Prep the document by removing script tags, css, etc.
+ * 2. Build readability's DOM tree.
+ * 3. Grab the article content from the current dom tree.
+ * 4. Replace the current DOM tree with the new one.
+ * 5. Read peacefully.
+ *
+ * @return boolean true if we found content, false otherwise
+ **/
+ public function init()
+ {
+     // Nothing to work on when the document has no root element
+     if (!isset($this->dom->documentElement)) return false;
+     $this->removeScripts($this->dom);
+
+     // Assume successful outcome
+     $this->success = true;
+
+     // Remember the first <body> element and its original markup (first run only)
+     $bodyElems = $this->dom->getElementsByTagName('body');
+     if ($bodyElems->length > 0) {
+         if ($this->bodyCache == null) {
+             $this->bodyCache = $bodyElems->item(0)->innerHTML;
+         }
+         if ($this->body == null) {
+             $this->body = $bodyElems->item(0);
+         }
+     }
+
+     // Creates a body element if none was found above
+     $this->prepDocument();
+
+     /* Build readability's DOM tree */
+     $overlay = $this->dom->createElement('div');
+     $innerDiv = $this->dom->createElement('div');
+     $articleTitle = $this->getArticleTitle();
+     $articleContent = $this->grabArticle();
+
+     if (!$articleContent) {
+         // Extraction failed: report it and show a placeholder message instead
+         $this->success = false;
+         $articleContent = $this->dom->createElement('div');
+         $articleContent->setAttribute('id', 'readability-content');
+         $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
+     }
+
+     $overlay->setAttribute('id', 'readOverlay');
+     $innerDiv->setAttribute('id', 'readInner');
+
+     /* Glue the structure of our document together. */
+     $innerDiv->appendChild($articleTitle);
+     $innerDiv->appendChild($articleContent);
+     $overlay->appendChild($innerDiv);
+
+     /* Clear the old HTML, insert the new content. */
+     $this->body->innerHTML = '';
+     $this->body->appendChild($overlay);
+     $this->body->removeAttribute('style');
+
+     $this->postProcessContent($articleContent);
+
+     // Set title and content instance variables
+     $this->articleTitle = $articleTitle;
+     $this->articleContent = $articleContent;
+
+     return $this->success;
+ }
+
+ /**
+ * Debug
+ */
+ protected function dbg($msg)
+ {
+     // Silent unless the public $debug switch is on
+     if (!$this->debug) {
+         return;
+     }
+
+     echo '* ', $msg, "\n";
+ }
+
+ /**
+ * Run any post-process modifications to article content as necessary.
+ *
+ * @param DOMElement
+ * @return void
+ */
+ public function postProcessContent($articleContent) {
+     // Footnotes are opt-in and are never generated when the page URL contains wikipedia.org.
+     // The explicit cast keeps preg_match() from receiving null when no URL was supplied,
+     // which the former @-suppression only hid.
+     if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', (string) $this->url)) {
+         $this->addFootnotes($articleContent);
+     }
+ }
+
+ /**
+ * Get the article title as an H1.
+ *
+ * @return DOMElement
+ */
+ protected function getArticleTitle() {
+     $fullTitle = '';
+     $candidate = '';
+
+     // Start from the text of the first <title> element; if that throws, both stay empty
+     try {
+         $fullTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+         $candidate = $fullTitle;
+     } catch(Exception $e) {}
+
+     if (preg_match('/ [\|\-] /', $candidate)) {
+         // Separator " | " or " - " present: keep the part before the last separator...
+         $candidate = preg_replace('/(.*)[\|\-] .*/i', '$1', $fullTitle);
+
+         // ...unless that leaves fewer than three words; then keep what follows the first one
+         $words = explode(' ', $candidate);
+         if (count($words) < 3) {
+             $candidate = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $fullTitle);
+         }
+     } elseif (strpos($candidate, ': ') !== false) {
+         // Colon present: keep the part after the last colon...
+         $candidate = preg_replace('/.*:(.*)/i', '$1', $fullTitle);
+
+         // ...unless that leaves fewer than three words; then keep what follows the first colon
+         $words = explode(' ', $candidate);
+         if (count($words) < 3) {
+             $candidate = preg_replace('/[^:]*[:](.*)/i','$1', $fullTitle);
+         }
+     } elseif (strlen($candidate) > 150 || strlen($candidate) < 15) {
+         // Very long or very short title: use the page's h1 when there is exactly one
+         $headings = $this->dom->getElementsByTagName('h1');
+         if ($headings->length == 1) {
+             $candidate = $this->getInnerText($headings->item(0));
+         }
+     }
+
+     $candidate = trim($candidate);
+
+     // Four words or fewer: fall back to the untouched <title> text
+     $words = explode(' ', $candidate);
+     if (count($words) <= 4) {
+         $candidate = $fullTitle;
+     }
+
+     $heading = $this->dom->createElement('h1');
+     $heading->innerHTML = $candidate;
+
+     return $heading;
+ }
+
+ /**
+ * Prepare the HTML document for readability to scrape it.
+ * This includes things like stripping javascript, CSS, and handling terrible markup.
+ *
+ * @return void
+ **/
+ protected function prepDocument() {
+     /**
+      * In some cases a body element can't be found (if the HTML is totally hosed for example)
+      * so we create a new body node and append it to the document.
+      */
+     if ($this->body == null)
+     {
+         $this->body = $this->dom->createElement('body');
+         $this->dom->documentElement->appendChild($this->body);
+     }
+     $this->body->setAttribute('id', 'readabilityBody');
+
+     /* Remove every style element in the document (walk backwards because each removal shortens the list) */
+     $styleTags = $this->dom->getElementsByTagName('style');
+     for ($i = $styleTags->length-1; $i >= 0; $i--)
+     {
+         $styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
+     }
+
+     /* Turn all double br's into p's */
+     /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
+     //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
+     // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree.
+     // Manipulating innerHTML as it's done in JS is not possible in PHP.
+ }
+
+ /**
+ * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
+ * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
+ *
+ * @return void
+ **/
+ public function addFootnotes($articleContent) {
+     // Wrapper holding the heading and the ordered list of footnotes
+     $footnotesWrapper = $this->dom->createElement('div');
+     $footnotesWrapper->setAttribute('id', 'readability-footnotes');
+     $footnotesWrapper->innerHTML = '<h3>References</h3>';
+
+     $articleFootnotes = $this->dom->createElement('ol');
+     $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
+     $footnotesWrapper->appendChild($articleFootnotes);
+
+     $articleLinks = $articleContent->getElementsByTagName('a');
+
+     $linkCount = 0;
+     // The length is re-read on every pass; the reference links inserted below carry
+     // the readability-DoNotFootnote class so they are skipped if the list picks them up.
+     for ($i = 0; $i < $articleLinks->length; $i++)
+     {
+         $articleLink = $articleLinks->item($i);
+         $linkText = $this->getInnerText($articleLink);
+
+         // Decide whether to skip before creating any nodes for this link
+         if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+             continue;
+         }
+
+         $footnoteLink = $articleLink->cloneNode(true);
+         $refLink = $this->dom->createElement('a');
+         $footnote = $this->dom->createElement('li');
+         // Host shown next to the footnote: the link's own host, else the page's host
+         $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
+         if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
+         //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
+
+         $linkCount++;
+
+         /** Add a superscript reference after the article link */
+         $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
+         $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
+         $refLink->setAttribute('class', 'readability-DoNotFootnote');
+         $refLink->setAttribute('style', 'color: inherit;');
+
+         // insertBefore() appends when the reference node is null, so a link that is
+         // the last child needs no separate branch.
+         $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
+
+         $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
+         $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
+
+         // Back-link from the footnote to the anchor named just above
+         $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
+
+         // Footnote text: the link's title attribute when present, otherwise its text
+         $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
+         $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
+
+         $footnote->appendChild($footnoteLink);
+         if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
+
+         $articleFootnotes->appendChild($footnote);
+     }
+
+     // Only attach the references section when at least one footnote was produced
+     if ($linkCount > 0) {
+         $articleContent->appendChild($footnotesWrapper);
+     }
+ }
+
+ /**
+ * Reverts P elements with class 'readability-styled'
+ * to text nodes - which is what they were before.
+ *
+ * @param DOMElement
+ * @return void
+ */
+ function revertReadabilityStyledElements($articleContent) {
+     $document = $articleContent->ownerDocument;
+     $finder = new DOMXPath($document);
+
+     // Every descendant <p> whose class attribute is exactly "readability-styled"
+     $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);
+
+     // Walk from the last match to the first, swapping each paragraph for a text node
+     $index = $styled->length;
+     while ($index-- > 0) {
+         $paragraph = $styled->item($index);
+         $plainText = $document->createTextNode($paragraph->textContent);
+         $paragraph->parentNode->replaceChild($plainText, $paragraph);
+     }
+ }
+
+ /**
+ * Prepare the article node for display. Clean out any inline styles,
+ * iframes, forms, strip extraneous <p> tags, etc.
+ *
+ * @param DOMElement
+ * @return void
+ */
+ function prepArticle($articleContent) {
+     // Cleans the extracted article element in place, in a fixed order.
+     $this->cleanStyles($articleContent);
+     $this->killBreaks($articleContent);
+     if ($this->revertForcedParagraphElements) {
+         $this->revertReadabilityStyledElements($articleContent);
+     }
+
+     /* Clean out junk from the article content */
+     $this->cleanConditionally($articleContent, 'form');
+     $this->clean($articleContent, 'object');
+     $this->clean($articleContent, 'h1');
+
+     /**
+      * If there is only one h2, they are probably using it
+      * as a header and not a subheader, so remove it since we already have a header.
+      * Skipped when lightClean is on.
+      ***/
+     if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
+         $this->clean($articleContent, 'h2');
+     }
+     $this->clean($articleContent, 'iframe');
+
+     $this->cleanHeaders($articleContent);
+
+     /* Do these last as the previous stuff may have removed junk that will affect these */
+     $this->cleanConditionally($articleContent, 'table');
+     $this->cleanConditionally($articleContent, 'ul');
+     $this->cleanConditionally($articleContent, 'div');
+
+     /* Remove extra paragraphs: no img/embed/object/iframe inside and no text */
+     $articleParagraphs = $articleContent->getElementsByTagName('p');
+     for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
+     {
+         $paragraph = $articleParagraphs->item($i);
+         $imgCount = $paragraph->getElementsByTagName('img')->length;
+         $embedCount = $paragraph->getElementsByTagName('embed')->length;
+         $objectCount = $paragraph->getElementsByTagName('object')->length;
+         $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
+
+         if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($paragraph, false) == '')
+         {
+             $paragraph->parentNode->removeChild($paragraph);
+         }
+     }
+
+     /* Drop a <br> that sits directly before the start of a paragraph */
+     try {
+         $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
+     }
+     catch (Exception $e) {
+         $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
+     }
+ }
+
+ /**
+ * Initialize a node with the readability object. Also checks the
+ * className/id for special names to add to its score.
+ *
+ * @param Element
+ * @return void
+ **/
+ protected function initializeNode($node) {
+     // The content score lives in a "readability" attribute on the node itself
+     $scoreAttr = $this->dom->createAttribute('readability');
+     $scoreAttr->value = 0;
+     $node->setAttributeNode($scoreAttr);
+
+     // Starting adjustment per tag name; tags not listed leave the score at zero
+     $tagAdjustments = array(
+         'DIV' => 5,
+         'PRE' => 3,
+         'TD' => 3,
+         'BLOCKQUOTE' => 3,
+         'ADDRESS' => -3,
+         'OL' => -3,
+         'UL' => -3,
+         'DL' => -3,
+         'DD' => -3,
+         'DT' => -3,
+         'LI' => -3,
+         'FORM' => -3,
+         'H1' => -5,
+         'H2' => -5,
+         'H3' => -5,
+         'H4' => -5,
+         'H5' => -5,
+         'H6' => -5,
+         'TH' => -5
+     );
+
+     // Upper-cased so the lookup does not depend on how the parser reports tag names
+     $tag = strtoupper($node->tagName);
+     if (isset($tagAdjustments[$tag])) {
+         $scoreAttr->value += $tagAdjustments[$tag];
+     }
+
+     // Finally add the weight derived from the node's class/id
+     $scoreAttr->value += $this->getClassWeight($node);
+ }
+
+ /***
+ * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
+ * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+ *
+ * @return DOMElement
+ **/
+ protected function grabArticle($page=null) {
+ $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
+ if (!$page) $page = $this->dom;
+ $allElements = $page->getElementsByTagName('*');
+ /**
+ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
+ * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
+ *
+ * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
+ * TODO: Shouldn't this be a reverse traversal?
+ **/
+ $node = null;
+ $nodesToScore = array();
+ for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
+ //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
+ //$node = $targetList->item($nodeIndex);
+ $tagName = strtoupper($node->tagName);
+ /* Remove unlikely candidates */
+ if ($stripUnlikelyCandidates) {
+ $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
+ if (
+ preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+ !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
+ $tagName != 'BODY'
+ )
+ {
+ $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
+ //$nodesToRemove[] = $node;
+ $node->parentNode->removeChild($node);
+ $nodeIndex--;
+ continue;
+ }
+ }
+
+ if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
+ $nodesToScore[] = $node;
+ }
+
+ /* Turn all divs that don't have children block level elements into p's */
+ if ($tagName == 'DIV') {
+ if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+ //$this->dbg('Altering div to p');
+ $newNode = $this->dom->createElement('p');
+ try {
+ $newNode->innerHTML = $node->innerHTML;
+ //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
+ $node->parentNode->replaceChild($newNode, $node);
+ $nodeIndex--;
+ $nodesToScore[] = $node; // or $newNode?
+ }
+ catch(Exception $e) {
+ $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
+ }
+ }
+ else
+ {
+ /* EXPERIMENTAL */
+ // TODO: change these p elements back to text nodes after processing
+ for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
+ $childNode = $node->childNodes->item($i);
+ if ($childNode->nodeType == 3) { // XML_TEXT_NODE
+ //$this->dbg('replacing text node with a p tag with the same content.');
+ $p = $this->dom->createElement('p');
+ $p->innerHTML = $childNode->nodeValue;
+ $p->setAttribute('style', 'display: inline;');
+ $p->setAttribute('class', 'readability-styled');
+ $childNode->parentNode->replaceChild($p, $childNode);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Loop through all paragraphs, and assign a score to them based on how content-y they look.
+ * Then add their score to their parent node.
+ *
+ * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+ **/
+ $candidates = array();
+ for ($pt=0; $pt < count($nodesToScore); $pt++) {
+ $parentNode = $nodesToScore[$pt]->parentNode;
+ // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
+ $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
+ $innerText = $this->getInnerText($nodesToScore[$pt]);
+
+ if (!$parentNode || !isset($parentNode->tagName)) {
+ continue;
+ }
+
+ /* If this paragraph is less than 25 characters, don't even count it. */
+ if(strlen($innerText) < 25) {
+ continue;
+ }
+
+ /* Initialize readability data for the parent. */
+ if (!$parentNode->hasAttribute('readability'))
+ {
+ $this->initializeNode($parentNode);
+ $candidates[] = $parentNode;
+ }
+
+ /* Initialize readability data for the grandparent. */
+ if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
+ {
+ $this->initializeNode($grandParentNode);
+ $candidates[] = $grandParentNode;
+ }
+
+ $contentScore = 0;
+
+ /* Add a point for the paragraph itself as a base. */
+ $contentScore++;
+
+ /* Add points for any commas within this paragraph */
+ $contentScore += count(explode(',', $innerText));
+
+ /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
+ $contentScore += min(floor(strlen($innerText) / 100), 3);
+
+ /* Add the score to the parent. The grandparent gets half. */
+ $parentNode->getAttributeNode('readability')->value += $contentScore;
+
+ if ($grandParentNode) {
+ $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
+ }
+ }
+
+ /**
+ * After we've calculated scores, loop through all of the possible candidate nodes we found
+ * and find the one with the highest score.
+ **/
+ $topCandidate = null;
+ for ($c=0, $cl=count($candidates); $c < $cl; $c++)
+ {
+ /**
+ * Scale the final candidates score based on link density. Good content should have a
+ * relatively small link density (5% or less) and be mostly unaffected by this operation.
+ **/
+ $readability = $candidates[$c]->getAttributeNode('readability');
+ $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
+
+ $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
+
+ if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
+ $topCandidate = $candidates[$c];
+ }
+ }
+
+ /**
+ * If we still have no top candidate, just use the body as a last resort.
+ * We also have to copy the body node so it is something we can modify.
+ **/
+ if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
+ {
+ $topCandidate = $this->dom->createElement('div');
+ if ($page instanceof DOMDocument) {
+ if (!isset($page->documentElement)) {
+ // we don't have a body either? what a mess! :)
+ } else {
+ $topCandidate->innerHTML = $page->documentElement->innerHTML;
+ $page->documentElement->innerHTML = '';
+ $page->documentElement->appendChild($topCandidate);
+ }
+ } else {
+ $topCandidate->innerHTML = $page->innerHTML;
+ $page->innerHTML = '';
+ $page->appendChild($topCandidate);
+ }
+ $this->initializeNode($topCandidate);
+ }
+
+ /**
+ * Now that we have the top candidate, look through its siblings for content that might also be related.
+ * Things like preambles, content split by ads that we removed, etc.
+ **/
+ $articleContent = $this->dom->createElement('div');
+ $articleContent->setAttribute('id', 'readability-content');
+ $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
+ $siblingNodes = $topCandidate->parentNode->childNodes;
+ if (!isset($siblingNodes)) {
+ $siblingNodes = new stdClass;
+ $siblingNodes->length = 0;
+ }
+
+ for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
+ {
+ $siblingNode = $siblingNodes->item($s);
+ $append = false;
+
+ $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+
+ //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
+
+ if ($siblingNode === $topCandidate)
+ // or if ($siblingNode->isSameNode($topCandidate))
+ {
+ $append = true;
+ }
+
+ $contentBonus = 0;
+ /* Give a bonus if sibling nodes and top candidates have the example same classname */
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
+ $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
+ }
+
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
+ {
+ $append = true;
+ }
+
+ if (strtoupper($siblingNode->nodeName) == 'P') {
+ $linkDensity = $this->getLinkDensity($siblingNode);
+ $nodeContent = $this->getInnerText($siblingNode);
+ $nodeLength = strlen($nodeContent);
+
+ if ($nodeLength > 80 && $linkDensity < 0.25)
+ {
+ $append = true;
+ }
+ else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
+ {
+ $append = true;
+ }
+ }
+
+ if ($append)
+ {
+ $this->dbg('Appending node: ' . $siblingNode->nodeName);
+
+ $nodeToAppend = null;
+ $sibNodeName = strtoupper($siblingNode->nodeName);
+ if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
+ /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
+
+ $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
+ $nodeToAppend = $this->dom->createElement('div');
+ try {
+ $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
+ $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+ }
+ catch(Exception $e)
+ {
+ $this->dbg('Could not alter siblingNode to div, reverting back to original.');
+ $nodeToAppend = $siblingNode;
+ $s--;
+ $sl--;
+ }
+ } else {
+ $nodeToAppend = $siblingNode;
+ $s--;
+ $sl--;
+ }
+
+ /* To ensure a node does not interfere with readability styles, remove its classnames */
+ $nodeToAppend->removeAttribute('class');
+
+ /* Append sibling and subtract from our list because it removes the node when you append to another node */
+ $articleContent->appendChild($nodeToAppend);
+ }
+ }
+
+ /**
+ * So we have all of the content that we need. Now we clean it up for presentation.
+ **/
+ $this->prepArticle($articleContent);
+
+ /**
+ * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+ * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+ * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+ * finding the -right- content.
+ **/
+ if (strlen($this->getInnerText($articleContent, false)) < 250)
+ {
+ // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+ // in the meantime, we check and create an empty element if it's not there.
+ if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
+ $this->body->innerHTML = $this->bodyCache;
+
+ if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+ $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+ return $this->grabArticle($this->body);
+ }
+ else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+ $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
+ return $this->grabArticle($this->body);
+ }
+ else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+ $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+ return $this->grabArticle($this->body);
+ }
+ else {
+ return false;
+ }
+ }
+ return $articleContent;
+ }
+
+ /**
+  * Strip every script element found beneath the given node.
+  *
+  * @param DOMElement $doc node to search; only needs to provide getElementsByTagName()
+  * @return void
+  */
+ public function removeScripts($doc) {
+     $scriptTags = $doc->getElementsByTagName('script');
+     // Go from the last match to the first so removals never shift an index we still have to visit.
+     $idx = $scriptTags->length;
+     while (--$idx >= 0) {
+         $script = $scriptTags->item($idx);
+         $script->parentNode->removeChild($script);
+     }
+ }
+
+ /**
+  * Return the trimmed text content of a node.
+  * Unless disabled, the text is also run through the 'normalize' regexp,
+  * replacing every match with a single space.
+  *
+  * @param DOMElement $e node to read; anything without a non-empty textContent yields ''
+  * @param boolean $normalizeSpaces (default: true)
+  * @return string
+  **/
+ public function getInnerText($e, $normalizeSpaces=true) {
+     $hasText = isset($e->textContent) && $e->textContent != '';
+     if (!$hasText) {
+         return '';
+     }
+
+     $trimmed = trim($e->textContent);
+
+     return $normalizeSpaces
+         ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
+         : $trimmed;
+ }
+
+ /**
+  * Count how often the string $s occurs in the (normalized) inner text of $e.
+  *
+  * @param DOMElement $e
+  * @param string $s what to count. Default is ","
+  * @return number (integer)
+  **/
+ public function getCharCount($e, $s=',') {
+     $innerText = $this->getInnerText($e);
+     return substr_count($innerText, $s);
+ }
+
+ /**
+  * Remove the style attribute on $e itself and on every element under it.
+  *
+  * @param DOMElement $e root of the subtree to clean; non-objects are ignored
+  * @return void
+  */
+ public function cleanStyles($e) {
+     if (!is_object($e)) return;
+
+     // getElementsByTagName('*') only yields descendants, so the root has to be
+     // handled separately to honour the "$e and under" contract.
+     if ($e instanceof DOMElement) {
+         $e->removeAttribute('style');
+     }
+
+     $elems = $e->getElementsByTagName('*');
+     foreach ($elems as $elem) {
+         $elem->removeAttribute('style');
+     }
+ }
+
+ /**
+  * Compute the share of a node's text that sits inside links:
+  * the summed text length of all descendant <a> elements divided by the
+  * text length of the node itself.
+  *
+  * @param DOMElement $e
+  * @return number (float) ratio, or 0 when the node has no text
+  */
+ public function getLinkDensity($e) {
+     $anchors = $e->getElementsByTagName('a');
+     $totalLength = strlen($this->getInnerText($e));
+
+     $anchorLength = 0;
+     foreach ($anchors as $anchor) {
+         $anchorLength += strlen($this->getInnerText($anchor));
+     }
+
+     return ($totalLength > 0) ? $anchorLength / $totalLength : 0;
+ }
+
+ /**
+  * Score an element by its class and id attributes.
+  * Each non-empty attribute loses 25 points when it matches the 'negative'
+  * regexp and gains 25 when it matches the 'positive' one. Returns 0 outright
+  * when FLAG_WEIGHT_CLASSES is not active.
+  *
+  * @param DOMElement $e
+  * @return number (Integer)
+  */
+ public function getClassWeight($e) {
+     if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+         return 0;
+     }
+
+     $weight = 0;
+
+     /* The class name and the ID are judged by exactly the same rules. */
+     foreach (array('class', 'id') as $attrName) {
+         if (!$e->hasAttribute($attrName)) {
+             continue;
+         }
+         $attrValue = $e->getAttribute($attrName);
+         if ($attrValue == '') {
+             continue;
+         }
+         if (preg_match($this->regexps['negative'], $attrValue)) {
+             $weight -= 25;
+         }
+         if (preg_match($this->regexps['positive'], $attrValue)) {
+             $weight += 25;
+         }
+     }
+
+     return $weight;
+ }
+
+ /**
+  * Remove extraneous break tags from a node: every run matched by the
+  * 'killBreaks' regexp is collapsed into a single <br />.
+  *
+  * @param DOMElement $node node whose innerHTML is rewritten in place
+  * @return void
+  */
+ public function killBreaks($node) {
+     $html = $node->innerHTML;
+     // Replace each run with one break; substituting a space would delete the breaks entirely.
+     $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
+     $node->innerHTML = $html;
+ }
+
+ /**
+  * Remove every descendant element named $tag from $e.
+  * For iframe/object/embed tags, an element is kept when any of its attribute
+  * values, or its inner HTML, matches the 'video' regexp.
+  *
+  * Updated 2012-09-18 to preserve youtube/vimeo iframes
+  *
+  * @param DOMElement $e
+  * @param string $tag
+  * @return void
+  */
+ public function clean($e, $tag) {
+     $targetList = $e->getElementsByTagName($tag);
+     $isEmbed = in_array($tag, array('iframe', 'object', 'embed'));
+
+     /* Walk backwards so removing a node never disturbs the indices still to be visited. */
+     for ($y = $targetList->length - 1; $y >= 0; $y--) {
+         $target = $targetList->item($y);
+
+         if ($isEmbed) {
+             /* Glue all attribute values together so one regexp test covers them. */
+             $attributeValues = '';
+             foreach ($target->attributes as $attribute) {
+                 $attributeValues .= $attribute->value . '|';
+             }
+
+             /* Check the attributes first, then the markup inside the element. */
+             $looksLikeVideo = preg_match($this->regexps['video'], $attributeValues)
+                 || preg_match($this->regexps['video'], $target->innerHTML);
+             if ($looksLikeVideo) {
+                 continue;
+             }
+         }
+
+         $target->parentNode->removeChild($target);
+     }
+ }
+
+ /**
+  * Clean an element of all tags of type "tag" if they look fishy.
+  * "Fishy" is an algorithm based on content length, classnames,
+  * link density, number of images & embeds, etc.
+  * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active.
+  *
+  * @param DOMElement $e
+  * @param string $tag
+  * @return void
+  */
+ public function cleanConditionally($e, $tag) {
+     if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+         return;
+     }
+
+     $tagsList = $e->getElementsByTagName($tag);
+     $curTagsLength = $tagsList->length;
+
+     /**
+      * Gather counts for other typical elements embedded within.
+      * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+      *
+      * TODO: Consider taking into account original contentScore here.
+      */
+     for ($i=$curTagsLength-1; $i >= 0; $i--) {
+         $node = $tagsList->item($i);
+         $weight = $this->getClassWeight($node);
+         $contentScore = ($node->hasAttribute('readability')) ? (int)$node->getAttribute('readability') : 0;
+
+         $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . (($node->hasAttribute('readability')) ? (' with score ' . $node->getAttribute('readability')) : ''));
+
+         if ($weight + $contentScore < 0) {
+             $node->parentNode->removeChild($node);
+         }
+         else if ( $this->getCharCount($node, ',') < 10) {
+             /**
+              * If there are not very many commas, and the number of
+              * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+              **/
+             $p = $node->getElementsByTagName('p')->length;
+             $img = $node->getElementsByTagName('img')->length;
+             $li = $node->getElementsByTagName('li')->length-100;
+             $input = $node->getElementsByTagName('input')->length;
+             $a = $node->getElementsByTagName('a')->length;
+
+             /* Only embeds/iframes whose src matches the 'video' regexp are counted. */
+             $embedCount = 0;
+             $embeds = $node->getElementsByTagName('embed');
+             for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
+                 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
+                     $embedCount++;
+                 }
+             }
+             $embeds = $node->getElementsByTagName('iframe');
+             for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
+                 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
+                     $embedCount++;
+                 }
+             }
+
+             $linkDensity = $this->getLinkDensity($node);
+             $contentLength = strlen($this->getInnerText($node));
+             $toRemove = false;
+
+             if ($this->lightClean) {
+                 $this->dbg('Light clean...');
+                 if ( ($img > $p) && ($img > 4) ) {
+                     $this->dbg(' more than 4 images and more image elements than paragraph elements');
+                     $toRemove = true;
+                 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                     $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+                     $toRemove = true;
+                 } else if ( $input > floor($p/3) ) {
+                     $this->dbg(' too many <input> elements');
+                     $toRemove = true;
+                 } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+                     $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
+                     $toRemove = true;
+                 } else if($weight < 25 && $linkDensity > 0.2) {
+                     $this->dbg(' weight smaller than 25 and link density above 0.2');
+                     $toRemove = true;
+                 } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+                     $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
+                     $toRemove = true;
+                 } else if($embedCount > 3) {
+                     $this->dbg(' more than 3 embeds');
+                     $toRemove = true;
+                 }
+             } else {
+                 $this->dbg('Standard clean...');
+                 if ( $img > $p ) {
+                     $this->dbg(' more image elements than paragraph elements');
+                     $toRemove = true;
+                 } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                     $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
+                     $toRemove = true;
+                 } else if ( $input > floor($p/3) ) {
+                     $this->dbg(' too many <input> elements');
+                     $toRemove = true;
+                 } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
+                     $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
+                     $toRemove = true;
+                 } else if($weight < 25 && $linkDensity > 0.2) {
+                     $this->dbg(' weight smaller than 25 and link density above 0.2');
+                     $toRemove = true;
+                 } else if($weight >= 25 && $linkDensity > 0.5) {
+                     $this->dbg(' weight above 25 but link density greater than 0.5');
+                     $toRemove = true;
+                 } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+                     $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
+                     $toRemove = true;
+                 }
+             }
+
+             if ($toRemove) {
+                 //$this->dbg('Removing: '.$node->innerHTML);
+                 $node->parentNode->removeChild($node);
+             }
+         }
+     }
+ }
+
+ /**
+  * Drop spurious h1 and h2 headers from an element: a header goes when its
+  * class weight is negative or its link density exceeds 0.33.
+  *
+  * @param DOMElement $e
+  * @return void
+  */
+ public function cleanHeaders($e) {
+     foreach (array('h1', 'h2') as $headerTag) {
+         $headers = $e->getElementsByTagName($headerTag);
+         /* Backwards, so removals do not shift the headers still to be checked. */
+         for ($i = $headers->length - 1; $i >= 0; $i--) {
+             $header = $headers->item($i);
+             $spurious = $this->getClassWeight($header) < 0
+                 || $this->getLinkDensity($header) > 0.33;
+             if ($spurious) {
+                 $header->parentNode->removeChild($header);
+             }
+         }
+     }
+ }
+
+ public function flagIsActive($flag) {
+     // True when at least one bit of $flag is set in the current flag mask.
+     $matchingBits = $this->flags & $flag;
+     return $matchingBits > 0;
+ }
+
+ public function addFlag($flag) {
+     // Switch the bits of $flag on, leaving all other flags untouched.
+     $this->flags |= $flag;
+ }
+
+ public function removeFlag($flag) {
+     // Switch the bits of $flag off, leaving all other flags untouched.
+     $this->flags &= ~$flag;
+ }
+}
\ No newline at end of file
diff --git a/inc/3rdparty/Session.class.php b/inc/3rdparty/Session.class.php
new file mode 100644
index 00000000..eff924cc
--- /dev/null
+++ b/inc/3rdparty/Session.class.php
@@ -0,0 +1,136 @@
+ $value) {
+ $_SESSION[$key] = $value;
+ }
+ if ($login==$login_test && $password==$password_test){
+ // generate unique random number to sign forms (HMAC)
+ $_SESSION['uid'] = sha1(uniqid('',true).'_'.mt_rand());
+ $_SESSION['info']=Session::_allInfos();
+ $_SESSION['username']=$login;
+ // Set session expiration.
+ $_SESSION['expires_on']=time()+Session::$inactivity_timeout;
+ return true;
+ }
+ return false;
+ }
+
+ // Force logout: drop every login-related key from the session.
+ public static function logout()
+ {
+     $loginKeys = array('uid', 'info', 'expires_on', 'tokens', 'login', 'pass');
+     foreach ($loginKeys as $key) {
+         unset($_SESSION[$key]);
+     }
+ }
+
+ // Make sure user is logged in.
+ // The session is valid only if it has a uid, its stored client info still
+ // matches Session::_allInfos(), and it has not expired. An invalid session is logged out.
+ public static function isLogged()
+ {
+     // Keep the short-circuit order: the other keys are only read once 'uid' is known to exist.
+     $invalid = !isset ($_SESSION['uid'])
+         || $_SESSION['info']!=Session::_allInfos()
+         || time()>=$_SESSION['expires_on'];
+
+     if ($invalid) {
+         Session::logout();
+         return false;
+     }
+
+     // User accessed a page : Update his/her session expiration date.
+     $_SESSION['expires_on'] = time() + Session::$inactivity_timeout;
+     return true;
+ }
+
+ // Returns a token: a fresh 40-character hex string that is also remembered
+ // in $_SESSION['tokens'] so isToken() can validate it later.
+ public static function getToken()
+ {
+     if (!isset($_SESSION['tokens'])){
+         $_SESSION['tokens']=array();
+     }
+
+     // Prefer a cryptographically secure source when the runtime offers one;
+     // uniqid()/mt_rand() are predictable and only kept as a last resort.
+     $rnd = null;
+     if (function_exists('random_bytes')) {
+         try {
+             $rnd = bin2hex(random_bytes(20));
+         } catch (Exception $e) {
+             $rnd = null; // no entropy source available: fall through
+         }
+     }
+     if ($rnd === null && function_exists('openssl_random_pseudo_bytes')) {
+         $bytes = openssl_random_pseudo_bytes(20);
+         if ($bytes !== false && strlen($bytes) === 20) {
+             $rnd = bin2hex($bytes);
+         }
+     }
+     if ($rnd === null) {
+         $rnd = sha1(uniqid('',true).'_'.mt_rand());
+     }
+
+     // We store the random string on the server side.
+     $_SESSION['tokens'][$rnd]=1;
+     return $rnd;
+ }
+
+ // Tells if a token is ok. Using this function will destroy the token.
+ // return true if token is ok.
+ public static function isToken($token)
+ {
+     // A request can deliver an array instead of a string (e.g. token[]=x);
+     // arrays and objects are illegal array offsets, so reject them up front.
+     if (!is_scalar($token)) {
+         return false;
+     }
+     if (isset($_SESSION['tokens'][$token]))
+     {
+         unset($_SESSION['tokens'][$token]); // Token is used: destroy it.
+         return true; // Token is ok.
+     }
+     return false; // Wrong token, or already used.
+ }
+}
\ No newline at end of file
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
new file mode 100644
index 00000000..43b94e57
--- /dev/null
+++ b/inc/3rdparty/simple_html_dom.php
@@ -0,0 +1,1722 @@
+size is the "real" number of bytes the dom was created from.
+ * but for most purposes, it's a really good estimation.
+ * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
+ * Allow the user to tell us how much they trust the html.
+ * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
+ * This allows for us to find tags based on the text they contain.
+ * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
+ * Paperg: added parse_charset so that we know about the character set of the source document.
+ * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
+ * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
+ *
+ * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
+ * PaperG (John Schlick) Added get_display_size for "IMG" tags.
+ *
+ * Licensed under The MIT License
+ * Redistributions of files must retain the above copyright notice.
+ *
+ * @author S.C. Chen
+ * @author John Schlick
+ * @author Rus Carroll
+ * @version 1.5 ($Rev: 202 $)
+ * @package PlaceLocalInclude
+ * @subpackage simple_html_dom
+ */
+
+/**
+ * All of the Defines for the classes below.
+ * @author S.C. Chen
+ */
+// Node types, stored in simple_html_dom_node::$nodetype.
+define('HDOM_TYPE_ELEMENT', 1);
+define('HDOM_TYPE_COMMENT', 2);
+define('HDOM_TYPE_TEXT', 3);
+define('HDOM_TYPE_ENDTAG', 4);
+define('HDOM_TYPE_ROOT', 5);
+define('HDOM_TYPE_UNKNOWN', 6);
+// Attribute quoting styles, consulted by makeup() when re-rendering a tag.
+define('HDOM_QUOTE_DOUBLE', 0);
+define('HDOM_QUOTE_SINGLE', 1);
+define('HDOM_QUOTE_NO', 3);
+// Keys of the per-node "info" array (simple_html_dom_node::$_).
+define('HDOM_INFO_BEGIN', 0);
+define('HDOM_INFO_END', 1);
+define('HDOM_INFO_QUOTE', 2);
+define('HDOM_INFO_SPACE', 3);
+define('HDOM_INFO_TEXT', 4);
+define('HDOM_INFO_INNER', 5);
+define('HDOM_INFO_OUTER', 6);
+define('HDOM_INFO_ENDSPACE',7);
+// Defaults for the optional arguments of file_get_html() / str_get_html().
+define('DEFAULT_TARGET_CHARSET', 'UTF-8');
+define('DEFAULT_BR_TEXT', "\r\n");
+define('DEFAULT_SPAN_TEXT', " ");
+// Documents longer than this many bytes are rejected by file_get_html() / str_get_html().
+define('MAX_FILE_SIZE', 600000);
+// helper functions
+// -----------------------------------------------------------------------------
+/**
+ * Get an html dom from a file or URL.
+ *
+ * @param string $url path or URL handed to file_get_contents()
+ * @param bool $use_include_path passed through to file_get_contents()
+ * @param resource|null $context stream context passed through to file_get_contents()
+ * @param int $offset where to start reading; -1 (the default) means "from the beginning"
+ * @param int $maxLen maximum number of bytes to read; a negative value (the default) means "everything"
+ * @return simple_html_dom|false the parsed dom, or false when nothing was read
+ *         or the content is longer than MAX_FILE_SIZE
+ */
+function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+{
+    // We DO force the tags to be terminated.
+    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
+
+    // -1 is this function's "no offset" sentinel. PHP >= 7.1 interprets a negative
+    // offset as "seek from the end", which fails on non-seekable (remote) streams,
+    // so translate the sentinel into an explicit 0.
+    $startAt = ($offset == -1) ? 0 : $offset;
+
+    // Only forward a length limit the caller actually asked for; a negative
+    // length is not accepted by newer PHP versions.
+    if ($maxLen >= 0)
+    {
+        $contents = file_get_contents($url, $use_include_path, $context, $startAt, $maxLen);
+    }
+    else
+    {
+        $contents = file_get_contents($url, $use_include_path, $context, $startAt);
+    }
+    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
+    //$contents = retrieve_url_contents($url);
+    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
+    {
+        // Release the unused dom, as str_get_html() does on failure.
+        $dom->clear();
+        return false;
+    }
+    // The second parameter can force the selectors to all be lowercase.
+    $dom->load($contents, $lowercase, $stripRN);
+    return $dom;
+}
+
+// get html dom from string
+// Returns false (after clearing the dom) when $str is empty or longer than MAX_FILE_SIZE.
+function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+{
+    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
+
+    $parsable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
+    if ($parsable)
+    {
+        $dom->load($str, $lowercase, $stripRN);
+        return $dom;
+    }
+
+    $dom->clear();
+    return false;
+}
+
+// dump html dom tree
+// Echoes the tree below $node via its dump() method.
+// $show_attr: also print each node's attributes; $deep: starting indentation depth.
+function dump_html_tree($node, $show_attr=true, $deep=0)
+{
+    // Forward the caller's options; dump() takes ($show_attr, $deep), not the node itself.
+    $node->dump($show_attr, $deep);
+}
+
+
+/**
+ * simple html dom node
+ * PaperG - added ability for "find" routine to lowercase the value of the selector.
+ * PaperG - added $tag_start to track the start position of the tag in the total byte index
+ *
+ * @package PlaceLocalInclude
+ */
+class simple_html_dom_node
+{
+ public $nodetype = HDOM_TYPE_TEXT;
+ public $tag = 'text';
+ public $attr = array();
+ public $children = array();
+ public $nodes = array();
+ public $parent = null;
+ // The "info" array - see HDOM_INFO_... for what each element contains.
+ public $_ = array();
+ public $tag_start = 0;
+ private $dom = null;
+
+ // Create a node owned by $dom and register it in the dom's flat node list.
+ function __construct($dom)
+ {
+     $dom->nodes[] = $this;
+     $this->dom = $dom;
+ }
+
+ // Release this node's references (see clear()) when the object is destroyed.
+ function __destruct()
+ {
+ $this->clear();
+ }
+
+ // Casting a node to string yields its outer text (markup including its own tag).
+ function __toString()
+ {
+     $markup = $this->outertext();
+     return $markup;
+ }
+
+ // clean up memory due to php5 circular references memory leak...
+ // Nulls every reference this node holds to the dom and to other nodes.
+ // Note that $nodes and $children become null (not empty arrays) afterwards.
+ function clear()
+ {
+ $this->dom = null;
+ $this->nodes = null;
+ $this->parent = null;
+ $this->children = null;
+ }
+
+ // dump node's tree
+ // Echoes one line per node (tag, optionally followed by its attributes) and
+ // recurses into $this->nodes with $deep increased by one.
+ function dump($show_attr=true, $deep=0)
+ {
+     $indent = str_repeat(' ', $deep);
+     echo $indent . $this->tag;
+
+     $printAttrs = $show_attr && count($this->attr)>0;
+     if ($printAttrs)
+     {
+         echo '(';
+         foreach ($this->attr as $name=>$unused)
+         {
+             // The value is read through the property accessor, not taken from the array entry.
+             echo "[$name]=>\"" . $this->$name . '", ';
+         }
+         echo ')';
+     }
+     echo "\n";
+
+     if (!$this->nodes)
+     {
+         return;
+     }
+     foreach ($this->nodes as $child)
+     {
+         $child->dump($show_attr, $deep+1);
+     }
+ }
+
+
+ // Debugging function to dump a single dom node with a bunch of information about it.
+ // $echo: when true the description is echoed and nothing is returned;
+ // when false the description string is returned instead.
+ function dump_node($echo=true)
+ {
+
+     $string = $this->tag;
+     if (count($this->attr)>0)
+     {
+         $string .= '(';
+         foreach ($this->attr as $k=>$v)
+         {
+             $string .= "[$k]=>\"".$this->$k.'", ';
+         }
+         $string .= ')';
+     }
+     if (count($this->_)>0)
+     {
+         $string .= ' $_ (';
+         foreach ($this->_ as $k=>$v)
+         {
+             if (is_array($v))
+             {
+                 $string .= "[$k]=>(";
+                 foreach ($v as $k2=>$v2)
+                 {
+                     $string .= "[$k2]=>\"".$v2.'", ';
+                 }
+                 $string .= ")";
+             } else {
+                 $string .= "[$k]=>\"".$v.'", ';
+             }
+         }
+         $string .= ")";
+     }
+
+     if (isset($this->text))
+     {
+         $string .= " text: (" . $this->text . ")";
+     }
+
+     $string .= " HDOM_INNER_INFO: '";
+     // Read this node's own info array; there is no $node variable in this scope.
+     if (isset($this->_[HDOM_INFO_INNER]))
+     {
+         $string .= $this->_[HDOM_INFO_INNER] . "'";
+     }
+     else
+     {
+         $string .= ' NULL ';
+     }
+
+     $string .= " children: " . count($this->children);
+     $string .= " nodes: " . count($this->nodes);
+     $string .= " tag_start: " . $this->tag_start;
+     $string .= "\n";
+
+     if ($echo)
+     {
+         echo $string;
+         return;
+     }
+     else
+     {
+         return $string;
+     }
+ }
+
+ // returns the parent of node
+ // If a node is passed in, it becomes the new parent and this node is appended
+ // to that parent's nodes and children lists.
+ function parent($parent=null)
+ {
+     if ($parent === null)
+     {
+         return $this->parent;
+     }
+
+     // I am SURE that this doesn't work properly.
+     // It fails to unset the current node from it's current parents nodes or children list first.
+     $this->parent = $parent;
+     $this->parent->nodes[] = $this;
+     $this->parent->children[] = $this;
+
+     return $this->parent;
+ }
+
+ // verify that node has children
+ function has_child()
+ {
+     if (empty($this->children))
+     {
+         return false;
+     }
+     return true;
+ }
+
+ // returns children of node
+ // With the default index of -1 the whole children array is returned;
+ // otherwise the child at $idx, or null if there is none.
+ function children($idx=-1)
+ {
+     if ($idx===-1)
+     {
+         return $this->children;
+     }
+     return isset($this->children[$idx]) ? $this->children[$idx] : null;
+ }
+
+ // returns the first child of node, or null when there are no children
+ function first_child()
+ {
+     return (count($this->children)>0) ? $this->children[0] : null;
+ }
+
+ // returns the last child of node, or null when there are no children
+ function last_child()
+ {
+     $total = count($this->children);
+     if ($total <= 0)
+     {
+         return null;
+     }
+     return $this->children[$total-1];
+ }
+
+ // returns the next sibling of node
+ // null when the node has no parent, is its parent's last child, or is not
+ // found in the parent's children list.
+ function next_sibling()
+ {
+     if ($this->parent===null)
+     {
+         return null;
+     }
+
+     // Locate this node among its parent's children.
+     $count = count($this->parent->children);
+     for ($idx = 0; $idx < $count; ++$idx)
+     {
+         if ($this===$this->parent->children[$idx])
+         {
+             break;
+         }
+     }
+
+     $next = $idx + 1;
+     return ($next>=$count) ? null : $this->parent->children[$next];
+ }
+
+ // returns the previous sibling of node
+ // null when the node has no parent, is its parent's first child, or is not
+ // found in the parent's children list.
+ function prev_sibling()
+ {
+     if ($this->parent===null) return null;
+     $idx = 0;
+     $count = count($this->parent->children);
+     while ($idx<$count && $this!==$this->parent->children[$idx])
+         ++$idx;
+     // Not among the parent's children: without this check the search index
+     // would end at $count and the parent's last child would be returned.
+     if ($idx>=$count) return null;
+     if (--$idx<0) return null;
+     return $this->parent->children[$idx];
+ }
+
+ // function to locate a specific ancestor tag in the path to the root.
+ // Returns the first node (starting with this one) whose tag equals $tag,
+ // or null when the root is passed without a match.
+ function find_ancestor_tag($tag)
+ {
+     global $debug_object;
+     if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+
+     // Start by including ourselves in the comparison, then climb parent by parent.
+     for ($cursor = $this; !is_null($cursor); $cursor = $cursor->parent)
+     {
+         if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $cursor->tag); }
+
+         if ($cursor->tag == $tag)
+         {
+             return $cursor;
+         }
+     }
+     return $cursor;
+ }
+
+ // get dom node's inner html
+ // Stored inner info wins, then stored text (with noise restored); otherwise
+ // the outer text of every entry in $this->nodes is concatenated.
+ function innertext()
+ {
+     if (isset($this->_[HDOM_INFO_INNER]))
+     {
+         return $this->_[HDOM_INFO_INNER];
+     }
+     if (isset($this->_[HDOM_INFO_TEXT]))
+     {
+         return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+     }
+
+     $html = '';
+     foreach ($this->nodes as $child)
+     {
+         $html .= $child->outertext();
+     }
+     return $html;
+ }
+
+ // get dom node's outer text (with tag)
+ // The root renders as its inner text. Stored outer info or stored text is
+ // returned as is; otherwise the begin tag, the content and (if the node has
+ // an end) the closing tag are rendered.
+ function outertext()
+ {
+     global $debug_object;
+     if (is_object($debug_object))
+     {
+         $text = '';
+         if ($this->tag == 'text')
+         {
+             if (!empty($this->text))
+             {
+                 $text = " with text: " . $this->text;
+             }
+         }
+         $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
+     }
+
+     if ($this->tag==='root') return $this->innertext();
+
+     // trigger callback
+     if ($this->dom && $this->dom->callback!==null)
+     {
+         call_user_func_array($this->dom->callback, array($this));
+     }
+
+     if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
+     if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+
+     // render begin tag
+     if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
+     {
+         $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
+     } else {
+         $ret = "";
+     }
+
+     // render inner text
+     if (isset($this->_[HDOM_INFO_INNER]))
+     {
+         // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
+         if ($this->tag != "br")
+         {
+             $ret .= $this->_[HDOM_INFO_INNER];
+         }
+     } else {
+         if ($this->nodes)
+         {
+             foreach ($this->nodes as $n)
+             {
+                 $ret .= $this->convert_text($n->outertext());
+             }
+         }
+     }
+
+     // render end tag: a closing tag needs its "</" prefix to be valid markup
+     if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
+         $ret .= '</'.$this->tag.'>';
+     return $ret;
+ }
+
+ // get dom node's plain text
+ // Stored inner info wins; text nodes return their noise-restored text;
+ // comments, unknown nodes, script and style return ''. Anything else
+ // concatenates the converted text of its nodes.
+ function text()
+ {
+     if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
+
+     $type = $this->nodetype;
+     if ($type == HDOM_TYPE_TEXT)
+     {
+         return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+     }
+     if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
+     {
+         return '';
+     }
+
+     $isScript = strcasecmp($this->tag, 'script')===0;
+     if ($isScript) return '';
+     $isStyle = strcasecmp($this->tag, 'style')===0;
+     if ($isStyle) return '';
+
+     // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
+     // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
+     // WHY is this happening?
+     if (is_null($this->nodes))
+     {
+         return '';
+     }
+
+     $plain = '';
+     foreach ($this->nodes as $child)
+     {
+         $plain .= $this->convert_text($child->text());
+     }
+
+     // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
+     if ($this->tag == "span")
+     {
+         $plain .= $this->dom->default_span_text;
+     }
+
+     return $plain;
+ }
+
+ // get dom node's inner text with the CDATA wrapper markers removed
+ function xmltext()
+ {
+     $ret = $this->innertext();
+     // Strip the opening marker case-insensitively and the closing marker verbatim.
+     $ret = str_ireplace('<![CDATA[', '', $ret);
+     $ret = str_replace(']]>', '', $ret);
+     return $ret;
+ }
+
+ // build node's text with tag
+ // Re-renders the opening tag from $this->attr, reusing the recorded spacing
+ // and quoting of each attribute; attributes set to null/false are skipped.
+ function makeup()
+ {
+     // text, comment, unknown
+     if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+
+     $ret = '<'.$this->tag;
+
+     // $next counts every attribute, skipped ones included, because the
+     // spacing and quote info is indexed by attribute position.
+     $next = 0;
+     foreach ($this->attr as $key=>$val)
+     {
+         $pos = $next++;
+
+         // skip removed attribute
+         if ($val===null || $val===false)
+         {
+             continue;
+         }
+
+         $ret .= $this->_[HDOM_INFO_SPACE][$pos][0];
+
+         //no value attr: nowrap, checked selected...
+         if ($val===true)
+         {
+             $ret .= $key;
+             continue;
+         }
+
+         $style = $this->_[HDOM_INFO_QUOTE][$pos];
+         if ($style == HDOM_QUOTE_DOUBLE)
+         {
+             $quote = '"';
+         }
+         else if ($style == HDOM_QUOTE_SINGLE)
+         {
+             $quote = '\'';
+         }
+         else
+         {
+             $quote = '';
+         }
+         $ret .= $key.$this->_[HDOM_INFO_SPACE][$pos][1].'='.$this->_[HDOM_INFO_SPACE][$pos][2].$quote.$val.$quote;
+     }
+
+     $ret = $this->dom->restore_noise($ret);
+     return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
+ }
+
+ // find elements by css selector
+ //PaperG - added ability for find to lowercase the value of the selector.
+ // $selector is parsed by parse_selector() (defined elsewhere; presumably one entry
+ // per selector group, each a list of descendant levels - confirm there).
+ // $idx: null returns the array of all matches; otherwise the single match at
+ // that position (negative values count from the end), or null if out of range.
+ // $lowercase is passed through unchanged to seek().
+ function find($selector, $idx=null, $lowercase=false)
+ {
+ $selectors = $this->parse_selector($selector);
+ if (($count=count($selectors))===0) return array();
+ // Matches are collected as keys (indexes into $this->dom->nodes) so duplicates collapse.
+ $found_keys = array();
+
+ // find each selector
+ for ($c=0; $c<$count; ++$c)
+ {
+ // The change on the below line was documented on the sourceforge code tracker id 2788009
+ // used to be: if (($levle=count($selectors[0]))===0) return array();
+ if (($levle=count($selectors[$c]))===0) return array();
+ if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
+
+ // The search starts from this node only.
+ $head = array($this->_[HDOM_INFO_BEGIN]=>1);
+
+ // handle descendant selectors, no recursive!
+ // Each level seeks from every node matched by the previous level.
+ for ($l=0; $l<$levle; ++$l)
+ {
+ $ret = array();
+ foreach ($head as $k=>$v)
+ {
+ $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
+ //PaperG - Pass this optional parameter on to the seek function.
+ $n->seek($selectors[$c][$l], $ret, $lowercase);
+ }
+ $head = $ret;
+ }
+
+ foreach ($head as $k=>$v)
+ {
+ if (!isset($found_keys[$k]))
+ $found_keys[$k] = 1;
+ }
+ }
+
+ // sort keys
+ ksort($found_keys);
+
+ $found = array();
+ foreach ($found_keys as $k=>$v)
+ $found[] = $this->dom->nodes[$k];
+
+ // return nth-element or array
+ if (is_null($idx)) return $found;
+ else if ($idx<0) $idx = count($found) + $idx;
+ return (isset($found[$idx])) ? $found[$idx] : null;
+ }
+
+ /**
+  * Collect the nodes below this one that satisfy a single parsed selector part.
+  *
+  * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
+  *
+  * @param array $selector  Five-element tuple (tag, key, val, exp, no_key) as produced by parse_selector().
+  * @param array $ret       Output, passed by reference: matching node indexes are added as keys with value 1.
+  * @param bool  $lowercase When true, selector value and node value are lower-cased before comparison.
+  * @return void The result is delivered through $ret.
+  */
+ protected function seek($selector, &$ret, $lowercase=false)
+ {
+     global $debug_object;
+     if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+
+     list($tag, $key, $val, $exp, $no_key) = $selector;
+
+     // xpath index: a numeric key selects the n-th direct child with the given tag
+     if ($tag && $key && is_numeric($key))
+     {
+         $count = 0;
+         foreach ($this->children as $c)
+         {
+             if ($tag==='*' || $tag===$c->tag) {
+                 if (++$count==$key) {
+                     $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
+                     return;
+                 }
+             }
+         }
+         return;
+     }
+
+     // Work out where this node's range of descendants ends. When the node has no
+     // end index of its own, climb to the nearest ancestor that has one.
+     $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
+     if ($end==0) {
+         $parent = $this->parent;
+         // Test for null BEFORE touching $parent.
+         while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
+             $end -= 1;
+             $parent = $parent->parent;
+         }
+         // If the chain ran out without finding an end index there is nothing to add;
+         // $end stays <= 0 and the scan below is skipped instead of dereferencing null.
+         if ($parent!==null) {
+             $end += $parent->_[HDOM_INFO_END];
+         }
+     }
+
+     for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
+         $node = $this->dom->nodes[$i];
+
+         $pass = true;
+
+         // Bare "*" selects direct children only
+         if ($tag==='*' && !$key) {
+             if (in_array($node, $this->children, true))
+                 $ret[$i] = 1;
+             continue;
+         }
+
+         // compare tag
+         if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
+         // compare key
+         if ($pass && $key) {
+             if ($no_key) {
+                 // [!attr]: the attribute must be absent
+                 if (isset($node->attr[$key])) $pass=false;
+             } else {
+                 // "plaintext" is a pseudo-attribute, so it is never looked up in attr
+                 if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
+             }
+         }
+         // compare value
+         if ($pass && $key && $val && $val!=='*') {
+             if ($key == "plaintext") {
+                 // A "plaintext" search compares against the node's text()
+                 $nodeKeyValue = $node->text();
+             } else {
+                 // A normal search compares against the attribute value
+                 $nodeKeyValue = $node->attr[$key];
+             }
+             if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
+
+             //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
+             if ($lowercase) {
+                 $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
+             } else {
+                 $check = $this->match($exp, $val, $nodeKeyValue);
+             }
+             if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}
+
+             // handle multiple class: retry against each space-separated class name
+             if (!$check && strcasecmp($key, 'class')===0) {
+                 foreach (explode(' ',$node->attr[$key]) as $k) {
+                     // Leading, trailing or doubled spaces yield blank pieces; skip them.
+                     if (!empty($k)) {
+                         if ($lowercase) {
+                             $check = $this->match($exp, strtolower($val), strtolower($k));
+                         } else {
+                             $check = $this->match($exp, $val, $k);
+                         }
+                         if ($check) break;
+                     }
+                 }
+             }
+             if (!$check) $pass = false;
+         }
+         if ($pass) $ret[$i] = 1;
+         unset($node);
+     }
+     // It's passed by reference so this is actually what this function returns.
+     if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
+ }
+
+ /**
+  * Test a node value against a selector value using an attribute-selector operator.
+  *
+  * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
+  * @param string $pattern Value taken from the selector. For '*=' it is used as a regular
+  *                        expression: as-is when it starts with '/', otherwise wrapped in /.../i.
+  * @param string $value   Value taken from the node.
+  * @return mixed bool for '=' and '!=', the preg_match() result for the other operators,
+  *               false for an unknown operator.
+  */
+ protected function match($exp, $pattern, $value) {
+     global $debug_object;
+     if (is_object($debug_object)) {
+         $debug_object->debugLogEntry(1);
+     }
+
+     // exact equality / inequality
+     if ($exp == '=') {
+         return ($value===$pattern);
+     }
+     if ($exp == '!=') {
+         return ($value!==$pattern);
+     }
+
+     // prefix match
+     if ($exp == '^=') {
+         $regex = "/^".preg_quote($pattern,'/')."/";
+         return preg_match($regex, $value);
+     }
+
+     // suffix match
+     if ($exp == '$=') {
+         $regex = "/".preg_quote($pattern,'/')."$/";
+         return preg_match($regex, $value);
+     }
+
+     // contains / regular expression
+     if ($exp == '*=') {
+         $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
+         return preg_match($regex, $value);
+     }
+
+     return false;
+ }
+
+ /**
+  * Split a selector string into groups of parsed selector parts.
+  *
+  * @param string $selector_string CSS-style selector; ',' separates groups, spaces or '/' separate levels.
+  * @return array List of groups; each group is a list of tuples
+  *               array(tag, key, val, exp, no_key) in the form seek() consumes.
+  */
+ protected function parse_selector($selector_string) {
+     global $debug_object;
+     if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}
+
+     // pattern of CSS selectors, modified from mootools
+     // Paperg: Add the colon to the attribute name, so that namespaced names (x:y) are found.
+     // Note: to read such an attribute you MUST use getAttribute, since $dom->x:y fails the php syntax check.
+     // The "@?" after "\[" means an attribute name may be written with a leading @ sign, which is not captured.
+     // Further study is required to determine if that should be documented or removed.
+     //
+     // The hyphen is placed LAST in every character class. Written as [\w-:\*] / [\w-:] it sits
+     // between a class escape and a literal, which PCRE2 (PHP 7.3+) rejects with
+     // "invalid range in character class"; the set of accepted characters is unchanged.
+     $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
+     // A trailing space guarantees the final part is terminated by a separator.
+     preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
+     if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}
+
+     $selectors = array();
+     $result = array();
+
+     foreach ($matches as $m) {
+         $m[0] = trim($m[0]);
+         if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
+         // for browser generated xpath
+         if ($m[1]==='tbody') continue;
+
+         list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
+         if (!empty($m[2])) {$key='id'; $val=$m[2];}
+         if (!empty($m[3])) {$key='class'; $val=$m[3];}
+         if (!empty($m[4])) {$key=$m[4];}
+         if (!empty($m[5])) {$exp=$m[5];}
+         if (!empty($m[6])) {$val=$m[6];}
+
+         // convert to lowercase; the cast keeps the historical "null becomes ''" result
+         // without passing null to strtolower(), which PHP 8.1 deprecates
+         if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string)$key);}
+         //elements that do NOT have the specified attribute
+         if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
+
+         $result[] = array($tag, $key, $val, $exp, $no_key);
+         // a comma closes the current group
+         if (trim($m[7])===',') {
+             $selectors[] = $result;
+             $result = array();
+         }
+     }
+     if (count($result)>0)
+         $selectors[] = $result;
+     return $selectors;
+ }
+
+ /**
+  * Magic reader: attributes first, then the text pseudo-properties.
+  *
+  * @param string $name Attribute name, or one of outertext / innertext / plaintext / xmltext.
+  * @return mixed The attribute value passed through convert_text(), the requested text form,
+  *               or a boolean telling whether the attribute key exists at all.
+  */
+ function __get($name) {
+     // A set attribute wins over any pseudo-property of the same name
+     if (isset($this->attr[$name])) {
+         return $this->convert_text($this->attr[$name]);
+     }
+
+     if ($name == 'outertext') {
+         return $this->outertext();
+     }
+     if ($name == 'innertext') {
+         return $this->innertext();
+     }
+     if ($name == 'plaintext') {
+         return $this->text();
+     }
+     if ($name == 'xmltext') {
+         return $this->xmltext();
+     }
+
+     return array_key_exists($name, $this->attr);
+ }
+
+ /**
+  * Magic writer: overrides the outer/inner text or sets an attribute.
+  *
+  * @param string $name  'outertext', 'innertext', or an attribute name.
+  * @param mixed  $value New text or attribute value.
+  * @return mixed The assigned value for the text pseudo-properties, nothing for attributes.
+  */
+ function __set($name, $value) {
+     if ($name == 'outertext') {
+         return $this->_[HDOM_INFO_OUTER] = $value;
+     }
+
+     if ($name == 'innertext') {
+         // When the node already carries a text slot, write there; otherwise use the inner slot
+         $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
+         return $this->_[$slot] = $value;
+     }
+
+     // An attribute that is not currently set gets default spacing and double quotes
+     $isNew = !isset($this->attr[$name]);
+     if ($isNew) {
+         $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
+         $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
+     }
+     $this->attr[$name] = $value;
+ }
+
+ /**
+  * Magic isset(): the text pseudo-properties always exist; anything else is an attribute lookup.
+  *
+  * @param string $name Property or attribute name.
+  * @return bool True for outertext / innertext / plaintext, or when the attribute key is present
+  *              (whatever its value).
+  */
+ function __isset($name) {
+     if (in_array($name, array('outertext', 'innertext', 'plaintext'))) {
+         return true;
+     }
+     // Key presence is what counts, so any stored value (including null) reports true.
+     return array_key_exists($name, $this->attr);
+ }
+
+ /**
+  * Magic unset(): drop an attribute that is currently set.
+  *
+  * @param string $name Attribute name.
+  */
+ function __unset($name) {
+     if (!isset($this->attr[$name])) {
+         // Not set (or stored as null): leave the attribute table untouched
+         return;
+     }
+     unset($this->attr[$name]);
+ }
+
+ /**
+  * PaperG - Convert text from the document's character set to the target character set
+  * when the two differ, and strip a UTF-8 byte order mark from either end.
+  *
+  * @param string $text Text in the document's source charset.
+  * @return string The converted text. If iconv() cannot convert it, the text is returned
+  *                unconverted rather than false.
+  */
+ function convert_text($text)
+ {
+     global $debug_object;
+     if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}
+
+     $converted_text = $text;
+
+     $sourceCharset = "";
+     $targetCharset = "";
+
+     if ($this->dom)
+     {
+         $sourceCharset = strtoupper($this->dom->_charset);
+         $targetCharset = strtoupper($this->dom->_target_charset);
+     }
+     if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
+
+     if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
+     {
+         // The reported encoding may be wrong: text that is already valid UTF-8 is left alone
+         // when UTF-8 is the target.
+         $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
+         if (!$alreadyUtf8)
+         {
+             $recoded = iconv($sourceCharset, $targetCharset, $text);
+             // iconv() returns false on failure; keep the original text in that case so
+             // callers never receive false in place of a string.
+             if ($recoded !== false)
+             {
+                 $converted_text = $recoded;
+             }
+         }
+     }
+
+     // Make sure no BOM is left at the start or end of UTF-8 output.
+     if ($targetCharset == 'UTF-8')
+     {
+         if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
+         {
+             $converted_text = substr($converted_text, 3);
+         }
+         if (substr($converted_text, -3) == "\xef\xbb\xbf")
+         {
+             $converted_text = substr($converted_text, 0, -3);
+         }
+     }
+
+     return $converted_text;
+ }
+
+ /**
+  * Returns true if $str is structurally valid UTF-8 and false otherwise.
+  *
+  * Every byte >= 0x80 must be a lead byte followed by the right number of
+  * continuation bytes (0x80-0xBF). The check is lenient in that it still accepts
+  * the legacy 5- and 6-byte lead bytes and does not reject overlong encodings.
+  *
+  * @param mixed $str String to be tested
+  * @return boolean
+  */
+ static function is_utf8($str)
+ {
+     $len = strlen($str);
+     for ($i = 0; $i < $len; $i++)
+     {
+         $c = ord($str[$i]);
+         // 0x80 itself is a continuation byte, so the non-ASCII range starts AT 128, not after it.
+         if ($c >= 128)
+         {
+             // Total sequence length implied by the lead byte
+             if ($c >= 254) return false;
+             elseif ($c >= 252) $bits = 6;
+             elseif ($c >= 248) $bits = 5;
+             elseif ($c >= 240) $bits = 4;
+             elseif ($c >= 224) $bits = 3;
+             elseif ($c >= 192) $bits = 2;
+             else return false; // 0x80-0xBF: continuation byte where a lead byte is required
+
+             // The sequence must not run past the end of the string
+             if (($i + $bits) > $len) return false;
+
+             // Every remaining byte of the sequence must be a continuation byte
+             while ($bits > 1)
+             {
+                 $i++;
+                 $b = ord($str[$i]);
+                 if ($b < 128 || $b > 191) return false;
+                 $bits--;
+             }
+         }
+     }
+     return true;
+ }
+ /*
+ function is_utf8($string)
+ {
+ //this is buggy
+ return (utf8_encode(utf8_decode($string)) == $string);
+ }
+ */
+
+ /**
+  * Function to try a few tricks to determine the displayed size of an img on the page.
+  * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
+  *
+  * The width/height attributes of the tag are used first; a dimension that is still unknown
+  * is then taken from a pixel value ("NNpx") in the inline style attribute.
+  *
+  * @author John Schlick
+  * @version April 19 2012
+  * @return array|false array('height' => ..., 'width' => ...) where a dimension that could not be
+  *                     determined is -1; false when this node is not an img tag.
+  */
+ function get_display_size()
+ {
+     if ($this->tag !== 'img')
+     {
+         return false;
+     }
+
+     $size = array('height' => -1, 'width' => -1);
+     $dimensions = array('width', 'height');
+
+     // See if there is a height or width attribute in the tag itself.
+     foreach ($dimensions as $dimension)
+     {
+         if (isset($this->attr[$dimension]))
+         {
+             $size[$dimension] = $this->attr[$dimension];
+         }
+     }
+
+     // Now look for an inline style.
+     if (isset($this->attr['style']))
+     {
+         // Thanks to user gnarf from stackoverflow for this regular expression.
+         $attributes = array();
+         preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
+         foreach ($matches as $match) {
+             $attributes[$match[1]] = $match[2];
+         }
+
+         foreach ($dimensions as $dimension)
+         {
+             // Only consult the style when the tag attribute did not already supply a value.
+             if (!isset($attributes[$dimension]) || $size[$dimension] != -1)
+             {
+                 continue;
+             }
+
+             // The regex's [^;]+ also swallows whitespace before the ';', so trim first.
+             $css_value = trim($attributes[$dimension]);
+
+             // check that the last two characters are px (pixels)
+             if (strtolower(substr($css_value, -2)) != 'px')
+             {
+                 continue;
+             }
+
+             $proposed = substr($css_value, 0, -2);
+             // Make sure it is an integer. filter_var() returns the integer itself, so compare
+             // against false explicitly - a plain truthiness test would reject "0px".
+             if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
+             {
+                 $size[$dimension] = $proposed;
+             }
+         }
+     }
+
+     // Future enhancement:
+     // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
+
+     // Far future enhancement
+     // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
+     // Note that in this case, the class or id will have the img subselector for it to apply to the image.
+
+     // ridiculously far future development
+     // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
+
+     return array('height' => $size['height'],
+                  'width' => $size['width']);
+ }
+
+ // camel naming conventions
+ // --- attribute access -------------------------------------------------
+
+ /** @return array The raw attribute table of this node. */
+ function getAllAttributes()
+ {
+     return $this->attr;
+ }
+
+ /** Same as reading the attribute through __get(). */
+ function getAttribute($name)
+ {
+     return $this->__get($name);
+ }
+
+ /** Same as writing the attribute through __set(). */
+ function setAttribute($name, $value)
+ {
+     $this->__set($name, $value);
+ }
+
+ /** Same as isset() on the attribute, via __isset(). */
+ function hasAttribute($name)
+ {
+     return $this->__isset($name);
+ }
+
+ /** Marks the attribute as removed by storing null for it. */
+ function removeAttribute($name)
+ {
+     $this->__set($name, null);
+ }
+
+ // --- lookup -----------------------------------------------------------
+
+ /** First element found for the selector "#$id". */
+ function getElementById($id)
+ {
+     return $this->find("#$id", 0);
+ }
+
+ /** All elements found for "#$id", or the one at $idx. */
+ function getElementsById($id, $idx=null)
+ {
+     return $this->find("#$id", $idx);
+ }
+
+ /** First element found for the tag selector $name. */
+ function getElementByTagName($name)
+ {
+     return $this->find($name, 0);
+ }
+
+ /** All elements found for the tag selector $name, or the one at $idx. */
+ function getElementsByTagName($name, $idx=null)
+ {
+     return $this->find($name, $idx);
+ }
+
+ // --- tree navigation --------------------------------------------------
+
+ /** Alias of parent(). */
+ function parentNode()
+ {
+     return $this->parent();
+ }
+
+ /** Alias of children($idx). */
+ function childNodes($idx=-1)
+ {
+     return $this->children($idx);
+ }
+
+ /** Alias of first_child(). */
+ function firstChild()
+ {
+     return $this->first_child();
+ }
+
+ /** Alias of last_child(). */
+ function lastChild()
+ {
+     return $this->last_child();
+ }
+
+ /** Alias of next_sibling(). */
+ function nextSibling()
+ {
+     return $this->next_sibling();
+ }
+
+ /** Alias of prev_sibling(). */
+ function previousSibling()
+ {
+     return $this->prev_sibling();
+ }
+
+ /** Alias of has_child(). */
+ function hasChildNodes()
+ {
+     return $this->has_child();
+ }
+
+ /** @return string The tag name of this node. */
+ function nodeName()
+ {
+     return $this->tag;
+ }
+
+ /** Calls $node->parent($this) and hands the same node back. */
+ function appendChild($node)
+ {
+     $node->parent($this);
+     return $node;
+ }
+
+}
+
+/**
+ * simple html dom parser
+ * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
+ * Paperg - change $size from protected to public so we can easily access it
+ * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
+ *
+ * @package PlaceLocalInclude
+ */
+class simple_html_dom
+{
+ // Root node and the flat node table. Child nodes read both through their dom
+ // reference; the table is indexed by each node's begin index.
+ public $root = null;
+ public $nodes = array();
+ // NOTE(review): presumably a user-supplied callback; its use is not visible in this part of the file.
+ public $callback = null;
+ // When true, selector tag and attribute names are lower-cased before matching.
+ public $lowercase = false;
+ // Used to keep track of how large the text was when we started.
+ public $original_size;
+ public $size;
+ // Parser state. NOTE(review): meanings inferred from the names only - confirm against the parse routines.
+ protected $pos;
+ protected $doc;
+ protected $char;
+ protected $cursor;
+ protected $parent;
+ protected $noise = array();
+ // Character sets used as token delimiters by the parser.
+ protected $token_blank = " \t\r\n";
+ protected $token_equal = ' =/>';
+ protected $token_slash = " />\r\n\t";
+ protected $token_attr = ' >';
+ // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
+ // Nodes read these as the source and target character sets when converting text.
+ public $_charset = '';
+ public $_target_charset = '';
+ protected $default_br_text = "";
+ public $default_span_text = "";
+
+ // use isset instead of in_array, performance boost about 30%...
+ // (hence the tag => 1 maps rather than plain lists)
+ protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
+ protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
+ // Known sourceforge issue #2977341
+ // B tags that are not closed cause us to return everything to the end of the document.
+ // NOTE(review): presumably maps an open tag to the tags whose appearance implicitly closes it - confirm in the parser.
+ protected $optional_closing_tags = array(
+ 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
+ 'th'=>array('th'=>1),
+ 'td'=>array('td'=>1),
+ 'li'=>array('li'=>1),
+ 'dt'=>array('dt'=>1, 'dd'=>1),
+ 'dd'=>array('dd'=>1, 'dt'=>1),
+ 'dl'=>array('dd'=>1, 'dt'=>1),
+ 'p'=>array('p'=>1),
+ 'nobr'=>array('nobr'=>1),
+ 'b'=>array('b'=>1),
+ 'option'=>array('option'=>1),
+ );
+
+ /**
+  * Create a DOM object, optionally loading HTML straight away.
+  *
+  * @param string|null $str             HTML text, an http:// URL, or the path of an existing file; falsy loads nothing.
+  * @param bool        $lowercase       Passed to load() when $str is HTML text.
+  * @param bool        $forceTagsClosed When false the optional-closing-tag table is emptied, i.e. the HTML is
+  *                                     trusted. This takes effect for loads performed AFTER construction;
+  *                                     content given in $str has already been loaded at that point.
+  * @param string      $target_charset  Charset that node text is converted to.
+  * @param bool        $stripRN         Passed to load().
+  * @param string      $defaultBRText   Passed to load().
+  * @param string      $defaultSpanText Passed to load().
+  */
+ function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+ {
+     if ($str)
+     {
+         if (preg_match("/^http:\/\//i",$str) || is_file($str))
+         {
+             $this->load_file($str);
+         }
+         else
+         {
+             $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
+         }
+     }
+     // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
+     if (!$forceTagsClosed) {
+         // Must clear the declared $optional_closing_tags table. Assigning to an undeclared
+         // property instead would leave the table intact and silently ignore the flag.
+         $this->optional_closing_tags = array();
+     }
+     $this->_target_charset = $target_charset;
+ }
+
+ // Destructor: hands off to clear() when the object is destroyed.
+ // NOTE(review): clear() is defined outside this part of the file; presumably it releases the node tree - confirm.
+ function __destruct()
+ {
+ $this->clear();
+ }
+
+ // load html from string
+ function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+ {
+ global $debug_object;
+
+ // prepare
+ $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
+ // strip out comments
+ $this->remove_noise("''is");
+ // strip out cdata
+ $this->remove_noise("''is", true);
+ // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
+ // Script tags removal now preceeds style tag removal.
+ // strip out