From 3db95a85de1297908e780742cd0b7d779c5f522e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Wed, 31 Jul 2013 19:09:06 +0200 Subject: update external libs --- inc/Readability.php | 198 ++++++++++++++++++++++++++-------------------------- 1 file changed, 99 insertions(+), 99 deletions(-) (limited to 'inc/Readability.php') diff --git a/inc/Readability.php b/inc/Readability.php index d28d28f9..e1e8738b 100644 --- a/inc/Readability.php +++ b/inc/Readability.php @@ -1,5 +1,5 @@ '/((\s| ?)*){1,}/', 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' - ); - + ); + /* constants */ const FLAG_STRIP_UNLIKELYS = 1; const FLAG_WEIGHT_CLASSES = 2; const FLAG_CLEAN_CONDITIONALLY = 4; - + /** * Create instance of Readability * @param string UTF-8 encoded string * @param string (optional) URL associated with HTML (used for footnotes) * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') - */ + */ function __construct($html, $url=null, $parser='libxml') { $this->url = $url; @@ -135,18 +135,18 @@ class Readability public function getTitle() { return $this->articleTitle; } - + /** * Get article content element * @return DOMElement */ public function getContent() { return $this->articleContent; - } - + } + /** * Runs readability. - * + * * Workflow: * 1. Prep the document by removing script tags, css, etc. * 2. Build readability's DOM tree. @@ -161,7 +161,7 @@ class Readability if (!isset($this->dom->documentElement)) return false; $this->removeScripts($this->dom); //die($this->getInnerHTML($this->dom->documentElement)); - + // Assume successful outcome $this->success = true; @@ -176,7 +176,7 @@ class Readability } $this->prepDocument(); - + //die($this->dom->documentElement->parentNode->nodeType); //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); //die($this->getInnerHTML($this->dom->documentElement)); @@ -191,9 +191,9 @@ class Readability $this->success = false; $articleContent = $this->dom->createElement('div'); $articleContent->setAttribute('id', 'readability-content'); - $articleContent->innerHTML = '

Sorry, Readability was unable to parse this page for content.

'; + $articleContent->innerHTML = '

Sorry, Readability was unable to parse this page for content.

'; } - + $overlay->setAttribute('id', 'readOverlay'); $innerDiv->setAttribute('id', 'readInner'); @@ -201,7 +201,7 @@ class Readability $innerDiv->appendChild($articleTitle); $innerDiv->appendChild($articleContent); $overlay->appendChild($innerDiv); - + /* Clear the old HTML, insert the new content. */ $this->body->innerHTML = ''; $this->body->appendChild($overlay); @@ -209,21 +209,21 @@ class Readability $this->body->removeAttribute('style'); $this->postProcessContent($articleContent); - + // Set title and content instance variables $this->articleTitle = $articleTitle; $this->articleContent = $articleContent; - + return $this->success; } - + /** * Debug */ protected function dbg($msg) { if ($this->debug) echo '* ',$msg, "\n"; } - + /** * Run any post-process modifications to article content as necessary. * @@ -231,11 +231,11 @@ class Readability * @return void */ public function postProcessContent($articleContent) { - if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { + if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { $this->addFootnotes($articleContent); } } - + /** * Get the article title as an H1. * @@ -248,11 +248,11 @@ class Readability try { $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); } catch(Exception $e) {} - + if (preg_match('/ [\|\-] /', $curTitle)) { $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); - + if (count(explode(' ', $curTitle)) < 3) { $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); } @@ -279,17 +279,17 @@ class Readability if (count(explode(' ', $curTitle)) <= 4) { $curTitle = $origTitle; } - + $articleTitle = $this->dom->createElement('h1'); $articleTitle->innerHTML = $curTitle; - + return $articleTitle; } - + /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. - * + * * @return void **/ protected function prepDocument() { @@ -328,13 +328,13 @@ class Readability $footnotesWrapper = $this->dom->createElement('div'); $footnotesWrapper->setAttribute('id', 'readability-footnotes'); $footnotesWrapper->innerHTML = '

References

'; - + $articleFootnotes = $this->dom->createElement('ol'); $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); $footnotesWrapper->appendChild($articleFootnotes); - + $articleLinks = $articleContent->getElementsByTagName('a'); - + $linkCount = 0; for ($i = 0; $i < $articleLinks->length; $i++) { @@ -346,11 +346,11 @@ class Readability if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, $linkText = $this->getInnerText($articleLink); - + if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { continue; } - + $linkCount++; /** Add a superscript reference after the article link */ @@ -358,7 +358,7 @@ class Readability $refLink->innerHTML = '[' . $linkCount . ']'; $refLink->setAttribute('class', 'readability-DoNotFootnote'); $refLink->setAttribute('style', 'color: inherit;'); - + //TODO: does this work or should we use DOMNode.isSameNode()? if ($articleLink->parentNode->lastChild == $articleLink) { $articleLink->parentNode->appendChild($refLink); @@ -373,15 +373,15 @@ class Readability $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); - + $footnote->appendChild($footnoteLink); if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . ' (' . $linkDomain . ')'; - + $articleFootnotes->appendChild($footnote); } if ($linkCount > 0) { - $articleContent->appendChild($footnotesWrapper); + $articleContent->appendChild($footnotesWrapper); } } @@ -404,7 +404,7 @@ class Readability //} } } - + /** * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous

tags, etc. @@ -429,7 +429,7 @@ class Readability * as a header and not a subheader, so remove it since we already have a header. ***/ if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { - $this->clean($articleContent, 'h2'); + $this->clean($articleContent, 'h2'); } $this->clean($articleContent, 'iframe'); @@ -448,7 +448,7 @@ class Readability $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; - + if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') { $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); @@ -457,13 +457,13 @@ class Readability try { $articleContent->innerHTML = preg_replace('/]*>\s*

innerHTML); - //articleContent.innerHTML = articleContent.innerHTML.replace(/]*>\s*

]*>\s*

dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); } } - + /** * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. @@ -474,7 +474,7 @@ class Readability protected function initializeNode($node) { $readability = $this->dom->createAttribute('readability'); $readability->value = 0; // this is our contentScore - $node->setAttributeNode($readability); + $node->setAttributeNode($readability); switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case case 'DIV': @@ -486,7 +486,7 @@ class Readability case 'BLOCKQUOTE': $readability->value += 3; break; - + case 'ADDRESS': case 'OL': case 'UL': @@ -510,7 +510,7 @@ class Readability } $readability->value += $this->getClassWeight($node); } - + /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. @@ -548,7 +548,7 @@ class Readability $node->parentNode->removeChild($node); $nodeIndex--; continue; - } + } } if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { @@ -589,7 +589,7 @@ class Readability } } } - + /** * Loop through all paragraphs, and assign a score to them based on how content-y they look. * Then add their score to their parent node. @@ -613,7 +613,7 @@ class Readability } /* Initialize readability data for the parent. */ - if (!$parentNode->hasAttribute('readability')) + if (!$parentNode->hasAttribute('readability')) { $this->initializeNode($parentNode); $candidates[] = $parentNode; @@ -633,15 +633,15 @@ class Readability /* Add points for any commas within this paragraph */ $contentScore += count(explode(',', $innerText)); - + /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ $contentScore += min(floor(strlen($innerText) / 100), 3); - + /* Add the score to the parent. The grandparent gets half. */ $parentNode->getAttributeNode('readability')->value += $contentScore; if ($grandParentNode) { - $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; + $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; } } @@ -727,12 +727,12 @@ class Readability { $append = true; } - + if (strtoupper($siblingNode->nodeName) == 'P') { $linkDensity = $this->getLinkDensity($siblingNode); $nodeContent = $this->getInnerText($siblingNode); $nodeLength = strlen($nodeContent); - + if ($nodeLength > 80 && $linkDensity < 0.25) { $append = true; @@ -751,7 +751,7 @@ class Readability $sibNodeName = strtoupper($siblingNode->nodeName); if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ - + $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); $nodeToAppend = $this->dom->createElement('div'); try { @@ -770,7 +770,7 @@ class Readability $s--; $sl--; } - + /* To ensure a node does not interfere with readability styles, remove its classnames */ $nodeToAppend->removeAttribute('class'); @@ -796,14 +796,14 @@ class Readability // in the meantime, we check and create an empty element if it's not there. if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); $this->body->innerHTML = $this->bodyCache; - + if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); return $this->grabArticle($this->body); } else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { $this->removeFlag(self::FLAG_WEIGHT_CLASSES); - return $this->grabArticle($this->body); + return $this->grabArticle($this->body); } else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); @@ -815,7 +815,7 @@ class Readability } return $articleContent; } - + /** * Remove script tags from document * @@ -829,7 +829,7 @@ class Readability $scripts->item($i)->parentNode->removeChild($scripts->item($i)); } } - + /** * Get the inner text of a node. * This also strips out any excess whitespace to be found. @@ -878,11 +878,11 @@ class Readability $elem->removeAttribute('style'); } } - + /** * Get the density of links as a percentage of the content * This is the amount of text that is inside a link divided by the total text in the node. - * + * * @param DOMElement $e * @return number (float) */ @@ -900,9 +900,9 @@ class Readability return 0; } } - + /** - * Get an elements class/id weight. Uses regular expressions to tell if this + * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. * * @param DOMElement $e @@ -964,7 +964,7 @@ class Readability public function clean($e, $tag) { $targetList = $e->getElementsByTagName($tag); $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); - + for ($y=$targetList->length-1; $y >= 0; $y--) { /* Allow youtube and vimeo videos through as people usually want to see those. */ if ($isEmbed) { @@ -972,7 +972,7 @@ class Readability for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) } - + /* First, check the elements attributes to see if any of them contain youtube or vimeo */ if (preg_match($this->regexps['video'], $attributeValues)) { continue; @@ -986,10 +986,10 @@ class Readability $targetList->item($y)->parentNode->removeChild($targetList->item($y)); } } - + /** * Clean an element of all tags of type "tag" if they look fishy. - * "Fishy" is an algorithm based on content length, classnames, + * "Fishy" is an algorithm based on content length, classnames, * link density, number of images & embeds, etc. * * @param DOMElement $e @@ -1013,7 +1013,7 @@ class Readability for ($i=$curTagsLength-1; $i >= 0; $i--) { $weight = $this->getClassWeight($tagsList->item($i)); $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; - + $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); if ($weight + $contentScore < 0) { @@ -1034,13 +1034,13 @@ class Readability $embeds = $tagsList->item($i)->getElementsByTagName('embed'); for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; + $embedCount++; } } $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { - $embedCount++; + $embedCount++; } } @@ -1058,7 +1058,7 @@ class Readability $toRemove = true; } else if ( $input > floor($p/3) ) { $this->dbg(' too many elements'); - $toRemove = true; + $toRemove = true; } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); $toRemove = true; @@ -1082,7 +1082,7 @@ class Readability $toRemove = true; } else if ( $input > floor($p/3) ) { $this->dbg(' too many elements'); - $toRemove = true; + $toRemove = true; } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); $toRemove = true; @@ -1126,11 +1126,11 @@ class Readability public function flagIsActive($flag) { return ($this->flags & $flag) > 0; } - + public function addFlag($flag) { $this->flags = $this->flags | $flag; } - + public function removeFlag($flag) { $this->flags = $this->flags & ~$flag; } -- cgit v1.2.3