aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--inc/JSLikeHTMLElement.php12
-rw-r--r--inc/Readability.php198
2 files changed, 105 insertions, 105 deletions
diff --git a/inc/JSLikeHTMLElement.php b/inc/JSLikeHTMLElement.php
index dfcc1be5..238ba8a8 100644
--- a/inc/JSLikeHTMLElement.php
+++ b/inc/JSLikeHTMLElement.php
@@ -4,7 +4,7 @@
4* 4*
5* This class extends PHP's DOMElement to allow 5* This class extends PHP's DOMElement to allow
6* users to get and set the innerHTML property of 6* users to get and set the innerHTML property of
7* HTML elements in the same way it's done in 7* HTML elements in the same way it's done in
8* JavaScript. 8* JavaScript.
9* 9*
10* Example usage: 10* Example usage:
@@ -15,16 +15,16 @@
15* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); 15* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
16* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>'); 16* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
17* $elem = $doc->getElementsByTagName('div')->item(0); 17* $elem = $doc->getElementsByTagName('div')->item(0);
18* 18*
19* // print innerHTML 19* // print innerHTML
20* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>' 20* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
21* echo "\n\n"; 21* echo "\n\n";
22* 22*
23* // set innerHTML 23* // set innerHTML
24* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; 24* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
25* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' 25* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
26* echo "\n\n"; 26* echo "\n\n";
27* 27*
28* // print document (with our changes) 28* // print document (with our changes)
29* echo $doc->saveXML(); 29* echo $doc->saveXML();
30* @endcode 30* @endcode
@@ -59,7 +59,7 @@ class JSLikeHTMLElement extends DOMElement
59 $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8'); 59 $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
60 // Using <htmlfragment> will generate a warning, but so will bad HTML 60 // Using <htmlfragment> will generate a warning, but so will bad HTML
61 // (and by this point, bad HTML is what we've got). 61 // (and by this point, bad HTML is what we've got).
62 // We use it (and suppress the warning) because an HTML fragment will 62 // We use it (and suppress the warning) because an HTML fragment will
63 // be wrapped around <html><body> tags which we don't really want to keep. 63 // be wrapped around <html><body> tags which we don't really want to keep.
64 // Note: despite the warning, if loadHTML succeeds it will return true. 64 // Note: despite the warning, if loadHTML succeeds it will return true.
65 $result = @$f->loadHTML('<htmlfragment>'.$value.'</htmlfragment>'); 65 $result = @$f->loadHTML('<htmlfragment>'.$value.'</htmlfragment>');
@@ -86,7 +86,7 @@ class JSLikeHTMLElement extends DOMElement
86 * @code 86 * @code
87 * $string = $div->innerHTML; 87 * $string = $div->innerHTML;
88 * @endcode 88 * @endcode
89 */ 89 */
90 public function __get($name) 90 public function __get($name)
91 { 91 {
92 if ($name == 'innerHTML') { 92 if ($name == 'innerHTML') {
diff --git a/inc/Readability.php b/inc/Readability.php
index d28d28f9..e1e8738b 100644
--- a/inc/Readability.php
+++ b/inc/Readability.php
@@ -1,5 +1,5 @@
1<?php 1<?php
2/** 2/**
3* Arc90's Readability ported to PHP for FiveFilters.org 3* Arc90's Readability ported to PHP for FiveFilters.org
4* Based on readability.js version 1.7.1 (without multi-page support) 4* Based on readability.js version 1.7.1 (without multi-page support)
5* Updated to allow HTML5 parsing with html5lib 5* Updated to allow HTML5 parsing with html5lib
@@ -13,34 +13,34 @@
13* License: Apache License, Version 2.0 13* License: Apache License, Version 2.0
14* Requires: PHP5 14* Requires: PHP5
15* Date: 2012-09-19 15* Date: 2012-09-19
16* 16*
17* Differences between the PHP port and the original 17* Differences between the PHP port and the original
18* ------------------------------------------------------ 18* ------------------------------------------------------
19* Arc90's Readability is designed to run in the browser. It works on the DOM 19* Arc90's Readability is designed to run in the browser. It works on the DOM
20* tree (the parsed HTML) after the page's CSS styles have been applied and 20* tree (the parsed HTML) after the page's CSS styles have been applied and
21* Javascript code executed. This PHP port does not run inside a browser. 21* Javascript code executed. This PHP port does not run inside a browser.
22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
23* rely on CSS or Javascript support. As such, the results will not always 23* rely on CSS or Javascript support. As such, the results will not always
24* match Arc90's Readability. (For example, if a web page contains CSS style 24* match Arc90's Readability. (For example, if a web page contains CSS style
25* rules or Javascript code which hide certain HTML elements from display, 25* rules or Javascript code which hide certain HTML elements from display,
26* Arc90's Readability will dismiss those from consideration but our PHP port, 26* Arc90's Readability will dismiss those from consideration but our PHP port,
27* unable to understand CSS or Javascript, will not know any better.) 27* unable to understand CSS or Javascript, will not know any better.)
28* 28*
29* Another significant difference is that the aim of Arc90's Readability is 29* Another significant difference is that the aim of Arc90's Readability is
30* to re-present the main content block of a given web page so users can 30* to re-present the main content block of a given web page so users can
31* read it more easily in their browsers. Correct identification, clean up, 31* read it more easily in their browsers. Correct identification, clean up,
32* and separation of the content block is only a part of this process. 32* and separation of the content block is only a part of this process.
33* This PHP port is only concerned with this part, it does not include code 33* This PHP port is only concerned with this part, it does not include code
34* that relates to presentation in the browser - Arc90 already do 34* that relates to presentation in the browser - Arc90 already do
35* that extremely well, and for PDF output there's FiveFilters.org's 35* that extremely well, and for PDF output there's FiveFilters.org's
36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/. 36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
37* 37*
38* Finally, this class contains methods that might be useful for developers 38* Finally, this class contains methods that might be useful for developers
39* working on HTML document fragments. So without deviating too much from 39* working on HTML document fragments. So without deviating too much from
40* the original code (which I don't want to do because it makes debugging 40* the original code (which I don't want to do because it makes debugging
41* and updating more difficult), I've tried to make it a little more 41* and updating more difficult), I've tried to make it a little more
42* developer friendly. You should be able to use the methods here on 42* developer friendly. You should be able to use the methods here on
43* existing DOMElement objects without passing an entire HTML document to 43* existing DOMElement objects without passing an entire HTML document to
44* be parsed. 44* be parsed.
45*/ 45*/
46 46
@@ -48,7 +48,7 @@
48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); 48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
49 49
50// Alternative usage (for testing only!) 50// Alternative usage (for testing only!)
51// uncomment the lines below and call Readability.php in your browser 51// uncomment the lines below and call Readability.php in your browser
52// passing it the URL of the page you'd like content from, e.g.: 52// passing it the URL of the page you'd like content from, e.g.:
53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php 53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
54 54
@@ -75,11 +75,11 @@ class Readability
75 public $url = null; // optional - URL where HTML was retrieved 75 public $url = null; // optional - URL where HTML was retrieved
76 public $debug = false; 76 public $debug = false;
77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19 77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 protected $body = null; // 78 protected $body = null; //
79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. 80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 protected $success = false; // indicates whether we were able to extract or not 81 protected $success = false; // indicates whether we were able to extract or not
82 82
83 /** 83 /**
84 * All of the regular expressions in use within readability. 84 * All of the regular expressions in use within readability.
85 * Defined up here so we don't instantiate them repeatedly in loops. 85 * Defined up here so we don't instantiate them repeatedly in loops.
@@ -97,19 +97,19 @@ class Readability
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', 97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', 98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 ); 100 );
101 101
102 /* constants */ 102 /* constants */
103 const FLAG_STRIP_UNLIKELYS = 1; 103 const FLAG_STRIP_UNLIKELYS = 1;
104 const FLAG_WEIGHT_CLASSES = 2; 104 const FLAG_WEIGHT_CLASSES = 2;
105 const FLAG_CLEAN_CONDITIONALLY = 4; 105 const FLAG_CLEAN_CONDITIONALLY = 4;
106 106
107 /** 107 /**
108 * Create instance of Readability 108 * Create instance of Readability
109 * @param string UTF-8 encoded string 109 * @param string UTF-8 encoded string
110 * @param string (optional) URL associated with HTML (used for footnotes) 110 * @param string (optional) URL associated with HTML (used for footnotes)
111 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') 111 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
112 */ 112 */
113 function __construct($html, $url=null, $parser='libxml') 113 function __construct($html, $url=null, $parser='libxml')
114 { 114 {
115 $this->url = $url; 115 $this->url = $url;
@@ -135,18 +135,18 @@ class Readability
135 public function getTitle() { 135 public function getTitle() {
136 return $this->articleTitle; 136 return $this->articleTitle;
137 } 137 }
138 138
139 /** 139 /**
140 * Get article content element 140 * Get article content element
141 * @return DOMElement 141 * @return DOMElement
142 */ 142 */
143 public function getContent() { 143 public function getContent() {
144 return $this->articleContent; 144 return $this->articleContent;
145 } 145 }
146 146
147 /** 147 /**
148 * Runs readability. 148 * Runs readability.
149 * 149 *
150 * Workflow: 150 * Workflow:
151 * 1. Prep the document by removing script tags, css, etc. 151 * 1. Prep the document by removing script tags, css, etc.
152 * 2. Build readability's DOM tree. 152 * 2. Build readability's DOM tree.
@@ -161,7 +161,7 @@ class Readability
161 if (!isset($this->dom->documentElement)) return false; 161 if (!isset($this->dom->documentElement)) return false;
162 $this->removeScripts($this->dom); 162 $this->removeScripts($this->dom);
163 //die($this->getInnerHTML($this->dom->documentElement)); 163 //die($this->getInnerHTML($this->dom->documentElement));
164 164
165 // Assume successful outcome 165 // Assume successful outcome
166 $this->success = true; 166 $this->success = true;
167 167
@@ -176,7 +176,7 @@ class Readability
176 } 176 }
177 177
178 $this->prepDocument(); 178 $this->prepDocument();
179 179
180 //die($this->dom->documentElement->parentNode->nodeType); 180 //die($this->dom->documentElement->parentNode->nodeType);
181 //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); 181 //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
182 //die($this->getInnerHTML($this->dom->documentElement)); 182 //die($this->getInnerHTML($this->dom->documentElement));
@@ -191,9 +191,9 @@ class Readability
191 $this->success = false; 191 $this->success = false;
192 $articleContent = $this->dom->createElement('div'); 192 $articleContent = $this->dom->createElement('div');
193 $articleContent->setAttribute('id', 'readability-content'); 193 $articleContent->setAttribute('id', 'readability-content');
194 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; 194 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
195 } 195 }
196 196
197 $overlay->setAttribute('id', 'readOverlay'); 197 $overlay->setAttribute('id', 'readOverlay');
198 $innerDiv->setAttribute('id', 'readInner'); 198 $innerDiv->setAttribute('id', 'readInner');
199 199
@@ -201,7 +201,7 @@ class Readability
201 $innerDiv->appendChild($articleTitle); 201 $innerDiv->appendChild($articleTitle);
202 $innerDiv->appendChild($articleContent); 202 $innerDiv->appendChild($articleContent);
203 $overlay->appendChild($innerDiv); 203 $overlay->appendChild($innerDiv);
204 204
205 /* Clear the old HTML, insert the new content. */ 205 /* Clear the old HTML, insert the new content. */
206 $this->body->innerHTML = ''; 206 $this->body->innerHTML = '';
207 $this->body->appendChild($overlay); 207 $this->body->appendChild($overlay);
@@ -209,21 +209,21 @@ class Readability
209 $this->body->removeAttribute('style'); 209 $this->body->removeAttribute('style');
210 210
211 $this->postProcessContent($articleContent); 211 $this->postProcessContent($articleContent);
212 212
213 // Set title and content instance variables 213 // Set title and content instance variables
214 $this->articleTitle = $articleTitle; 214 $this->articleTitle = $articleTitle;
215 $this->articleContent = $articleContent; 215 $this->articleContent = $articleContent;
216 216
217 return $this->success; 217 return $this->success;
218 } 218 }
219 219
220 /** 220 /**
221 * Debug 221 * Debug
222 */ 222 */
223 protected function dbg($msg) { 223 protected function dbg($msg) {
224 if ($this->debug) echo '* ',$msg, "\n"; 224 if ($this->debug) echo '* ',$msg, "\n";
225 } 225 }
226 226
227 /** 227 /**
228 * Run any post-process modifications to article content as necessary. 228 * Run any post-process modifications to article content as necessary.
229 * 229 *
@@ -231,11 +231,11 @@ class Readability
231 * @return void 231 * @return void
232 */ 232 */
233 public function postProcessContent($articleContent) { 233 public function postProcessContent($articleContent) {
234 if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { 234 if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) {
235 $this->addFootnotes($articleContent); 235 $this->addFootnotes($articleContent);
236 } 236 }
237 } 237 }
238 238
239 /** 239 /**
240 * Get the article title as an H1. 240 * Get the article title as an H1.
241 * 241 *
@@ -248,11 +248,11 @@ class Readability
248 try { 248 try {
249 $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); 249 $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
250 } catch(Exception $e) {} 250 } catch(Exception $e) {}
251 251
252 if (preg_match('/ [\|\-] /', $curTitle)) 252 if (preg_match('/ [\|\-] /', $curTitle))
253 { 253 {
254 $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); 254 $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
255 255
256 if (count(explode(' ', $curTitle)) < 3) { 256 if (count(explode(' ', $curTitle)) < 3) {
257 $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); 257 $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
258 } 258 }
@@ -279,17 +279,17 @@ class Readability
279 if (count(explode(' ', $curTitle)) <= 4) { 279 if (count(explode(' ', $curTitle)) <= 4) {
280 $curTitle = $origTitle; 280 $curTitle = $origTitle;
281 } 281 }
282 282
283 $articleTitle = $this->dom->createElement('h1'); 283 $articleTitle = $this->dom->createElement('h1');
284 $articleTitle->innerHTML = $curTitle; 284 $articleTitle->innerHTML = $curTitle;
285 285
286 return $articleTitle; 286 return $articleTitle;
287 } 287 }
288 288
289 /** 289 /**
290 * Prepare the HTML document for readability to scrape it. 290 * Prepare the HTML document for readability to scrape it.
291 * This includes things like stripping javascript, CSS, and handling terrible markup. 291 * This includes things like stripping javascript, CSS, and handling terrible markup.
292 * 292 *
293 * @return void 293 * @return void
294 **/ 294 **/
295 protected function prepDocument() { 295 protected function prepDocument() {
@@ -328,13 +328,13 @@ class Readability
328 $footnotesWrapper = $this->dom->createElement('div'); 328 $footnotesWrapper = $this->dom->createElement('div');
329 $footnotesWrapper->setAttribute('id', 'readability-footnotes'); 329 $footnotesWrapper->setAttribute('id', 'readability-footnotes');
330 $footnotesWrapper->innerHTML = '<h3>References</h3>'; 330 $footnotesWrapper->innerHTML = '<h3>References</h3>';
331 331
332 $articleFootnotes = $this->dom->createElement('ol'); 332 $articleFootnotes = $this->dom->createElement('ol');
333 $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); 333 $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
334 $footnotesWrapper->appendChild($articleFootnotes); 334 $footnotesWrapper->appendChild($articleFootnotes);
335 335
336 $articleLinks = $articleContent->getElementsByTagName('a'); 336 $articleLinks = $articleContent->getElementsByTagName('a');
337 337
338 $linkCount = 0; 338 $linkCount = 0;
339 for ($i = 0; $i < $articleLinks->length; $i++) 339 for ($i = 0; $i < $articleLinks->length; $i++)
340 { 340 {
@@ -346,11 +346,11 @@ class Readability
346 if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); 346 if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
347 //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, 347 //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
348 $linkText = $this->getInnerText($articleLink); 348 $linkText = $this->getInnerText($articleLink);
349 349
350 if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { 350 if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
351 continue; 351 continue;
352 } 352 }
353 353
354 $linkCount++; 354 $linkCount++;
355 355
356 /** Add a superscript reference after the article link */ 356 /** Add a superscript reference after the article link */
@@ -358,7 +358,7 @@ class Readability
358 $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; 358 $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
359 $refLink->setAttribute('class', 'readability-DoNotFootnote'); 359 $refLink->setAttribute('class', 'readability-DoNotFootnote');
360 $refLink->setAttribute('style', 'color: inherit;'); 360 $refLink->setAttribute('style', 'color: inherit;');
361 361
362 //TODO: does this work or should we use DOMNode.isSameNode()? 362 //TODO: does this work or should we use DOMNode.isSameNode()?
363 if ($articleLink->parentNode->lastChild == $articleLink) { 363 if ($articleLink->parentNode->lastChild == $articleLink) {
364 $articleLink->parentNode->appendChild($refLink); 364 $articleLink->parentNode->appendChild($refLink);
@@ -373,15 +373,15 @@ class Readability
373 373
374 $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); 374 $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
375 $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); 375 $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
376 376
377 $footnote->appendChild($footnoteLink); 377 $footnote->appendChild($footnoteLink);
378 if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; 378 if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';
379 379
380 $articleFootnotes->appendChild($footnote); 380 $articleFootnotes->appendChild($footnote);
381 } 381 }
382 382
383 if ($linkCount > 0) { 383 if ($linkCount > 0) {
384 $articleContent->appendChild($footnotesWrapper); 384 $articleContent->appendChild($footnotesWrapper);
385 } 385 }
386 } 386 }
387 387
@@ -404,7 +404,7 @@ class Readability
404 //} 404 //}
405 } 405 }
406 } 406 }
407 407
408 /** 408 /**
409 * Prepare the article node for display. Clean out any inline styles, 409 * Prepare the article node for display. Clean out any inline styles,
410 * iframes, forms, strip extraneous <p> tags, etc. 410 * iframes, forms, strip extraneous <p> tags, etc.
@@ -429,7 +429,7 @@ class Readability
429 * as a header and not a subheader, so remove it since we already have a header. 429 * as a header and not a subheader, so remove it since we already have a header.
430 ***/ 430 ***/
431 if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) { 431 if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
432 $this->clean($articleContent, 'h2'); 432 $this->clean($articleContent, 'h2');
433 } 433 }
434 $this->clean($articleContent, 'iframe'); 434 $this->clean($articleContent, 'iframe');
435 435
@@ -448,7 +448,7 @@ class Readability
448 $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length; 448 $embedCount = $articleParagraphs->item($i)->getElementsByTagName('embed')->length;
449 $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length; 449 $objectCount = $articleParagraphs->item($i)->getElementsByTagName('object')->length;
450 $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length; 450 $iframeCount = $articleParagraphs->item($i)->getElementsByTagName('iframe')->length;
451 451
452 if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '') 452 if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $iframeCount === 0 && $this->getInnerText($articleParagraphs->item($i), false) == '')
453 { 453 {
454 $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i)); 454 $articleParagraphs->item($i)->parentNode->removeChild($articleParagraphs->item($i));
@@ -457,13 +457,13 @@ class Readability
457 457
458 try { 458 try {
459 $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML); 459 $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
460 //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p'); 460 //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
461 } 461 }
462 catch (Exception $e) { 462 catch (Exception $e) {
463 $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e); 463 $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
464 } 464 }
465 } 465 }
466 466
467 /** 467 /**
468 * Initialize a node with the readability object. Also checks the 468 * Initialize a node with the readability object. Also checks the
469 * className/id for special names to add to its score. 469 * className/id for special names to add to its score.
@@ -474,7 +474,7 @@ class Readability
474 protected function initializeNode($node) { 474 protected function initializeNode($node) {
475 $readability = $this->dom->createAttribute('readability'); 475 $readability = $this->dom->createAttribute('readability');
476 $readability->value = 0; // this is our contentScore 476 $readability->value = 0; // this is our contentScore
477 $node->setAttributeNode($readability); 477 $node->setAttributeNode($readability);
478 478
479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case 479 switch (strtoupper($node->tagName)) { // unsure if strtoupper is needed, but using it just in case
480 case 'DIV': 480 case 'DIV':
@@ -486,7 +486,7 @@ class Readability
486 case 'BLOCKQUOTE': 486 case 'BLOCKQUOTE':
487 $readability->value += 3; 487 $readability->value += 3;
488 break; 488 break;
489 489
490 case 'ADDRESS': 490 case 'ADDRESS':
491 case 'OL': 491 case 'OL':
492 case 'UL': 492 case 'UL':
@@ -510,7 +510,7 @@ class Readability
510 } 510 }
511 $readability->value += $this->getClassWeight($node); 511 $readability->value += $this->getClassWeight($node);
512 } 512 }
513 513
514 /*** 514 /***
515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@@ -548,7 +548,7 @@ class Readability
548 $node->parentNode->removeChild($node); 548 $node->parentNode->removeChild($node);
549 $nodeIndex--; 549 $nodeIndex--;
550 continue; 550 continue;
551 } 551 }
552 } 552 }
553 553
554 if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { 554 if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
@@ -589,7 +589,7 @@ class Readability
589 } 589 }
590 } 590 }
591 } 591 }
592 592
593 /** 593 /**
594 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 594 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
595 * Then add their score to their parent node. 595 * Then add their score to their parent node.
@@ -613,7 +613,7 @@ class Readability
613 } 613 }
614 614
615 /* Initialize readability data for the parent. */ 615 /* Initialize readability data for the parent. */
616 if (!$parentNode->hasAttribute('readability')) 616 if (!$parentNode->hasAttribute('readability'))
617 { 617 {
618 $this->initializeNode($parentNode); 618 $this->initializeNode($parentNode);
619 $candidates[] = $parentNode; 619 $candidates[] = $parentNode;
@@ -633,15 +633,15 @@ class Readability
633 633
634 /* Add points for any commas within this paragraph */ 634 /* Add points for any commas within this paragraph */
635 $contentScore += count(explode(',', $innerText)); 635 $contentScore += count(explode(',', $innerText));
636 636
637 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 637 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
638 $contentScore += min(floor(strlen($innerText) / 100), 3); 638 $contentScore += min(floor(strlen($innerText) / 100), 3);
639 639
640 /* Add the score to the parent. The grandparent gets half. */ 640 /* Add the score to the parent. The grandparent gets half. */
641 $parentNode->getAttributeNode('readability')->value += $contentScore; 641 $parentNode->getAttributeNode('readability')->value += $contentScore;
642 642
643 if ($grandParentNode) { 643 if ($grandParentNode) {
644 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; 644 $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
645 } 645 }
646 } 646 }
647 647
@@ -727,12 +727,12 @@ class Readability
727 { 727 {
728 $append = true; 728 $append = true;
729 } 729 }
730 730
731 if (strtoupper($siblingNode->nodeName) == 'P') { 731 if (strtoupper($siblingNode->nodeName) == 'P') {
732 $linkDensity = $this->getLinkDensity($siblingNode); 732 $linkDensity = $this->getLinkDensity($siblingNode);
733 $nodeContent = $this->getInnerText($siblingNode); 733 $nodeContent = $this->getInnerText($siblingNode);
734 $nodeLength = strlen($nodeContent); 734 $nodeLength = strlen($nodeContent);
735 735
736 if ($nodeLength > 80 && $linkDensity < 0.25) 736 if ($nodeLength > 80 && $linkDensity < 0.25)
737 { 737 {
738 $append = true; 738 $append = true;
@@ -751,7 +751,7 @@ class Readability
751 $sibNodeName = strtoupper($siblingNode->nodeName); 751 $sibNodeName = strtoupper($siblingNode->nodeName);
752 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { 752 if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
753 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 753 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
754 754
755 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); 755 $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
756 $nodeToAppend = $this->dom->createElement('div'); 756 $nodeToAppend = $this->dom->createElement('div');
757 try { 757 try {
@@ -770,7 +770,7 @@ class Readability
770 $s--; 770 $s--;
771 $sl--; 771 $sl--;
772 } 772 }
773 773
774 /* To ensure a node does not interfere with readability styles, remove its classnames */ 774 /* To ensure a node does not interfere with readability styles, remove its classnames */
775 $nodeToAppend->removeAttribute('class'); 775 $nodeToAppend->removeAttribute('class');
776 776
@@ -796,14 +796,14 @@ class Readability
796 // in the meantime, we check and create an empty element if it's not there. 796 // in the meantime, we check and create an empty element if it's not there.
797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); 797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
798 $this->body->innerHTML = $this->bodyCache; 798 $this->body->innerHTML = $this->bodyCache;
799 799
800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { 800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); 801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
802 return $this->grabArticle($this->body); 802 return $this->grabArticle($this->body);
803 } 803 }
804 else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { 804 else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
805 $this->removeFlag(self::FLAG_WEIGHT_CLASSES); 805 $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
806 return $this->grabArticle($this->body); 806 return $this->grabArticle($this->body);
807 } 807 }
808 else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { 808 else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
809 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); 809 $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
@@ -815,7 +815,7 @@ class Readability
815 } 815 }
816 return $articleContent; 816 return $articleContent;
817 } 817 }
818 818
819 /** 819 /**
820 * Remove script tags from document 820 * Remove script tags from document
821 * 821 *
@@ -829,7 +829,7 @@ class Readability
829 $scripts->item($i)->parentNode->removeChild($scripts->item($i)); 829 $scripts->item($i)->parentNode->removeChild($scripts->item($i));
830 } 830 }
831 } 831 }
832 832
833 /** 833 /**
834 * Get the inner text of a node. 834 * Get the inner text of a node.
835 * This also strips out any excess whitespace to be found. 835 * This also strips out any excess whitespace to be found.
@@ -878,11 +878,11 @@ class Readability
878 $elem->removeAttribute('style'); 878 $elem->removeAttribute('style');
879 } 879 }
880 } 880 }
881 881
882 /** 882 /**
883 * Get the density of links as a percentage of the content 883 * Get the density of links as a percentage of the content
884 * This is the amount of text that is inside a link divided by the total text in the node. 884 * This is the amount of text that is inside a link divided by the total text in the node.
885 * 885 *
886 * @param DOMElement $e 886 * @param DOMElement $e
887 * @return number (float) 887 * @return number (float)
888 */ 888 */
@@ -900,9 +900,9 @@ class Readability
900 return 0; 900 return 0;
901 } 901 }
902 } 902 }
903 903
904 /** 904 /**
905 * Get an elements class/id weight. Uses regular expressions to tell if this 905 * Get an elements class/id weight. Uses regular expressions to tell if this
906 * element looks good or bad. 906 * element looks good or bad.
907 * 907 *
908 * @param DOMElement $e 908 * @param DOMElement $e
@@ -964,7 +964,7 @@ class Readability
964 public function clean($e, $tag) { 964 public function clean($e, $tag) {
965 $targetList = $e->getElementsByTagName($tag); 965 $targetList = $e->getElementsByTagName($tag);
966 $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed'); 966 $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
967 967
968 for ($y=$targetList->length-1; $y >= 0; $y--) { 968 for ($y=$targetList->length-1; $y >= 0; $y--) {
969 /* Allow youtube and vimeo videos through as people usually want to see those. */ 969 /* Allow youtube and vimeo videos through as people usually want to see those. */
970 if ($isEmbed) { 970 if ($isEmbed) {
@@ -972,7 +972,7 @@ class Readability
972 for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) { 972 for ($i=0, $il=$targetList->item($y)->attributes->length; $i < $il; $i++) {
973 $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test) 973 $attributeValues .= $targetList->item($y)->attributes->item($i)->value . '|'; // DOMAttr? (TODO: test)
974 } 974 }
975 975
976 /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 976 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
977 if (preg_match($this->regexps['video'], $attributeValues)) { 977 if (preg_match($this->regexps['video'], $attributeValues)) {
978 continue; 978 continue;
@@ -986,10 +986,10 @@ class Readability
986 $targetList->item($y)->parentNode->removeChild($targetList->item($y)); 986 $targetList->item($y)->parentNode->removeChild($targetList->item($y));
987 } 987 }
988 } 988 }
989 989
990 /** 990 /**
991 * Clean an element of all tags of type "tag" if they look fishy. 991 * Clean an element of all tags of type "tag" if they look fishy.
992 * "Fishy" is an algorithm based on content length, classnames, 992 * "Fishy" is an algorithm based on content length, classnames,
993 * link density, number of images & embeds, etc. 993 * link density, number of images & embeds, etc.
994 * 994 *
995 * @param DOMElement $e 995 * @param DOMElement $e
@@ -1013,7 +1013,7 @@ class Readability
1013 for ($i=$curTagsLength-1; $i >= 0; $i--) { 1013 for ($i=$curTagsLength-1; $i >= 0; $i--) {
1014 $weight = $this->getClassWeight($tagsList->item($i)); 1014 $weight = $this->getClassWeight($tagsList->item($i));
1015 $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0; 1015 $contentScore = ($tagsList->item($i)->hasAttribute('readability')) ? (int)$tagsList->item($i)->getAttribute('readability') : 0;
1016 1016
1017 $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : '')); 1017 $this->dbg('Cleaning Conditionally ' . $tagsList->item($i)->tagName . ' (' . $tagsList->item($i)->getAttribute('class') . ':' . $tagsList->item($i)->getAttribute('id') . ')' . (($tagsList->item($i)->hasAttribute('readability')) ? (' with score ' . $tagsList->item($i)->getAttribute('readability')) : ''));
1018 1018
1019 if ($weight + $contentScore < 0) { 1019 if ($weight + $contentScore < 0) {
@@ -1034,13 +1034,13 @@ class Readability
1034 $embeds = $tagsList->item($i)->getElementsByTagName('embed'); 1034 $embeds = $tagsList->item($i)->getElementsByTagName('embed');
1035 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { 1035 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1036 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { 1036 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1037 $embedCount++; 1037 $embedCount++;
1038 } 1038 }
1039 } 1039 }
1040 $embeds = $tagsList->item($i)->getElementsByTagName('iframe'); 1040 $embeds = $tagsList->item($i)->getElementsByTagName('iframe');
1041 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) { 1041 for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
1042 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) { 1042 if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
1043 $embedCount++; 1043 $embedCount++;
1044 } 1044 }
1045 } 1045 }
1046 1046
@@ -1058,7 +1058,7 @@ class Readability
1058 $toRemove = true; 1058 $toRemove = true;
1059 } else if ( $input > floor($p/3) ) { 1059 } else if ( $input > floor($p/3) ) {
1060 $this->dbg(' too many <input> elements'); 1060 $this->dbg(' too many <input> elements');
1061 $toRemove = true; 1061 $toRemove = true;
1062 } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { 1062 } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
1063 $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); 1063 $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
1064 $toRemove = true; 1064 $toRemove = true;
@@ -1082,7 +1082,7 @@ class Readability
1082 $toRemove = true; 1082 $toRemove = true;
1083 } else if ( $input > floor($p/3) ) { 1083 } else if ( $input > floor($p/3) ) {
1084 $this->dbg(' too many <input> elements'); 1084 $this->dbg(' too many <input> elements');
1085 $toRemove = true; 1085 $toRemove = true;
1086 } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) { 1086 } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
1087 $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images'); 1087 $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
1088 $toRemove = true; 1088 $toRemove = true;
@@ -1126,11 +1126,11 @@ class Readability
1126 public function flagIsActive($flag) { 1126 public function flagIsActive($flag) {
1127 return ($this->flags & $flag) > 0; 1127 return ($this->flags & $flag) > 0;
1128 } 1128 }
1129 1129
1130 public function addFlag($flag) { 1130 public function addFlag($flag) {
1131 $this->flags = $this->flags | $flag; 1131 $this->flags = $this->flags | $flag;
1132 } 1132 }
1133 1133
1134 public function removeFlag($flag) { 1134 public function removeFlag($flag) {
1135 $this->flags = $this->flags & ~$flag; 1135 $this->flags = $this->flags & ~$flag;
1136 } 1136 }