diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 12:50:28 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-05-29 12:50:28 +0200 |
commit | 87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b (patch) | |
tree | 558818975ac41403e7d55ad07c5b0ac29806e907 /inc/3rdparty/libraries/readability | |
parent | ab157bbb75ba226917145c9bf906cbf764a85cd0 (diff) | |
parent | 0b9bb8cb7868f24137c5d8b85c39cc88ea877411 (diff) | |
download | wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.tar.gz wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.tar.zst wallabag-87f01ea2e97715ac5df4ef7a6741cc26f3a5cd1b.zip |
Merge pull request #707 from mariroz/dev
update to 3.2 version of full-text-rss, issue #694
Diffstat (limited to 'inc/3rdparty/libraries/readability')
-rw-r--r-- | inc/3rdparty/libraries/readability/Readability.php | 2274 |
1 files changed, 1137 insertions, 1137 deletions
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index 2e8991cc..d0f09d74 100644 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php | |||
@@ -1,1138 +1,1138 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Arc90's Readability ported to PHP for FiveFilters.org | 3 | * Arc90's Readability ported to PHP for FiveFilters.org |
4 | * Based on readability.js version 1.7.1 (without multi-page support) | 4 | * Based on readability.js version 1.7.1 (without multi-page support) |
5 | * Updated to allow HTML5 parsing with html5lib | 5 | * Updated to allow HTML5 parsing with html5lib |
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds | 6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds |
7 | * ------------------------------------------------------ | 7 | * ------------------------------------------------------ |
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | 8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js |
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | 9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ |
10 | * JS Source: http://code.google.com/p/arc90labs-readability | 10 | * JS Source: http://code.google.com/p/arc90labs-readability |
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | 11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net |
12 | * More information: http://fivefilters.org/content-only/ | 12 | * More information: http://fivefilters.org/content-only/ |
13 | * License: Apache License, Version 2.0 | 13 | * License: Apache License, Version 2.0 |
14 | * Requires: PHP5 | 14 | * Requires: PHP5 |
15 | * Date: 2012-09-19 | 15 | * Date: 2012-09-19 |
16 | * | 16 | * |
17 | * Differences between the PHP port and the original | 17 | * Differences between the PHP port and the original |
18 | * ------------------------------------------------------ | 18 | * ------------------------------------------------------ |
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM | 19 | * Arc90's Readability is designed to run in the browser. It works on the DOM |
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and | 20 | * tree (the parsed HTML) after the page's CSS styles have been applied and |
21 | * Javascript code executed. This PHP port does not run inside a browser. | 21 | * Javascript code executed. This PHP port does not run inside a browser. |
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | 22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot |
23 | * rely on CSS or Javascript support. As such, the results will not always | 23 | * rely on CSS or Javascript support. As such, the results will not always |
24 | * match Arc90's Readability. (For example, if a web page contains CSS style | 24 | * match Arc90's Readability. (For example, if a web page contains CSS style |
25 | * rules or Javascript code which hide certain HTML elements from display, | 25 | * rules or Javascript code which hide certain HTML elements from display, |
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, | 26 | * Arc90's Readability will dismiss those from consideration but our PHP port, |
27 | * unable to understand CSS or Javascript, will not know any better.) | 27 | * unable to understand CSS or Javascript, will not know any better.) |
28 | * | 28 | * |
29 | * Another significant difference is that the aim of Arc90's Readability is | 29 | * Another significant difference is that the aim of Arc90's Readability is |
30 | * to re-present the main content block of a given web page so users can | 30 | * to re-present the main content block of a given web page so users can |
31 | * read it more easily in their browsers. Correct identification, clean up, | 31 | * read it more easily in their browsers. Correct identification, clean up, |
32 | * and separation of the content block is only a part of this process. | 32 | * and separation of the content block is only a part of this process. |
33 | * This PHP port is only concerned with this part, it does not include code | 33 | * This PHP port is only concerned with this part, it does not include code |
34 | * that relates to presentation in the browser - Arc90 already do | 34 | * that relates to presentation in the browser - Arc90 already do |
35 | * that extremely well, and for PDF output there's FiveFilters.org's | 35 | * that extremely well, and for PDF output there's FiveFilters.org's |
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | 36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. |
37 | * | 37 | * |
38 | * Finally, this class contains methods that might be useful for developers | 38 | * Finally, this class contains methods that might be useful for developers |
39 | * working on HTML document fragments. So without deviating too much from | 39 | * working on HTML document fragments. So without deviating too much from |
40 | * the original code (which I don't want to do because it makes debugging | 40 | * the original code (which I don't want to do because it makes debugging |
41 | * and updating more difficult), I've tried to make it a little more | 41 | * and updating more difficult), I've tried to make it a little more |
42 | * developer friendly. You should be able to use the methods here on | 42 | * developer friendly. You should be able to use the methods here on |
43 | * existing DOMElement objects without passing an entire HTML document to | 43 | * existing DOMElement objects without passing an entire HTML document to |
44 | * be parsed. | 44 | * be parsed. |
45 | */ | 45 | */ |
46 | 46 | ||
47 | // This class allows us to do JavaScript like assignements to innerHTML | 47 | // This class allows us to do JavaScript like assignements to innerHTML |
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | 48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); |
49 | 49 | ||
50 | // Alternative usage (for testing only!) | 50 | // Alternative usage (for testing only!) |
51 | // uncomment the lines below and call Readability.php in your browser | 51 | // uncomment the lines below and call Readability.php in your browser |
52 | // passing it the URL of the page you'd like content from, e.g.: | 52 | // passing it the URL of the page you'd like content from, e.g.: |
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | 53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php |
54 | 54 | ||
55 | /* | 55 | /* |
56 | if (!isset($_GET['url']) || $_GET['url'] == '') { | 56 | if (!isset($_GET['url']) || $_GET['url'] == '') { |
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | 57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); |
58 | } | 58 | } |
59 | $url = $_GET['url']; | 59 | $url = $_GET['url']; |
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | 60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; |
61 | $html = file_get_contents($url); | 61 | $html = file_get_contents($url); |
62 | $r = new Readability($html, $url); | 62 | $r = new Readability($html, $url); |
63 | $r->init(); | 63 | $r->init(); |
64 | echo $r->articleContent->innerHTML; | 64 | echo $r->articleContent->innerHTML; |
65 | */ | 65 | */ |
66 | 66 | ||
67 | class Readability | 67 | class Readability |
68 | { | 68 | { |
69 | public $version = '1.7.1-without-multi-page'; | 69 | public $version = '1.7.1-without-multi-page'; |
70 | public $convertLinksToFootnotes = false; | 70 | public $convertLinksToFootnotes = false; |
71 | public $revertForcedParagraphElements = true; | 71 | public $revertForcedParagraphElements = true; |
72 | public $articleTitle; | 72 | public $articleTitle; |
73 | public $articleContent; | 73 | public $articleContent; |
74 | public $dom; | 74 | public $dom; |
75 | public $url = null; // optional - URL where HTML was retrieved | 75 | public $url = null; // optional - URL where HTML was retrieved |
76 | public $debug = false; | 76 | public $debug = false; |
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 | 77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 |
78 | protected $body = null; // | 78 | protected $body = null; // |
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | 79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later |
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | 80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. |
81 | protected $success = false; // indicates whether we were able to extract or not | 81 | protected $success = false; // indicates whether we were able to extract or not |
82 | 82 | ||
83 | /** | 83 | /** |
84 | * All of the regular expressions in use within readability. | 84 | * All of the regular expressions in use within readability. |
85 | * Defined up here so we don't instantiate them repeatedly in loops. | 85 | * Defined up here so we don't instantiate them repeatedly in loops. |
86 | **/ | 86 | **/ |
87 | public $regexps = array( | 87 | public $regexps = array( |
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', | 88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', |
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | 89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', | 90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', |
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | 91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', |
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | 92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', |
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | 93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', |
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | 94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', |
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | 95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() |
96 | 'normalize' => '/\s{2,}/', | 96 | 'normalize' => '/\s{2,}/', |
97 | 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', | 97 | 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', |
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', | 98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', |
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | 99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' |
100 | ); | 100 | ); |
101 | 101 | ||
102 | /* constants */ | 102 | /* constants */ |
103 | const FLAG_STRIP_UNLIKELYS = 1; | 103 | const FLAG_STRIP_UNLIKELYS = 1; |
104 | const FLAG_WEIGHT_CLASSES = 2; | 104 | const FLAG_WEIGHT_CLASSES = 2; |
105 | const FLAG_CLEAN_CONDITIONALLY = 4; | 105 | const FLAG_CLEAN_CONDITIONALLY = 4; |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * Create instance of Readability | 108 | * Create instance of Readability |
109 | * @param string UTF-8 encoded string | 109 | * @param string UTF-8 encoded string |
110 | * @param string (optional) URL associated with HTML (used for footnotes) | 110 | * @param string (optional) URL associated with HTML (used for footnotes) |
111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 111 | * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
112 | */ | 112 | */ |
113 | function __construct($html, $url=null, $parser='libxml') | 113 | function __construct($html, $url=null, $parser='libxml') |
114 | { | 114 | { |
115 | $this->url = $url; | 115 | $this->url = $url; |
116 | /* Turn all double br's into p's */ | 116 | /* Turn all double br's into p's */ |
117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); | 117 | $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); |
118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); | 118 | $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); |
119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); | 119 | $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); |
120 | if (trim($html) == '') $html = '<html></html>'; | 120 | if (trim($html) == '') $html = '<html></html>'; |
121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { | 121 | if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) { |
122 | // all good | 122 | // all good |
123 | } else { | 123 | } else { |
124 | $this->dom = new DOMDocument(); | 124 | $this->dom = new DOMDocument(); |
125 | $this->dom->preserveWhiteSpace = false; | 125 | $this->dom->preserveWhiteSpace = false; |
126 | @$this->dom->loadHTML($html); | 126 | @$this->dom->loadHTML($html); |
127 | } | 127 | } |
128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); | 128 | $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); |
129 | } | 129 | } |
130 | 130 | ||
131 | /** | 131 | /** |
132 | * Get article title element | 132 | * Get article title element |
133 | * @return DOMElement | 133 | * @return DOMElement |
134 | */ | 134 | */ |
135 | public function getTitle() { | 135 | public function getTitle() { |
136 | return $this->articleTitle; | 136 | return $this->articleTitle; |
137 | } | 137 | } |
138 | 138 | ||
139 | /** | 139 | /** |
140 | * Get article content element | 140 | * Get article content element |
141 | * @return DOMElement | 141 | * @return DOMElement |
142 | */ | 142 | */ |
143 | public function getContent() { | 143 | public function getContent() { |
144 | return $this->articleContent; | 144 | return $this->articleContent; |
145 | } | 145 | } |
146 | 146 | ||
147 | /** | 147 | /** |
148 | * Runs readability. | 148 | * Runs readability. |
149 | * | 149 | * |
150 | * Workflow: | 150 | * Workflow: |
151 | * 1. Prep the document by removing script tags, css, etc. | 151 | * 1. Prep the document by removing script tags, css, etc. |
152 | * 2. Build readability's DOM tree. | 152 | * 2. Build readability's DOM tree. |
153 | * 3. Grab the article content from the current dom tree. | 153 | * 3. Grab the article content from the current dom tree. |
154 | * 4. Replace the current DOM tree with the new one. | 154 | * 4. Replace the current DOM tree with the new one. |
155 | * 5. Read peacefully. | 155 | * 5. Read peacefully. |
156 | * | 156 | * |
157 | * @return boolean true if we found content, false otherwise | 157 | * @return boolean true if we found content, false otherwise |
158 | **/ | 158 | **/ |
159 | public function init() | 159 | public function init() |
160 | { | 160 | { |
161 | if (!isset($this->dom->documentElement)) return false; | 161 | if (!isset($this->dom->documentElement)) return false; |
162 | $this->removeScripts($this->dom); | 162 | $this->removeScripts($this->dom); |
163 | //die($this->getInnerHTML($this->dom->documentElement)); | 163 | //die($this->getInnerHTML($this->dom->documentElement)); |
164 | 164 | ||
165 | // Assume successful outcome | 165 | // Assume successful outcome |
166 | $this->success = true; | 166 | $this->success = true; |
167 | 167 | ||
168 | $bodyElems = $this->dom->getElementsByTagName('body'); | 168 | $bodyElems = $this->dom->getElementsByTagName('body'); |
169 | if ($bodyElems->length > 0) { | 169 | if ($bodyElems->length > 0) { |
170 | if ($this->bodyCache == null) { | 170 | if ($this->bodyCache == null) { |
171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; | 171 | $this->bodyCache = $bodyElems->item(0)->innerHTML; |
172 | } | 172 | } |
173 | if ($this->body == null) { | 173 | if ($this->body == null) { |
174 | $this->body = $bodyElems->item(0); | 174 | $this->body = $bodyElems->item(0); |
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | $this->prepDocument(); | 178 | $this->prepDocument(); |
179 | 179 | ||
180 | //die($this->dom->documentElement->parentNode->nodeType); | 180 | //die($this->dom->documentElement->parentNode->nodeType); |
181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); | 181 | //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); |
182 | //die($this->getInnerHTML($this->dom->documentElement)); | 182 | //die($this->getInnerHTML($this->dom->documentElement)); |
183 | 183 | ||
184 | /* Build readability's DOM tree */ | 184 | /* Build readability's DOM tree */ |
185 | $overlay = $this->dom->createElement('div'); | 185 | $overlay = $this->dom->createElement('div'); |
186 | $innerDiv = $this->dom->createElement('div'); | 186 | $innerDiv = $this->dom->createElement('div'); |
187 | $articleTitle = $this->getArticleTitle(); | 187 | $articleTitle = $this->getArticleTitle(); |
188 | $articleContent = $this->grabArticle(); | 188 | $articleContent = $this->grabArticle(); |
189 | 189 | ||
190 | if (!$articleContent) { | 190 | if (!$articleContent) { |
191 | $this->success = false; | 191 | $this->success = false; |
192 | $articleContent = $this->dom->createElement('div'); | 192 | $articleContent = $this->dom->createElement('div'); |
193 | $articleContent->setAttribute('id', 'readability-content'); | 193 | $articleContent->setAttribute('id', 'readability-content'); |
194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; | 194 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; |
195 | } | 195 | } |
196 | 196 | ||
197 | $overlay->setAttribute('id', 'readOverlay'); | 197 | $overlay->setAttribute('id', 'readOverlay'); |
198 | $innerDiv->setAttribute('id', 'readInner'); | 198 | $innerDiv->setAttribute('id', 'readInner'); |
199 | 199 | ||
200 | /* Glue the structure of our document together. */ | 200 | /* Glue the structure of our document together. */ |
201 | $innerDiv->appendChild($articleTitle); | 201 | $innerDiv->appendChild($articleTitle); |
202 | $innerDiv->appendChild($articleContent); | 202 | $innerDiv->appendChild($articleContent); |
203 | $overlay->appendChild($innerDiv); | 203 | $overlay->appendChild($innerDiv); |
204 | 204 | ||
205 | /* Clear the old HTML, insert the new content. */ | 205 | /* Clear the old HTML, insert the new content. */ |
206 | $this->body->innerHTML = ''; | 206 | $this->body->innerHTML = ''; |
207 | $this->body->appendChild($overlay); | 207 | $this->body->appendChild($overlay); |
208 | //document.body.insertBefore(overlay, document.body.firstChild); | 208 | //document.body.insertBefore(overlay, document.body.firstChild); |
209 | $this->body->removeAttribute('style'); | 209 | $this->body->removeAttribute('style'); |
210 | 210 | ||
211 | $this->postProcessContent($articleContent); | 211 | $this->postProcessContent($articleContent); |
212 | 212 | ||
213 | // Set title and content instance variables | 213 | // Set title and content instance variables |
214 | $this->articleTitle = $articleTitle; | 214 | $this->articleTitle = $articleTitle; |
215 | $this->articleContent = $articleContent; | 215 | $this->articleContent = $articleContent; |
216 | 216 | ||
217 | return $this->success; | 217 | return $this->success; |
218 | } | 218 | } |
219 | 219 | ||
220 | /** | 220 | /** |
221 | * Debug | 221 | * Debug |
222 | */ | 222 | */ |
223 | protected function dbg($msg) { | 223 | protected function dbg($msg) { |
224 | if ($this->debug) echo '* ',$msg, "\n"; | 224 | if ($this->debug) echo '* ',$msg, "\n"; |
225 | } | 225 | } |
226 | 226 | ||
227 | /** | 227 | /** |
228 | * Run any post-process modifications to article content as necessary. | 228 | * Run any post-process modifications to article content as necessary. |
229 | * | 229 | * |
230 | * @param DOMElement | 230 | * @param DOMElement |
231 | * @return void | 231 | * @return void |
232 | */ | 232 | */ |
233 | public function postProcessContent($articleContent) { | 233 | public function postProcessContent($articleContent) { |
234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { | 234 | if ($this->convertLinksToFootnotes && !preg_match('/wikipedia\.org/', @$this->url)) { |
235 | $this->addFootnotes($articleContent); | 235 | $this->addFootnotes($articleContent); |
236 | } | 236 | } |
237 | } | 237 | } |
238 | 238 | ||
239 | /** | 239 | /** |
240 | * Get the article title as an H1. | 240 | * Get the article title as an H1. |
241 | * | 241 | * |
242 | * @return DOMElement | 242 | * @return DOMElement |
243 | */ | 243 | */ |
244 | protected function getArticleTitle() { | 244 | protected function getArticleTitle() { |
245 | $curTitle = ''; | 245 | $curTitle = ''; |
246 | $origTitle = ''; | 246 | $origTitle = ''; |
247 | 247 | ||
248 | try { | 248 | try { |
249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); | 249 | $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0)); |
250 | } catch(Exception $e) {} | 250 | } catch(Exception $e) {} |
251 | 251 | ||
252 | if (preg_match('/ [\|\-] /', $curTitle)) | 252 | if (preg_match('/ [\|\-] /', $curTitle)) |
253 | { | 253 | { |
254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); | 254 | $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle); |
255 | 255 | ||
256 | if (count(explode(' ', $curTitle)) < 3) { | 256 | if (count(explode(' ', $curTitle)) < 3) { |
257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); | 257 | $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle); |
258 | } | 258 | } |
259 | } | 259 | } |
260 | else if (strpos($curTitle, ': ') !== false) | 260 | else if (strpos($curTitle, ': ') !== false) |
261 | { | 261 | { |
262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); | 262 | $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle); |
263 | 263 | ||
264 | if (count(explode(' ', $curTitle)) < 3) { | 264 | if (count(explode(' ', $curTitle)) < 3) { |
265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); | 265 | $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle); |
266 | } | 266 | } |
267 | } | 267 | } |
268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) | 268 | else if(strlen($curTitle) > 150 || strlen($curTitle) < 15) |
269 | { | 269 | { |
270 | $hOnes = $this->dom->getElementsByTagName('h1'); | 270 | $hOnes = $this->dom->getElementsByTagName('h1'); |
271 | if($hOnes->length == 1) | 271 | if($hOnes->length == 1) |
272 | { | 272 | { |
273 | $curTitle = $this->getInnerText($hOnes->item(0)); | 273 | $curTitle = $this->getInnerText($hOnes->item(0)); |
274 | } | 274 | } |
275 | } | 275 | } |
276 | 276 | ||
277 | $curTitle = trim($curTitle); | 277 | $curTitle = trim($curTitle); |
278 | 278 | ||
279 | if (count(explode(' ', $curTitle)) <= 4) { | 279 | if (count(explode(' ', $curTitle)) <= 4) { |
280 | $curTitle = $origTitle; | 280 | $curTitle = $origTitle; |
281 | } | 281 | } |
282 | 282 | ||
283 | $articleTitle = $this->dom->createElement('h1'); | 283 | $articleTitle = $this->dom->createElement('h1'); |
284 | $articleTitle->innerHTML = $curTitle; | 284 | $articleTitle->innerHTML = $curTitle; |
285 | 285 | ||
286 | return $articleTitle; | 286 | return $articleTitle; |
287 | } | 287 | } |
288 | 288 | ||
289 | /** | 289 | /** |
290 | * Prepare the HTML document for readability to scrape it. | 290 | * Prepare the HTML document for readability to scrape it. |
291 | * This includes things like stripping javascript, CSS, and handling terrible markup. | 291 | * This includes things like stripping javascript, CSS, and handling terrible markup. |
292 | * | 292 | * |
293 | * @return void | 293 | * @return void |
294 | **/ | 294 | **/ |
295 | protected function prepDocument() { | 295 | protected function prepDocument() { |
296 | /** | 296 | /** |
297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) | 297 | * In some cases a body element can't be found (if the HTML is totally hosed for example) |
298 | * so we create a new body node and append it to the document. | 298 | * so we create a new body node and append it to the document. |
299 | */ | 299 | */ |
300 | if ($this->body == null) | 300 | if ($this->body == null) |
301 | { | 301 | { |
302 | $this->body = $this->dom->createElement('body'); | 302 | $this->body = $this->dom->createElement('body'); |
303 | $this->dom->documentElement->appendChild($this->body); | 303 | $this->dom->documentElement->appendChild($this->body); |
304 | } | 304 | } |
305 | $this->body->setAttribute('id', 'readabilityBody'); | 305 | $this->body->setAttribute('id', 'readabilityBody'); |
306 | 306 | ||
307 | /* Remove all style tags in head */ | 307 | /* Remove all style tags in head */ |
308 | $styleTags = $this->dom->getElementsByTagName('style'); | 308 | $styleTags = $this->dom->getElementsByTagName('style'); |
309 | for ($i = $styleTags->length-1; $i >= 0; $i--) | 309 | for ($i = $styleTags->length-1; $i >= 0; $i--) |
310 | { | 310 | { |
311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); | 311 | $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); |
312 | } | 312 | } |
313 | 313 | ||
314 | /* Turn all double br's into p's */ | 314 | /* Turn all double br's into p's */ |
315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ | 315 | /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ |
316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); | 316 | //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>'); |
317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. | 317 | // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. |
318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. | 318 | // Manipulating innerHTML as it's done in JS is not possible in PHP. |
319 | } | 319 | } |
320 | 320 | ||
321 | /** | 321 | /** |
322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. | 322 | * For easier reading, convert this document to have footnotes at the bottom rather than inline links. |
323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php | 323 | * @see http://www.roughtype.com/archives/2010/05/experiments_in.php |
324 | * | 324 | * |
325 | * @return void | 325 | * @return void |
326 | **/ | 326 | **/ |
327 | public function addFootnotes($articleContent) { | 327 | public function addFootnotes($articleContent) { |
328 | $footnotesWrapper = $this->dom->createElement('div'); | 328 | $footnotesWrapper = $this->dom->createElement('div'); |
329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); | 329 | $footnotesWrapper->setAttribute('id', 'readability-footnotes'); |
330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; | 330 | $footnotesWrapper->innerHTML = '<h3>References</h3>'; |
331 | 331 | ||
332 | $articleFootnotes = $this->dom->createElement('ol'); | 332 | $articleFootnotes = $this->dom->createElement('ol'); |
333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); | 333 | $articleFootnotes->setAttribute('id', 'readability-footnotes-list'); |
334 | $footnotesWrapper->appendChild($articleFootnotes); | 334 | $footnotesWrapper->appendChild($articleFootnotes); |
335 | 335 | ||
336 | $articleLinks = $articleContent->getElementsByTagName('a'); | 336 | $articleLinks = $articleContent->getElementsByTagName('a'); |
337 | 337 | ||
338 | $linkCount = 0; | 338 | $linkCount = 0; |
339 | for ($i = 0; $i < $articleLinks->length; $i++) | 339 | for ($i = 0; $i < $articleLinks->length; $i++) |
340 | { | 340 | { |
341 | $articleLink = $articleLinks->item($i); | 341 | $articleLink = $articleLinks->item($i); |
342 | $footnoteLink = $articleLink->cloneNode(true); | 342 | $footnoteLink = $articleLink->cloneNode(true); |
343 | $refLink = $this->dom->createElement('a'); | 343 | $refLink = $this->dom->createElement('a'); |
344 | $footnote = $this->dom->createElement('li'); | 344 | $footnote = $this->dom->createElement('li'); |
345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); | 345 | $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST); |
346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); | 346 | if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST); |
347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, | 347 | //linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host, |
348 | $linkText = $this->getInnerText($articleLink); | 348 | $linkText = $this->getInnerText($articleLink); |
349 | 349 | ||
350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { | 350 | if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) { |
351 | continue; | 351 | continue; |
352 | } | 352 | } |
353 | 353 | ||
354 | $linkCount++; | 354 | $linkCount++; |
355 | 355 | ||
356 | /** Add a superscript reference after the article link */ | 356 | /** Add a superscript reference after the article link */ |
357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); | 357 | $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount); |
358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; | 358 | $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>'; |
359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); | 359 | $refLink->setAttribute('class', 'readability-DoNotFootnote'); |
360 | $refLink->setAttribute('style', 'color: inherit;'); | 360 | $refLink->setAttribute('style', 'color: inherit;'); |
361 | 361 | ||
362 | //TODO: does this work or should we use DOMNode.isSameNode()? | 362 | //TODO: does this work or should we use DOMNode.isSameNode()? |
363 | if ($articleLink->parentNode->lastChild == $articleLink) { | 363 | if ($articleLink->parentNode->lastChild == $articleLink) { |
364 | $articleLink->parentNode->appendChild($refLink); | 364 | $articleLink->parentNode->appendChild($refLink); |
365 | } else { | 365 | } else { |
366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); | 366 | $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling); |
367 | } | 367 | } |
368 | 368 | ||
369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); | 369 | $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;'); |
370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); | 370 | $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount); |
371 | 371 | ||
372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; | 372 | $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> '; |
373 | 373 | ||
374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); | 374 | $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText); |
375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); | 375 | $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount); |
376 | 376 | ||
377 | $footnote->appendChild($footnoteLink); | 377 | $footnote->appendChild($footnoteLink); |
378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; | 378 | if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>'; |
379 | 379 | ||
380 | $articleFootnotes->appendChild($footnote); | 380 | $articleFootnotes->appendChild($footnote); |
381 | } | 381 | } |
382 | 382 | ||
383 | if ($linkCount > 0) { | 383 | if ($linkCount > 0) { |
384 | $articleContent->appendChild($footnotesWrapper); | 384 | $articleContent->appendChild($footnotesWrapper); |
385 | } | 385 | } |
386 | } | 386 | } |
387 | 387 | ||
/**
 * Turn every <p class="readability-styled"> found below the given element
 * back into a plain text node holding the paragraph's text content.
 *
 * Only paragraphs whose class attribute is exactly "readability-styled"
 * are matched; the element's document is modified in place.
 *
 * @param DOMElement $articleContent element whose descendants are searched
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Process the matches from the last one to the first one.
    $position = $styledParagraphs->length;
    while ($position-- > 0) {
        $paragraph = $styledParagraphs->item($position);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 | 407 | ||
/**
 * Prepare the article node for display.
 *
 * Runs the cleanup helpers in a fixed order: cleanStyles() and killBreaks(),
 * optionally revertReadabilityStyledElements(), then clean() /
 * cleanConditionally() for form, object, h1, (a lone) h2 and iframe,
 * cleanHeaders(), and cleanConditionally() for table, ul and div. Afterwards
 * paragraphs that contain no img/embed/object/iframe and no inner text are
 * removed, and any <br> that directly precedes a <p> is stripped from the
 * serialized markup.
 *
 * @param DOMElement $articleContent article node, modified in place
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     * Skipped in lightClean mode.
     ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: no media children and no inner text. */
    $mediaTags = array('img', 'embed', 'object', 'iframe');
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    // Walk backwards so removing a paragraph never shifts an index we still have to visit.
    for ($i = $articleParagraphs->length - 1; $i >= 0; $i--) {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = false;
        foreach ($mediaTags as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
        if (is_string($withoutBreaks)) {
            $articleContent->innerHTML = $withoutBreaks;
        } else {
            // preg_replace() returns null when PCRE fails (e.g. backtrack limit).
            // Assigning that to innerHTML would presumably discard the article, so keep the markup as is.
            $this->dbg('Cleaning innerHTML of breaks failed, keeping content unchanged. PCRE error code: ' . preg_last_error());
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 | 466 | ||
/**
 * Give a node its "readability" score attribute and seed the score.
 *
 * The attribute is created with the value 0 and attached to the node. The
 * value is then adjusted by a fixed amount chosen by the node's tag name
 * (tags not listed below get no adjustment) and finally by whatever
 * getClassWeight() returns for the node.
 *
 * @param DOMElement $node element to initialise, modified in place
 * @return void
 **/
protected function initializeNode($node) {
    // Fixed score adjustment keyed by upper-cased tag name.
    static $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // the content score is stored in this attribute
    $node->setAttributeNode($scoreAttr);

    // Upper-case the tag name so the lookup does not depend on how the parser cased it.
    $tag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$tag])) {
        $scoreAttr->value += $tagAdjustments[$tag];
    }

    $scoreAttr->value += $this->getClassWeight($node);
}
513 | 513 | ||
514 | /*** | 514 | /*** |
515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | 515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | 516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
517 | * | 517 | * |
518 | * @return DOMElement | 518 | * @return DOMElement |
519 | **/ | 519 | **/ |
520 | protected function grabArticle($page=null) { | 520 | protected function grabArticle($page=null) { |
521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | 521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); |
522 | if (!$page) $page = $this->dom; | 522 | if (!$page) $page = $this->dom; |
523 | $allElements = $page->getElementsByTagName('*'); | 523 | $allElements = $page->getElementsByTagName('*'); |
524 | /** | 524 | /** |
525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | 525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | 526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
527 | * | 527 | * |
528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | 528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
529 | * TODO: Shouldn't this be a reverse traversal? | 529 | * TODO: Shouldn't this be a reverse traversal? |
530 | **/ | 530 | **/ |
531 | $node = null; | 531 | $node = null; |
532 | $nodesToScore = array(); | 532 | $nodesToScore = array(); |
533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | 533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { |
534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | 534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { |
535 | //$node = $targetList->item($nodeIndex); | 535 | //$node = $targetList->item($nodeIndex); |
536 | $tagName = strtoupper($node->tagName); | 536 | $tagName = strtoupper($node->tagName); |
537 | /* Remove unlikely candidates */ | 537 | /* Remove unlikely candidates */ |
538 | if ($stripUnlikelyCandidates) { | 538 | if ($stripUnlikelyCandidates) { |
539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | 539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); |
540 | if ( | 540 | if ( |
541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | 541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && |
542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | 542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && |
543 | $tagName != 'BODY' | 543 | $tagName != 'BODY' |
544 | ) | 544 | ) |
545 | { | 545 | { |
546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | 546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); |
547 | //$nodesToRemove[] = $node; | 547 | //$nodesToRemove[] = $node; |
548 | $node->parentNode->removeChild($node); | 548 | $node->parentNode->removeChild($node); |
549 | $nodeIndex--; | 549 | $nodeIndex--; |
550 | continue; | 550 | continue; |
551 | } | 551 | } |
552 | } | 552 | } |
553 | 553 | ||
554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | 554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { |
555 | $nodesToScore[] = $node; | 555 | $nodesToScore[] = $node; |
556 | } | 556 | } |
557 | 557 | ||
558 | /* Turn all divs that don't have children block level elements into p's */ | 558 | /* Turn all divs that don't have children block level elements into p's */ |
559 | if ($tagName == 'DIV') { | 559 | if ($tagName == 'DIV') { |
560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | 560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { |
561 | //$this->dbg('Altering div to p'); | 561 | //$this->dbg('Altering div to p'); |
562 | $newNode = $this->dom->createElement('p'); | 562 | $newNode = $this->dom->createElement('p'); |
563 | try { | 563 | try { |
564 | $newNode->innerHTML = $node->innerHTML; | 564 | $newNode->innerHTML = $node->innerHTML; |
565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | 565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); |
566 | $node->parentNode->replaceChild($newNode, $node); | 566 | $node->parentNode->replaceChild($newNode, $node); |
567 | $nodeIndex--; | 567 | $nodeIndex--; |
568 | $nodesToScore[] = $node; // or $newNode? | 568 | $nodesToScore[] = $node; // or $newNode? |
569 | } | 569 | } |
570 | catch(Exception $e) { | 570 | catch(Exception $e) { |
571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | 571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); |
572 | } | 572 | } |
573 | } | 573 | } |
574 | else | 574 | else |
575 | { | 575 | { |
576 | /* EXPERIMENTAL */ | 576 | /* EXPERIMENTAL */ |
577 | // TODO: change these p elements back to text nodes after processing | 577 | // TODO: change these p elements back to text nodes after processing |
578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | 578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { |
579 | $childNode = $node->childNodes->item($i); | 579 | $childNode = $node->childNodes->item($i); |
580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | 580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE |
581 | //$this->dbg('replacing text node with a p tag with the same content.'); | 581 | //$this->dbg('replacing text node with a p tag with the same content.'); |
582 | $p = $this->dom->createElement('p'); | 582 | $p = $this->dom->createElement('p'); |
583 | $p->innerHTML = $childNode->nodeValue; | 583 | $p->innerHTML = $childNode->nodeValue; |
584 | $p->setAttribute('style', 'display: inline;'); | 584 | $p->setAttribute('style', 'display: inline;'); |
585 | $p->setAttribute('class', 'readability-styled'); | 585 | $p->setAttribute('class', 'readability-styled'); |
586 | $childNode->parentNode->replaceChild($p, $childNode); | 586 | $childNode->parentNode->replaceChild($p, $childNode); |
587 | } | 587 | } |
588 | } | 588 | } |
589 | } | 589 | } |
590 | } | 590 | } |
591 | } | 591 | } |
592 | 592 | ||
593 | /** | 593 | /** |
594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | 594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
595 | * Then add their score to their parent node. | 595 | * Then add their score to their parent node. |
596 | * | 596 | * |
597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | 597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
598 | **/ | 598 | **/ |
599 | $candidates = array(); | 599 | $candidates = array(); |
600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | 600 | for ($pt=0; $pt < count($nodesToScore); $pt++) { |
601 | $parentNode = $nodesToScore[$pt]->parentNode; | 601 | $parentNode = $nodesToScore[$pt]->parentNode; |
602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | 602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; |
603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | 603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); |
604 | $innerText = $this->getInnerText($nodesToScore[$pt]); | 604 | $innerText = $this->getInnerText($nodesToScore[$pt]); |
605 | 605 | ||
606 | if (!$parentNode || !isset($parentNode->tagName)) { | 606 | if (!$parentNode || !isset($parentNode->tagName)) { |
607 | continue; | 607 | continue; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* If this paragraph is less than 25 characters, don't even count it. */ | 610 | /* If this paragraph is less than 25 characters, don't even count it. */ |
611 | if(strlen($innerText) < 25) { | 611 | if(strlen($innerText) < 25) { |
612 | continue; | 612 | continue; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* Initialize readability data for the parent. */ | 615 | /* Initialize readability data for the parent. */ |
616 | if (!$parentNode->hasAttribute('readability')) | 616 | if (!$parentNode->hasAttribute('readability')) |
617 | { | 617 | { |
618 | $this->initializeNode($parentNode); | 618 | $this->initializeNode($parentNode); |
619 | $candidates[] = $parentNode; | 619 | $candidates[] = $parentNode; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* Initialize readability data for the grandparent. */ | 622 | /* Initialize readability data for the grandparent. */ |
623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | 623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) |
624 | { | 624 | { |
625 | $this->initializeNode($grandParentNode); | 625 | $this->initializeNode($grandParentNode); |
626 | $candidates[] = $grandParentNode; | 626 | $candidates[] = $grandParentNode; |
627 | } | 627 | } |
628 | 628 | ||
629 | $contentScore = 0; | 629 | $contentScore = 0; |
630 | 630 | ||
631 | /* Add a point for the paragraph itself as a base. */ | 631 | /* Add a point for the paragraph itself as a base. */ |
632 | $contentScore++; | 632 | $contentScore++; |
633 | 633 | ||
634 | /* Add points for any commas within this paragraph */ | 634 | /* Add points for any commas within this paragraph */ |
635 | $contentScore += count(explode(',', $innerText)); | 635 | $contentScore += count(explode(',', $innerText)); |
636 | 636 | ||
637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | 637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
638 | $contentScore += min(floor(strlen($innerText) / 100), 3); | 638 | $contentScore += min(floor(strlen($innerText) / 100), 3); |
639 | 639 | ||
640 | /* Add the score to the parent. The grandparent gets half. */ | 640 | /* Add the score to the parent. The grandparent gets half. */ |
641 | $parentNode->getAttributeNode('readability')->value += $contentScore; | 641 | $parentNode->getAttributeNode('readability')->value += $contentScore; |
642 | 642 | ||
643 | if ($grandParentNode) { | 643 | if ($grandParentNode) { |
644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | 644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; |
645 | } | 645 | } |
646 | } | 646 | } |
647 | 647 | ||
648 | /** | 648 | /** |
649 | * After we've calculated scores, loop through all of the possible candidate nodes we found | 649 | * After we've calculated scores, loop through all of the possible candidate nodes we found |
650 | * and find the one with the highest score. | 650 | * and find the one with the highest score. |
651 | **/ | 651 | **/ |
652 | $topCandidate = null; | 652 | $topCandidate = null; |
653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | 653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) |
654 | { | 654 | { |
655 | /** | 655 | /** |
656 | * Scale the final candidates score based on link density. Good content should have a | 656 | * Scale the final candidates score based on link density. Good content should have a |
657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | 657 | * relatively small link density (5% or less) and be mostly unaffected by this operation. |
658 | **/ | 658 | **/ |
659 | $readability = $candidates[$c]->getAttributeNode('readability'); | 659 | $readability = $candidates[$c]->getAttributeNode('readability'); |
660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | 660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); |
661 | 661 | ||
662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | 662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); |
663 | 663 | ||
664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | 664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { |
665 | $topCandidate = $candidates[$c]; | 665 | $topCandidate = $candidates[$c]; |
666 | } | 666 | } |
667 | } | 667 | } |
668 | 668 | ||
669 | /** | 669 | /** |
670 | * If we still have no top candidate, just use the body as a last resort. | 670 | * If we still have no top candidate, just use the body as a last resort. |
671 | * We also have to copy the body node so it is something we can modify. | 671 | * We also have to copy the body node so it is something we can modify. |
672 | **/ | 672 | **/ |
673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | 673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') |
674 | { | 674 | { |
675 | $topCandidate = $this->dom->createElement('div'); | 675 | $topCandidate = $this->dom->createElement('div'); |
676 | if ($page instanceof DOMDocument) { | 676 | if ($page instanceof DOMDocument) { |
677 | if (!isset($page->documentElement)) { | 677 | if (!isset($page->documentElement)) { |
678 | // we don't have a body either? what a mess! :) | 678 | // we don't have a body either? what a mess! :) |
679 | } else { | 679 | } else { |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; | 680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; |
681 | $page->documentElement->innerHTML = ''; | 681 | $page->documentElement->innerHTML = ''; |
682 | $page->documentElement->appendChild($topCandidate); | 682 | $page->documentElement->appendChild($topCandidate); |
683 | } | 683 | } |
684 | } else { | 684 | } else { |
685 | $topCandidate->innerHTML = $page->innerHTML; | 685 | $topCandidate->innerHTML = $page->innerHTML; |
686 | $page->innerHTML = ''; | 686 | $page->innerHTML = ''; |
687 | $page->appendChild($topCandidate); | 687 | $page->appendChild($topCandidate); |
688 | } | 688 | } |
689 | $this->initializeNode($topCandidate); | 689 | $this->initializeNode($topCandidate); |
690 | } | 690 | } |
691 | 691 | ||
692 | /** | 692 | /** |
693 | * Now that we have the top candidate, look through its siblings for content that might also be related. | 693 | * Now that we have the top candidate, look through its siblings for content that might also be related. |
694 | * Things like preambles, content split by ads that we removed, etc. | 694 | * Things like preambles, content split by ads that we removed, etc. |
695 | **/ | 695 | **/ |
696 | $articleContent = $this->dom->createElement('div'); | 696 | $articleContent = $this->dom->createElement('div'); |
697 | $articleContent->setAttribute('id', 'readability-content'); | 697 | $articleContent->setAttribute('id', 'readability-content'); |
698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | 698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); |
699 | $siblingNodes = $topCandidate->parentNode->childNodes; | 699 | $siblingNodes = $topCandidate->parentNode->childNodes; |
700 | if (!isset($siblingNodes)) { | 700 | if (!isset($siblingNodes)) { |
701 | $siblingNodes = new stdClass; | 701 | $siblingNodes = new stdClass; |
702 | $siblingNodes->length = 0; | 702 | $siblingNodes->length = 0; |
703 | } | 703 | } |
704 | 704 | ||
705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | 705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) |
706 | { | 706 | { |
707 | $siblingNode = $siblingNodes->item($s); | 707 | $siblingNode = $siblingNodes->item($s); |
708 | $append = false; | 708 | $append = false; |
709 | 709 | ||
710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | 710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); |
711 | 711 | ||
712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | 712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); |
713 | 713 | ||
714 | if ($siblingNode === $topCandidate) | 714 | if ($siblingNode === $topCandidate) |
715 | // or if ($siblingNode->isSameNode($topCandidate)) | 715 | // or if ($siblingNode->isSameNode($topCandidate)) |
716 | { | 716 | { |
717 | $append = true; | 717 | $append = true; |
718 | } | 718 | } |
719 | 719 | ||
720 | $contentBonus = 0; | 720 | $contentBonus = 0; |
721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | 721 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | 722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { |
723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | 723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; |
724 | } | 724 | } |
725 | 725 | ||
726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | 726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) |
727 | { | 727 | { |
728 | $append = true; | 728 | $append = true; |
729 | } | 729 | } |
730 | 730 | ||
731 | if (strtoupper($siblingNode->nodeName) == 'P') { | 731 | if (strtoupper($siblingNode->nodeName) == 'P') { |
732 | $linkDensity = $this->getLinkDensity($siblingNode); | 732 | $linkDensity = $this->getLinkDensity($siblingNode); |
733 | $nodeContent = $this->getInnerText($siblingNode); | 733 | $nodeContent = $this->getInnerText($siblingNode); |
734 | $nodeLength = strlen($nodeContent); | 734 | $nodeLength = strlen($nodeContent); |
735 | 735 | ||
736 | if ($nodeLength > 80 && $linkDensity < 0.25) | 736 | if ($nodeLength > 80 && $linkDensity < 0.25) |
737 | { | 737 | { |
738 | $append = true; | 738 | $append = true; |
739 | } | 739 | } |
740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | 740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) |
741 | { | 741 | { |
742 | $append = true; | 742 | $append = true; |
743 | } | 743 | } |
744 | } | 744 | } |
745 | 745 | ||
746 | if ($append) | 746 | if ($append) |
747 | { | 747 | { |
748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | 748 | $this->dbg('Appending node: ' . $siblingNode->nodeName); |
749 | 749 | ||
750 | $nodeToAppend = null; | 750 | $nodeToAppend = null; |
751 | $sibNodeName = strtoupper($siblingNode->nodeName); | 751 | $sibNodeName = strtoupper($siblingNode->nodeName); |
752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | 752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { |
753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | 753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
754 | 754 | ||
755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | 755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); |
756 | $nodeToAppend = $this->dom->createElement('div'); | 756 | $nodeToAppend = $this->dom->createElement('div'); |
757 | try { | 757 | try { |
758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | 758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); |
759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | 759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; |
760 | } | 760 | } |
761 | catch(Exception $e) | 761 | catch(Exception $e) |
762 | { | 762 | { |
763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | 763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); |
764 | $nodeToAppend = $siblingNode; | 764 | $nodeToAppend = $siblingNode; |
765 | $s--; | 765 | $s--; |
766 | $sl--; | 766 | $sl--; |
767 | } | 767 | } |
768 | } else { | 768 | } else { |
769 | $nodeToAppend = $siblingNode; | 769 | $nodeToAppend = $siblingNode; |
770 | $s--; | 770 | $s--; |
771 | $sl--; | 771 | $sl--; |
772 | } | 772 | } |
773 | 773 | ||
774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | 774 | /* To ensure a node does not interfere with readability styles, remove its classnames */ |
775 | $nodeToAppend->removeAttribute('class'); | 775 | $nodeToAppend->removeAttribute('class'); |
776 | 776 | ||
777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | 777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
778 | $articleContent->appendChild($nodeToAppend); | 778 | $articleContent->appendChild($nodeToAppend); |
779 | } | 779 | } |
780 | } | 780 | } |
781 | 781 | ||
782 | /** | 782 | /** |
783 | * So we have all of the content that we need. Now we clean it up for presentation. | 783 | * So we have all of the content that we need. Now we clean it up for presentation. |
784 | **/ | 784 | **/ |
785 | $this->prepArticle($articleContent); | 785 | $this->prepArticle($articleContent); |
786 | 786 | ||
787 | /** | 787 | /** |
788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | 788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | 789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | 790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
791 | * finding the -right- content. | 791 | * finding the -right- content. |
792 | **/ | 792 | **/ |
793 | if (strlen($this->getInnerText($articleContent, false)) < 250) | 793 | if (strlen($this->getInnerText($articleContent, false)) < 250) |
794 | { | 794 | { |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 | 795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 |
796 | // in the meantime, we check and create an empty element if it's not there. | 796 | // in the meantime, we check and create an empty element if it's not there. |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); | 797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); |
798 | $this->body->innerHTML = $this->bodyCache; | 798 | $this->body->innerHTML = $this->bodyCache; |
799 | 799 | ||
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | 800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | 801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); |
802 | return $this->grabArticle($this->body); | 802 | return $this->grabArticle($this->body); |
803 | } | 803 | } |
804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | 804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { |
805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | 805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); |
806 | return $this->grabArticle($this->body); | 806 | return $this->grabArticle($this->body); |
807 | } | 807 | } |
808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | 808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { |
809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | 809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); |
810 | return $this->grabArticle($this->body); | 810 | return $this->grabArticle($this->body); |
811 | } | 811 | } |
812 | else { | 812 | else { |
813 | return false; | 813 | return false; |
814 | } | 814 | } |
815 | } | 815 | } |
816 | return $articleContent; | 816 | return $articleContent; |
817 | } | 817 | } |
818 | 818 | ||
/**
 * Strip every <script> element found beneath the given node.
 *
 * @param DOMElement $doc
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');

    // Walk from the last match back to the first so that removing a node
    // never shifts the position of an entry that is still to be visited.
    $remaining = $scriptNodes->length;
    while ($remaining-- > 0) {
        $script = $scriptNodes->item($remaining);
        $script->parentNode->removeChild($script);
    }
}
832 | 832 | ||
/**
 * Return the trimmed text content of a node.
 * Unless $normalizeSpaces is false, every match of the 'normalize'
 * regexp in that text is additionally replaced by a single space.
 *
 * @param DOMElement $e
 * @param boolean $normalizeSpaces (default: true)
 * @return string empty string when the node has no text content
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856 | 856 | ||
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e
 * @param string $s what to count (default: ",")
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);

    return substr_count($text, $s);
}
867 | 867 | ||
/**
 * Remove the style attribute from $e itself and from every element beneath it.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    if (!is_object($e)) return;

    // getElementsByTagName('*') only yields descendants, so the root node has
    // to be handled on its own. Nodes without removeAttribute() (for example
    // a document node) are skipped, since they carry no attributes.
    if (method_exists($e, 'removeAttribute')) {
        $e->removeAttribute('style');
    }

    foreach ($e->getElementsByTagName('*') as $descendant) {
        $descendant->removeAttribute('style');
    }
}
881 | 881 | ||
/**
 * Get the link density of a node: the length of the text found inside
 * its <a> descendants divided by the length of all text in the node.
 *
 * @param DOMElement $e
 * @return number ratio of link text to total text; 0 when the node has no text
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $textLength = strlen($this->getInnerText($e));

    // Nothing to divide by: a node without text has no link density.
    if ($textLength <= 0) {
        return 0;
    }

    $linkLength = 0;
    foreach ($anchors as $anchor) {
        $linkLength += strlen($this->getInnerText($anchor));
    }

    return $linkLength / $textLength;
}
903 | 903 | ||
/**
 * Get an element's class/id weight. The class and id attribute values are
 * each tested against the 'negative' and 'positive' regexps: a negative
 * match subtracts 25, a positive match adds 25.
 *
 * @param DOMElement $e
 * @return number (Integer) always 0 while FLAG_WEIGHT_CLASSES is inactive
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    /* The class name and the ID are scored by exactly the same rules */
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }

        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attrValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941 | 941 | ||
/**
 * Remove extraneous break tags from a node: every match of the
 * 'killBreaks' regexp in the node's inner HTML becomes a single <br />.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953 | 953 | ||
/**
 * Remove all elements of type "tag" from a node.
 * iframe/object/embed elements are kept when their attribute values or
 * their inner HTML match the 'video' regexp.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);
    $isEmbed = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');

    /* Go back to front so removals do not disturb entries still to be visited */
    for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
        $node = $matches->item($pos);

        if ($isEmbed) {
            /* Glue all attribute values together so one regexp test covers them all */
            $attributeValues = '';
            foreach ($node->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            /* Attributes are checked first; the inner HTML is only inspected if they do not match */
            $looksLikeVideo = preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $node->innerHTML);
            if ($looksLikeVideo) {
                continue;
            }
        }

        $node->parentNode->removeChild($node);
    }
}
989 | 989 | ||
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is judged from class/id weight, readability score, comma count,
 * content length, link density and the number of images, list items,
 * inputs, links and video embeds inside the candidate.
 *
 * Does nothing while FLAG_CLEAN_CONDITIONALLY is inactive.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);
        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        /* A negative combined score is enough to drop the node outright */
        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        /* Ten or more commas: leave the node alone */
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /**
         * Not very many commas: gather counts for other typical elements
         * embedded within and look for ominous signs.
         **/
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;
        $a = $node->getElementsByTagName('a')->length;

        /* Count the embed and iframe elements whose src matches the video regexp */
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        /* Debug message of the first rule that fires, or null to keep the node */
        $reason = null;

        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if (($img > $p) && ($img > 4)) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $reason = ' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } else if ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ($img > $p) {
                $reason = ' more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 25 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } else if (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108 | 1108 | ||
/**
 * Clean out spurious headers from an Element. An h1 or h2 is removed when
 * its class/id weight is negative or its link density exceeds 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        /* Back to front, so removals do not disturb entries still to be visited */
        for ($pos = $headers->length - 1; $pos >= 0; $pos--) {
            $header = $headers->item($pos);
            $isSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;

            if ($isSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1125 | 1125 | ||
/**
 * Tell whether the given flag overlaps the currently set flags.
 *
 * @param mixed $flag flag bit mask, such as self::FLAG_WEIGHT_CLASSES
 * @return boolean
 */
public function flagIsActive($flag) {
    $overlap = $this->flags & $flag;

    return $overlap > 0;
}
1129 | 1129 | ||
/**
 * Switch the given flag on, leaving all other flags untouched.
 *
 * @param mixed $flag flag bit mask, such as self::FLAG_WEIGHT_CLASSES
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133 | 1133 | ||
/**
 * Switch the given flag off, leaving all other flags untouched.
 *
 * @param mixed $flag flag bit mask, such as self::FLAG_WEIGHT_CLASSES
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137 | } | 1137 | } |
1138 | ?> \ No newline at end of file | 1138 | ?> \ No newline at end of file |