]>
Commit | Line | Data |
---|---|---|
42c80841 NL |
1 | <?php\r |
2 | /** \r | |
3 | * Arc90's Readability ported to PHP for FiveFilters.org\r | |
4 | * Based on readability.js version 1.7.1 (without multi-page support)\r | |
5 | * Updated to allow HTML5 parsing with html5lib\r | |
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds\r | |
7 | * ------------------------------------------------------\r | |
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js\r | |
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/\r | |
10 | * JS Source: http://code.google.com/p/arc90labs-readability\r | |
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net\r | |
12 | * More information: http://fivefilters.org/content-only/\r | |
13 | * License: Apache License, Version 2.0\r | |
14 | * Requires: PHP5\r | |
15 | * Date: 2012-09-19\r | |
16 | * \r | |
17 | * Differences between the PHP port and the original\r | |
18 | * ------------------------------------------------------\r | |
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM \r | |
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and \r | |
21 | * Javascript code executed. This PHP port does not run inside a browser. \r | |
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot \r | |
23 | * rely on CSS or Javascript support. As such, the results will not always \r | |
24 | * match Arc90's Readability. (For example, if a web page contains CSS style \r | |
25 | * rules or Javascript code which hide certain HTML elements from display, \r | |
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, \r | |
27 | * unable to understand CSS or Javascript, will not know any better.)\r | |
28 | * \r | |
29 | * Another significant difference is that the aim of Arc90's Readability is \r | |
30 | * to re-present the main content block of a given web page so users can \r | |
31 | * read it more easily in their browsers. Correct identification, clean up, \r | |
32 | * and separation of the content block is only a part of this process. \r | |
33 | * This PHP port is only concerned with this part, it does not include code \r | |
34 | * that relates to presentation in the browser - Arc90 already do \r | |
35 | * that extremely well, and for PDF output there's FiveFilters.org's \r | |
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/.\r | |
37 | * \r | |
38 | * Finally, this class contains methods that might be useful for developers \r | |
39 | * working on HTML document fragments. So without deviating too much from \r | |
40 | * the original code (which I don't want to do because it makes debugging \r | |
41 | * and updating more difficult), I've tried to make it a little more \r | |
42 | * developer friendly. You should be able to use the methods here on \r | |
43 | * existing DOMElement objects without passing an entire HTML document to \r | |
44 | * be parsed.\r | |
45 | */\r | |
46 | \r | |
47 | // This class allows us to do JavaScript-like assignments to innerHTML\r |
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');\r | |
49 | \r | |
50 | // Alternative usage (for testing only!)\r | |
51 | // uncomment the lines below and call Readability.php in your browser \r | |
52 | // passing it the URL of the page you'd like content from, e.g.:\r | |
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php\r | |
54 | \r | |
55 | /*\r | |
56 | if (!isset($_GET['url']) || $_GET['url'] == '') {\r | |
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');\r | |
58 | }\r | |
59 | $url = $_GET['url'];\r | |
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;\r | |
61 | $html = file_get_contents($url);\r | |
62 | $r = new Readability($html, $url);\r | |
63 | $r->init();\r | |
64 | echo $r->articleContent->innerHTML;\r | |
65 | */\r | |
66 | \r | |
67 | class Readability\r | |
68 | {\r | |
69 | public $version = '1.7.1-without-multi-page';\r | |
70 | public $convertLinksToFootnotes = false;\r | |
71 | public $revertForcedParagraphElements = true;\r | |
72 | public $articleTitle;\r | |
73 | public $articleContent;\r | |
74 | public $dom;\r | |
75 | public $url = null; // optional - URL where HTML was retrieved\r | |
76 | public $debug = false;\r | |
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19\r | |
78 | protected $body = null; // \r | |
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later\r | |
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.\r | |
81 | protected $success = false; // indicates whether we were able to extract or not\r | |
82 | \r | |
83 | /**\r | |
84 | * All of the regular expressions in use within readability.\r | |
85 | * Defined up here so we don't instantiate them repeatedly in loops.\r | |
86 | **/\r | |
87 | public $regexps = array(\r | |
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',\r | |
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',\r | |
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',\r | |
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',\r | |
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',\r | |
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',\r | |
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i',\r | |
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()\r | |
96 | 'normalize' => '/\s{2,}/',\r | |
97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/',\r | |
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',\r | |
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'\r | |
100 | ); \r | |
101 | \r | |
102 | /* constants */\r | |
103 | const FLAG_STRIP_UNLIKELYS = 1;\r | |
104 | const FLAG_WEIGHT_CLASSES = 2;\r | |
105 | const FLAG_CLEAN_CONDITIONALLY = 4;\r | |
106 | \r | |
/**
 * Create instance of Readability.
 *
 * The raw markup is pre-processed before it is parsed: runs of two or more
 * <br> tags become paragraph breaks, <font> tags become <span> tags, and
 * every non-ASCII character is rewritten as a numeric character reference
 * so the HTML parser does not have to guess the input encoding.
 *
 * @param string $html   UTF-8 encoded string
 * @param string $url    (optional) URL associated with HTML (used for footnotes)
 * @param string $parser which parser to use for turning raw HTML into a
 *                       DOMDocument (either 'libxml' or 'html5lib'). libxml is
 *                       used as a fallback when html5lib is requested but the
 *                       HTML5_Parser class is not loaded or its parse fails.
 */
public function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    /* Turn all double br's into p's and font tags into spans.
       preg_replace() returns null on a PCRE error (e.g. the backtrack limit
       being hit on a very large page); keep the untouched markup in that
       case instead of silently throwing the whole document away. */
    $rewrites = array(
        'replaceBrs'   => '</p><p>',
        'replaceFonts' => '<$1span>'
    );
    foreach ($rewrites as $regexName => $replacement) {
        $rewritten = preg_replace($this->regexps[$regexName], $replacement, $html);
        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }

    /* Encode everything above ASCII as numeric entities. This replaces
       mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'), which is
       deprecated as of PHP 8.2; the parsed DOM text is the same. */
    $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    if (trim($html) == '') $html = '<html></html>';

    $this->dom = null;
    if ($parser == 'html5lib' && class_exists('HTML5_Parser')) {
        $this->dom = HTML5_Parser::parse($html);
    }

    if (!$this->dom) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        /* Real-world HTML makes libxml very noisy; collect its errors
           internally instead of using the @ operator, then restore the
           caller's libxml error-handling mode. */
        $previousSetting = libxml_use_internal_errors(true);
        $this->dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }

    /* Elements created from now on are JSLikeHTMLElement instances. */
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130 | \r | |
/**
 * Accessor for the article title element.
 *
 * @return DOMElement the value stored in $articleTitle by init()
 *                    (null until init() has run)
 */
public function getTitle()
{
    $title = $this->articleTitle;

    return $title;
}
138 | \r | |
/**
 * Accessor for the article content element.
 *
 * @return DOMElement the value stored in $articleContent by init()
 *                    (null until init() has run)
 */
public function getContent()
{
    $content = $this->articleContent;

    return $content;
}
146 | \r | |
/**
 * Runs readability.
 *
 * Steps, in order:
 *  1. Strip scripts and prepare the document (style tags, body element).
 *  2. Work out the article title and grab the article content.
 *  3. Wrap both in div#readOverlay > div#readInner.
 *  4. Empty the body and put the wrapper in its place.
 *  5. Run post-processing and publish title/content on the instance.
 *
 * @return boolean true if we found content, false otherwise (including the
 *                 case where the document has no root element at all)
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }

    $this->removeScripts($this->dom);

    // Optimistic default; reset below if no article could be extracted.
    $this->success = true;

    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            // Remember the untouched body markup in case it is needed again.
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* The title must be read before grabArticle() starts rewriting the tree. */
    $articleTitle = $this->getArticleTitle();
    $articleContent = $this->grabArticle();

    if (!$articleContent) {
        // Nothing usable was found: report failure and show a placeholder.
        $this->success = false;
        $articleContent = $this->dom->createElement('div');
        $articleContent->setAttribute('id', 'readability-content');
        $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Build readability's wrapper structure and glue it together. */
    $innerDiv = $this->dom->createElement('div');
    $innerDiv->setAttribute('id', 'readInner');
    $innerDiv->appendChild($articleTitle);
    $innerDiv->appendChild($articleContent);

    $overlay = $this->dom->createElement('div');
    $overlay->setAttribute('id', 'readOverlay');
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($articleContent);

    // Expose the results through the public instance variables.
    $this->articleTitle = $articleTitle;
    $this->articleContent = $articleContent;

    return $this->success;
}
219 | \r | |
/**
 * Print a debug message, prefixed with "* " and followed by a newline,
 * but only when the public $debug flag is switched on.
 *
 * @param string $msg text to print
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }

    echo '* ' . $msg . "\n";
}
226 | \r | |
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts inline links to footnotes, and only when
 * $convertLinksToFootnotes is enabled and the source URL does not contain
 * "wikipedia.org".
 *
 * @param DOMElement $articleContent the extracted article element
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }

    // $url is optional and may be null. Cast it explicitly instead of
    // hiding the null subject behind the @ operator.
    if (preg_match('/wikipedia\.org/', (string) $this->url)) {
        return;
    }

    $this->addFootnotes($articleContent);
}
238 | \r | |
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's first <title> element and tries to
 * strip site names separated by " | ", " - " or ": ". Very long (> 150
 * characters) or very short (< 15 characters) titles are replaced by the
 * page's H1 when there is exactly one. If the result has four words or
 * fewer, the original, unmodified title is used instead.
 *
 * The title is inserted as a text node, so characters such as "<" and "&"
 * in the title are kept as literal text rather than being parsed as markup.
 *
 * @return DOMElement a new, detached h1 element holding the title text
 */
protected function getArticleTitle()
{
    $curTitle = '';
    $origTitle = '';

    try {
        $titleElement = $this->dom->getElementsByTagName('title')->item(0);
        if ($titleElement !== null) {
            $curTitle = $origTitle = $this->getInnerText($titleElement);
        }
    } catch (Exception $e) {
        // Best effort: carry on with an empty title, but leave a trace.
        $this->dbg('Could not read the document title: ' . $e->getMessage());
    }

    if (preg_match('/ [\|\-] /', $curTitle))
    {
        // "Article title | Site name": keep what precedes the last separator...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ...unless that leaves fewer than three words, then keep what follows the first.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    }
    else if (strpos($curTitle, ': ') !== false)
    {
        // "Site name: Article title": keep what follows the last colon...
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        // ...unless that leaves fewer than three words, then keep what follows the first.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    }
    else
    {
        // Measure in characters, not bytes: the DOM hands us UTF-8 text.
        $titleLength = mb_strlen($curTitle, 'UTF-8');
        if ($titleLength > 150 || $titleLength < 15)
        {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1)
            {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    if ($curTitle != '') {
        // Plain text, not markup: never route it through innerHTML.
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));
    }

    return $articleTitle;
}
288 | \r | |
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Makes sure a body element exists, tags it with id="readabilityBody" and
 * removes every <style> element from the document.
 *
 * The double-<br> to <p> and <font> to <span> rewrites done at this point by
 * the JavaScript original happen in the constructor instead, because that is
 * where the raw HTML is still available as a string.
 *
 * @return void
 **/
protected function prepDocument()
{
    /* If no body element was found (totally hosed HTML, for example),
       create one and attach it to the document element. */
    if ($this->body == null) {
        $replacementBody = $this->dom->createElement('body');
        $this->body = $replacementBody;
        $this->dom->documentElement->appendChild($replacementBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags, walking the list from the last entry backwards. */
    $styleTags = $this->dom->getElementsByTagName('style');
    $remaining = $styleTags->length;
    while ($remaining-- > 0) {
        $styleTag = $styleTags->item($remaining);
        $styleTag->parentNode->removeChild($styleTag);
    }
}
320 | \r | |
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link in the article gets a superscript "[n]" reference inserted
 * right after it, and a matching entry in an ordered list that is appended
 * to the article inside div#readability-footnotes. Links carrying the class
 * "readability-DoNotFootnote", or whose text matches the skipFootnoteLink
 * regex, are left alone.
 *
 * @param DOMElement $articleContent the article element to modify in place
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    /* The reference links inserted below are themselves 'a' elements, so
       this list grows while we iterate. They are stepped over by the
       DoNotFootnote class check on the following pass. */
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        /* Only build the footnote nodes once we know the link is kept. */
        $footnoteLink = $articleLink->cloneNode(true);
        $refLink = $this->dom->createElement('a');
        $footnote = $this->dom->createElement('li');

        /* Host of the link target, falling back to the host of the page itself.
           The @ is kept because old PHP 5 releases warn on malformed URLs. */
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        /* insertBefore() appends when the reference node is null, so the
           "link is the last child" case needs no special handling and no
           comparison of node objects. */
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        /* Label the footnote with the link's title attribute, or its text
           when there is no title. Both are plain text, so they go in as a
           text node and cannot be interpreted as markup. */
        $footnoteTitle = $footnoteLink->getAttribute('title');
        while ($footnoteLink->firstChild) {
            $footnoteLink->removeChild($footnoteLink->firstChild);
        }
        $footnoteLink->appendChild($this->dom->createTextNode($footnoteTitle != '' ? $footnoteTitle : $linkText));
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387 | \r | |
/**
 * Turns every descendant P element whose class attribute is exactly
 * 'readability-styled' back into a plain text node carrying the
 * element's text content.
 *
 * @param DOMElement $articleContent element whose descendants are processed
 * @return void
 */
public function revertReadabilityStyledElements($articleContent)
{
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk the result set from the last match back to the first.
    $position = $styledParagraphs->length;
    while ($position-- > 0) {
        $paragraph = $styledParagraphs->item($position);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 | \r | |
/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * The order of the clean-up calls matters: the conditional cleaning of
 * tables, lists and divs runs after the unconditional removals because
 * those may already have taken out junk that affects the outcome.
 *
 * @param DOMElement $articleContent the article element to clean in place
 * @return void
 */
public function prepArticle($articleContent)
{
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     * Skipped in lightClean mode.
     ***/
    if (!$this->lightClean && ($articleContent->getElementsByTagName('h2')->length == 1)) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    $this->cleanConditionally($articleContent, 'table');
    $this->cleanConditionally($articleContent, 'ul');
    $this->cleanConditionally($articleContent, 'div');

    /* Remove extra paragraphs: those without text and without any media.
       The list is live, so walk it backwards and fetch each item only once. */
    $mediaTags = array('img', 'embed', 'object', 'iframe');
    $articleParagraphs = $articleContent->getElementsByTagName('p');
    for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
    {
        $paragraph = $articleParagraphs->item($i);

        $hasMedia = false;
        foreach ($mediaTags as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length > 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '')
        {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    /* Drop line breaks that sit directly in front of a paragraph. */
    try {
        $cleanedHtml = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        if ($cleanedHtml === null) {
            // PCRE failure: leave the article untouched rather than emptying it.
            $this->dbg('Cleaning innerHTML of breaks failed (PCRE error). Ignoring.');
        } else {
            $articleContent->innerHTML = $cleanedHtml;
        }
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 | \r | |
/**
 * Attach a 'readability' attribute to the node; its value is the node's
 * content score. The starting score depends on the tag name (0 for tags
 * not listed below) and is then adjusted by getClassWeight().
 *
 * @param Element $node element to initialise
 * @return void
 **/
protected function initializeNode($node)
{
    // Starting score per (upper-cased) tag name.
    $tagScores = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5
    );

    // strtoupper may not be needed, but is used just in case.
    $tag = strtoupper($node->tagName);
    $startingScore = isset($tagScores[$tag]) ? $tagScores[$tag] : 0;

    $readability = $this->dom->createAttribute('readability');
    $readability->value = $startingScore; // this is our contentScore
    $node->setAttributeNode($readability);

    $readability->value += $this->getClassWeight($node);
}
513 | \r | |
514 | /***\r | |
515 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is\r | |
516 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.\r | |
517 | *\r | |
518 | * @return DOMElement\r | |
519 | **/\r | |
520 | protected function grabArticle($page=null) {\r | |
521 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);\r | |
522 | if (!$page) $page = $this->dom;\r | |
523 | $allElements = $page->getElementsByTagName('*');\r | |
524 | /**\r | |
525 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs\r | |
526 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)\r | |
527 | *\r | |
528 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5\r | |
529 | * TODO: Shouldn't this be a reverse traversal?\r | |
530 | **/\r | |
531 | $node = null;\r | |
532 | $nodesToScore = array();\r | |
533 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {\r | |
534 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {\r | |
535 | //$node = $targetList->item($nodeIndex);\r | |
536 | $tagName = strtoupper($node->tagName);\r | |
537 | /* Remove unlikely candidates */\r | |
538 | if ($stripUnlikelyCandidates) {\r | |
539 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');\r | |
540 | if (\r | |
541 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&\r | |
542 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&\r | |
543 | $tagName != 'BODY'\r | |
544 | )\r | |
545 | {\r | |
546 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);\r | |
547 | //$nodesToRemove[] = $node;\r | |
548 | $node->parentNode->removeChild($node);\r | |
549 | $nodeIndex--;\r | |
550 | continue;\r | |
551 | } \r | |
552 | }\r | |
553 | \r | |
554 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {\r | |
555 | $nodesToScore[] = $node;\r | |
556 | }\r | |
557 | \r | |
558 | /* Turn all divs that don't have children block level elements into p's */\r | |
559 | if ($tagName == 'DIV') {\r | |
560 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {\r | |
561 | //$this->dbg('Altering div to p');\r | |
562 | $newNode = $this->dom->createElement('p');\r | |
563 | try {\r | |
564 | $newNode->innerHTML = $node->innerHTML;\r | |
565 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);\r | |
566 | $node->parentNode->replaceChild($newNode, $node);\r | |
567 | $nodeIndex--;\r | |
568 | $nodesToScore[] = $node; // or $newNode?\r | |
569 | }\r | |
570 | catch(Exception $e) {\r | |
571 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);\r | |
572 | }\r | |
573 | }\r | |
574 | else\r | |
575 | {\r | |
576 | /* EXPERIMENTAL */\r | |
577 | // TODO: change these p elements back to text nodes after processing\r | |
578 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {\r | |
579 | $childNode = $node->childNodes->item($i);\r | |
580 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE\r | |
581 | //$this->dbg('replacing text node with a p tag with the same content.');\r | |
582 | $p = $this->dom->createElement('p');\r | |
583 | $p->innerHTML = $childNode->nodeValue;\r | |
584 | $p->setAttribute('style', 'display: inline;');\r | |
585 | $p->setAttribute('class', 'readability-styled');\r | |
586 | $childNode->parentNode->replaceChild($p, $childNode);\r | |
587 | }\r | |
588 | }\r | |
589 | }\r | |
590 | }\r | |
591 | }\r | |
592 | \r | |
593 | /**\r | |
594 | * Loop through all paragraphs, and assign a score to them based on how content-y they look.\r | |
595 | * Then add their score to their parent node.\r | |
596 | *\r | |
597 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.\r | |
598 | **/\r | |
599 | $candidates = array();\r | |
600 | for ($pt=0; $pt < count($nodesToScore); $pt++) {\r | |
601 | $parentNode = $nodesToScore[$pt]->parentNode;\r | |
602 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null;\r | |
603 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);\r | |
604 | $innerText = $this->getInnerText($nodesToScore[$pt]);\r | |
605 | \r | |
606 | if (!$parentNode || !isset($parentNode->tagName)) {\r | |
607 | continue;\r | |
608 | }\r | |
609 | \r | |
610 | /* If this paragraph is less than 25 characters, don't even count it. */\r | |
611 | if(strlen($innerText) < 25) {\r | |
612 | continue;\r | |
613 | }\r | |
614 | \r | |
615 | /* Initialize readability data for the parent. */\r | |
616 | if (!$parentNode->hasAttribute('readability')) \r | |
617 | {\r | |
618 | $this->initializeNode($parentNode);\r | |
619 | $candidates[] = $parentNode;\r | |
620 | }\r | |
621 | \r | |
622 | /* Initialize readability data for the grandparent. */\r | |
623 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))\r | |
624 | {\r | |
625 | $this->initializeNode($grandParentNode);\r | |
626 | $candidates[] = $grandParentNode;\r | |
627 | }\r | |
628 | \r | |
629 | $contentScore = 0;\r | |
630 | \r | |
631 | /* Add a point for the paragraph itself as a base. */\r | |
632 | $contentScore++;\r | |
633 | \r | |
634 | /* Add points for any commas within this paragraph */\r | |
635 | $contentScore += count(explode(',', $innerText));\r | |
636 | \r | |
637 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */\r | |
638 | $contentScore += min(floor(strlen($innerText) / 100), 3);\r | |
639 | \r | |
640 | /* Add the score to the parent. The grandparent gets half. */\r | |
641 | $parentNode->getAttributeNode('readability')->value += $contentScore;\r | |
642 | \r | |
643 | if ($grandParentNode) {\r | |
644 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; \r | |
645 | }\r | |
646 | }\r | |
647 | \r | |
648 | /**\r | |
649 | * After we've calculated scores, loop through all of the possible candidate nodes we found\r | |
650 | * and find the one with the highest score.\r | |
651 | **/\r | |
652 | $topCandidate = null;\r | |
653 | for ($c=0, $cl=count($candidates); $c < $cl; $c++)\r | |
654 | {\r | |
655 | /**\r | |
656 | * Scale the final candidates score based on link density. Good content should have a\r | |
657 | * relatively small link density (5% or less) and be mostly unaffected by this operation.\r | |
658 | **/\r | |
659 | $readability = $candidates[$c]->getAttributeNode('readability');\r | |
660 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));\r | |
661 | \r | |
662 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);\r | |
663 | \r | |
664 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {\r | |
665 | $topCandidate = $candidates[$c];\r | |
666 | }\r | |
667 | }\r | |
668 | \r | |
669 | /**\r | |
670 | * If we still have no top candidate, just use the body as a last resort.\r | |
671 | * We also have to copy the body node so it is something we can modify.\r | |
672 | **/\r | |
673 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')\r | |
674 | {\r | |
675 | $topCandidate = $this->dom->createElement('div');\r | |
676 | if ($page instanceof DOMDocument) {\r | |
677 | if (!isset($page->documentElement)) {\r | |
678 | // we don't have a body either? what a mess! :)\r | |
679 | } else {\r | |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML;\r | |
681 | $page->documentElement->innerHTML = '';\r | |
682 | $page->documentElement->appendChild($topCandidate);\r | |
683 | }\r | |
684 | } else {\r | |
685 | $topCandidate->innerHTML = $page->innerHTML;\r | |
686 | $page->innerHTML = '';\r | |
687 | $page->appendChild($topCandidate);\r | |
688 | }\r | |
689 | $this->initializeNode($topCandidate);\r | |
690 | }\r | |
691 | \r | |
692 | /**\r | |
693 | * Now that we have the top candidate, look through its siblings for content that might also be related.\r | |
694 | * Things like preambles, content split by ads that we removed, etc.\r | |
695 | **/\r | |
696 | $articleContent = $this->dom->createElement('div');\r | |
697 | $articleContent->setAttribute('id', 'readability-content');\r | |
698 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);\r | |
699 | $siblingNodes = $topCandidate->parentNode->childNodes;\r | |
700 | if (!isset($siblingNodes)) {\r | |
701 | $siblingNodes = new stdClass;\r | |
702 | $siblingNodes->length = 0;\r | |
703 | }\r | |
704 | \r | |
705 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)\r | |
706 | {\r | |
707 | $siblingNode = $siblingNodes->item($s);\r | |
708 | $append = false;\r | |
709 | \r | |
710 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));\r | |
711 | \r | |
712 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));\r | |
713 | \r | |
714 | if ($siblingNode === $topCandidate)\r | |
715 | // or if ($siblingNode->isSameNode($topCandidate))\r | |
716 | {\r | |
717 | $append = true;\r | |
718 | }\r | |
719 | \r | |
720 | $contentBonus = 0;\r | |
721 | /* Give a bonus if sibling nodes and top candidates have the exact same classname */\r | |
722 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {\r | |
723 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;\r | |
724 | }\r | |
725 | \r | |
726 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)\r | |
727 | {\r | |
728 | $append = true;\r | |
729 | }\r | |
730 | \r | |
731 | if (strtoupper($siblingNode->nodeName) == 'P') {\r | |
732 | $linkDensity = $this->getLinkDensity($siblingNode);\r | |
733 | $nodeContent = $this->getInnerText($siblingNode);\r | |
734 | $nodeLength = strlen($nodeContent);\r | |
735 | \r | |
736 | if ($nodeLength > 80 && $linkDensity < 0.25)\r | |
737 | {\r | |
738 | $append = true;\r | |
739 | }\r | |
740 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))\r | |
741 | {\r | |
742 | $append = true;\r | |
743 | }\r | |
744 | }\r | |
745 | \r | |
746 | if ($append)\r | |
747 | {\r | |
748 | $this->dbg('Appending node: ' . $siblingNode->nodeName);\r | |
749 | \r | |
750 | $nodeToAppend = null;\r | |
751 | $sibNodeName = strtoupper($siblingNode->nodeName);\r | |
752 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {\r | |
753 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */\r | |
754 | \r | |
755 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');\r | |
756 | $nodeToAppend = $this->dom->createElement('div');\r | |
757 | try {\r | |
758 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));\r | |
759 | $nodeToAppend->innerHTML = $siblingNode->innerHTML;\r | |
760 | }\r | |
761 | catch(Exception $e)\r | |
762 | {\r | |
763 | $this->dbg('Could not alter siblingNode to div, reverting back to original.');\r | |
764 | $nodeToAppend = $siblingNode;\r | |
765 | $s--;\r | |
766 | $sl--;\r | |
767 | }\r | |
768 | } else {\r | |
769 | $nodeToAppend = $siblingNode;\r | |
770 | $s--;\r | |
771 | $sl--;\r | |
772 | }\r | |
773 | \r | |
774 | /* To ensure a node does not interfere with readability styles, remove its classnames */\r | |
775 | $nodeToAppend->removeAttribute('class');\r | |
776 | \r | |
777 | /* Append sibling and subtract from our list because it removes the node when you append to another node */\r | |
778 | $articleContent->appendChild($nodeToAppend);\r | |
779 | }\r | |
780 | }\r | |
781 | \r | |
782 | /**\r | |
783 | * So we have all of the content that we need. Now we clean it up for presentation.\r | |
784 | **/\r | |
785 | $this->prepArticle($articleContent);\r | |
786 | \r | |
787 | /**\r | |
788 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content.\r | |
789 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher\r | |
790 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of\r | |
791 | * finding the -right- content.\r | |
792 | **/\r | |
793 | if (strlen($this->getInnerText($articleContent, false)) < 250)\r | |
794 | {\r | |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7\r | |
796 | // in the meantime, we check and create an empty element if it's not there.\r | |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');\r | |
798 | $this->body->innerHTML = $this->bodyCache;\r | |
799 | \r | |
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {\r | |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);\r | |
802 | return $this->grabArticle($this->body);\r | |
803 | }\r | |
804 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {\r | |
805 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES);\r | |
806 | return $this->grabArticle($this->body); \r | |
807 | }\r | |
808 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {\r | |
809 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);\r | |
810 | return $this->grabArticle($this->body);\r | |
811 | }\r | |
812 | else {\r | |
813 | return false;\r | |
814 | }\r | |
815 | }\r | |
816 | return $articleContent;\r | |
817 | }\r | |
818 | \r | |
/**
 * Detach every <script> element found beneath the given node.
 *
 * The list of matches is walked from its last entry down to its first,
 * and each match is removed from its own parent.
 *
 * @param DOMElement $doc node exposing getElementsByTagName()
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');
    $index = $scriptNodes->length;
    while (--$index >= 0) {
        $script = $scriptNodes->item($index);
        $script->parentNode->removeChild($script);
    }
}
832 | \r | |
/**
 * Return the text content of a node with leading/trailing whitespace trimmed.
 *
 * When $normalizeSpaces is true, the trimmed text is additionally run through
 * preg_replace() using $this->regexps['normalize'], with every match replaced
 * by a single space.
 *
 * @param DOMElement $e node to read; a value whose textContent is unset or
 *                      empty yields ''
 * @param boolean $normalizeSpaces (default: true)
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);
    if (!$normalizeSpaces) {
        return $trimmed;
    }
    return preg_replace($this->regexps['normalize'], ' ', $trimmed);
}
856 | \r | |
/**
 * Count how many times the string $s occurs in the inner text of node $e.
 *
 * The text is obtained through getInnerText() with its default arguments.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);
    return substr_count($text, $s);
}
867 | \r | |
/**
 * Remove the style attribute from $e itself and from every element under it.
 *
 * Values that are not objects, or objects that do not offer
 * getElementsByTagName(), are ignored.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    // Nothing to clean for null/scalars or for nodes that cannot contain elements.
    if (!is_object($e) || !method_exists($e, 'getElementsByTagName')) {
        return;
    }

    // getElementsByTagName('*') only returns descendants, so the root's own
    // style attribute has to be dropped explicitly.
    if ($e instanceof DOMElement) {
        $e->removeAttribute('style');
    }

    foreach ($e->getElementsByTagName('*') as $descendant) {
        $descendant->removeAttribute('style');
    }
}
881 | \r | |
/**
 * Get the density of links as a fraction of the content.
 * This is the length of the text found inside <a> elements divided by the
 * length of all text in the node (both measured with strlen()).
 *
 * @param DOMElement $e
 * @return number the ratio, or 0 when the node has no text at all
 */
public function getLinkDensity($e) {
    $totalLength = strlen($this->getInnerText($e));

    $anchorLength = 0;
    foreach ($e->getElementsByTagName('a') as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    if ($totalLength <= 0) {
        return 0;
    }
    return $anchorLength / $totalLength;
}
903 | \r | |
/**
 * Score an element from its class and id attributes.
 *
 * Returns 0 straight away when FLAG_WEIGHT_CLASSES is not active. Otherwise
 * each non-empty attribute (class, then id) subtracts 25 when it matches
 * $this->regexps['negative'] and adds 25 when it matches
 * $this->regexps['positive']; both adjustments can apply to the same attribute.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }
        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }
        if (preg_match($this->regexps['negative'], $attrValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $weight += 25;
        }
    }
    return $weight;
}
941 | \r | |
/**
 * Remove extraneous break tags from a node.
 *
 * The node's innerHTML is read, every match of $this->regexps['killBreaks']
 * is replaced with '<br />', and the result is assigned back to innerHTML.
 *
 * @param DOMElement $node element exposing a readable/writable innerHTML property
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953 | \r | |
/**
 * Remove every descendant of $e whose tag name is $tag.
 *
 * For 'iframe', 'object' and 'embed' an element is kept when either the
 * concatenation of its attribute values or its innerHTML matches
 * $this->regexps['video'] (youtube/vimeo style embeds).
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);
    $mayBeVideo = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');

    // Walk from the last match to the first while removing nodes.
    for ($pos = $matches->length - 1; $pos >= 0; $pos--) {
        $node = $matches->item($pos);

        if ($mayBeVideo) {
            /* Join all attribute values so a single regex test covers them. */
            $attributeValues = '';
            foreach ($node->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            /* Keep the element if its attributes, or failing that its contents, look like a video. */
            if (preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $node->innerHTML)) {
                continue;
            }
        }

        $node->parentNode->removeChild($node);
    }
}
989 | \r | |
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is judged from class/id weight, stored readability score, comma
 * count, link density, content length and the number of paragraphs, images,
 * list items, inputs, links and video embeds inside the candidate.
 *
 * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. The rule set used
 * depends on $this->lightClean.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);
    $isListTag = ($tag == 'ul' || $tag == 'ol');

    /**
     * Traverse backwards so nodes can be removed without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);
        $hasScore = $node->hasAttribute('readability');
        $weight = $this->getClassWeight($node);
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        /* A negative combined score removes the node outright. */
        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        /* Nodes with ten or more commas are left alone. */
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /**
         * Not very many commas: gather counts for typical embedded elements
         * and look for ominous signs.
         **/
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;
        $a = $node->getElementsByTagName('a')->length;

        /* Count <embed> and <iframe> children whose src looks like a video. */
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        /* Debug message of the first rule that fires, or null to keep the node. */
        $reason = null;

        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if ($img > $p && $img > 4) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } elseif ($li > $p && !$isListTag) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } elseif ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } elseif ($contentLength < 25 && $embedCount === 0 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images';
            } elseif ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } elseif ($a > 2 && $weight >= 25 && $linkDensity > 0.5) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } elseif ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ($img > $p) {
                $reason = ' more image elements than paragraph elements';
            } elseif ($li > $p && !$isListTag) {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } elseif ($input > floor($p/3)) {
                $reason = ' too many <input> elements';
            } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } elseif ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } elseif ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108 | \r | |
/**
 * Clean out spurious headers from an element.
 *
 * Every <h1> and <h2> under $e is removed when its class weight is negative
 * or its link density is above 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);
        for ($pos = $headers->length - 1; $pos >= 0; $pos--) {
            $header = $headers->item($pos);
            $spurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;
            if ($spurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1125 | \r | |
/**
 * Tell whether a flag is currently set.
 *
 * @param int $flag bit mask to test against $this->flags
 * @return boolean true when the bitwise AND of $this->flags and $flag is greater than zero
 */
public function flagIsActive($flag) {
    $masked = $this->flags & $flag;
    return $masked > 0;
}
1129 | \r | |
/**
 * Switch a flag on by OR-ing it into $this->flags.
 *
 * @param int $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133 | \r | |
/**
 * Switch a flag off by AND-ing $this->flags with the complement of $flag.
 *
 * @param int $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137 | }\r | |
1138 | ?> |