diff options
author | Thomas Citharel <tcit@tcit.fr> | 2014-12-14 00:16:39 +0100 |
---|---|---|
committer | Thomas Citharel <tcit@tcit.fr> | 2014-12-14 00:16:39 +0100 |
commit | 1256e4c645c86f4165d052bf8116b957b373b837 (patch) | |
tree | 3cc3bd960df403a6e0645afcd9e4f5bbb68bfc0e /inc/3rdparty/libraries/MOBIClass/readability | |
parent | 9c55ed0923273da886497cb62302a79622e34a74 (diff) | |
parent | dc69d3e8d895c87f9d26c2d1309e40d6090d4c8d (diff) | |
download | wallabag-1256e4c645c86f4165d052bf8116b957b373b837.tar.gz wallabag-1256e4c645c86f4165d052bf8116b957b373b837.tar.zst wallabag-1256e4c645c86f4165d052bf8116b957b373b837.zip |
Merge branch 'tcitworld-ebook' into dev
Diffstat (limited to 'inc/3rdparty/libraries/MOBIClass/readability')
-rw-r--r-- | inc/3rdparty/libraries/MOBIClass/readability/JSLikeHTMLElement.php | 110 | ||||
-rw-r--r-- | inc/3rdparty/libraries/MOBIClass/readability/Readability.php | 1069 |
2 files changed, 1179 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/MOBIClass/readability/JSLikeHTMLElement.php b/inc/3rdparty/libraries/MOBIClass/readability/JSLikeHTMLElement.php new file mode 100644 index 00000000..1a8ec88c --- /dev/null +++ b/inc/3rdparty/libraries/MOBIClass/readability/JSLikeHTMLElement.php | |||
@@ -0,0 +1,110 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * JavaScript-like HTML DOM Element | ||
4 | * | ||
5 | * This class extends PHP's DOMElement to allow | ||
6 | * users to get and set the innerHTML property of | ||
7 | * HTML elements in the same way it's done in | ||
8 | * JavaScript. | ||
9 | * | ||
10 | * Example usage: | ||
11 | * @code | ||
12 | * require_once 'JSLikeHTMLElement.php'; | ||
13 | * header('Content-Type: text/plain'); | ||
14 | * $doc = new DOMDocument(); | ||
15 | * $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); | ||
16 | * $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>'); | ||
17 | * $elem = $doc->getElementsByTagName('div')->item(0); | ||
18 | * | ||
19 | * // print innerHTML | ||
20 | * echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>' | ||
21 | * echo "\n\n"; | ||
22 | * | ||
23 | * // set innerHTML | ||
24 | * $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; | ||
25 | * echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' | ||
26 | * echo "\n\n"; | ||
27 | * | ||
28 | * // print document (with our changes) | ||
29 | * echo $doc->saveXML(); | ||
30 | * @endcode | ||
31 | * | ||
32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net | ||
33 | * @see http://fivefilters.org (the project this was written for) | ||
34 | */ | ||
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Magic setter emulating JavaScript's writable innerHTML property:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * All existing children are removed first. The new markup is parsed as
     * XML; if that fails it is re-parsed with the HTML parser. If both
     * parsers reject it the element is simply left empty.
     * Any property name other than 'innerHTML' raises an E_USER_NOTICE.
     *
     * @param string $name  property being assigned
     * @param string $value markup to place inside this element
     */
    public function __set($name, $value)
    {
        if ($name != 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // empty the element, walking from the last child to the first
        $index = $this->childNodes->length;
        while (--$index >= 0) {
            $this->removeChild($this->childNodes->item($index));
        }

        // nothing to insert
        if ($value == '') {
            return;
        }

        // first attempt: appendXML() needs well-formed markup (XHTML);
        // @ hides the warnings it emits for anything else
        $fragment = $this->ownerDocument->createDocumentFragment();
        if (@$fragment->appendXML($value)) {
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }
            return;
        }

        // second attempt: the markup is probably ill-formed, so let the
        // HTML parser deal with it in a scratch document.
        // The made-up <htmlfragment> wrapper triggers a warning (suppressed),
        // but it gives us a single container to pull the parsed nodes from
        // instead of the <html><body> wrapping loadHTML() adds.
        $scratch = new DOMDocument();
        $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
        if (!@$scratch->loadHTML('<htmlfragment>'.$value.'</htmlfragment>')) {
            // both parsers gave up - the element stays empty
            return;
        }

        $wrapper = $scratch->getElementsByTagName('htmlfragment')->item(0);
        foreach ($wrapper->childNodes as $child) {
            // nodes belong to the scratch document, so copy them across
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Magic getter emulating JavaScript's readable innerHTML property:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * @param string $name property being read
     * @return string|null serialised children for 'innerHTML'; null (after
     *                     raising an E_USER_NOTICE) for any other name
     */
    public function __get($name)
    {
        if ($name != 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return null;
        }

        // serialise each child in document order and glue the pieces together
        $markup = '';
        foreach ($this->childNodes as $child) {
            $markup .= $this->ownerDocument->saveXML($child);
        }
        return $markup;
    }

    /**
     * String form of the element: its tag name in square brackets.
     *
     * @return string
     */
    public function __toString()
    {
        return sprintf('[%s]', $this->tagName);
    }
}
110 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/libraries/MOBIClass/readability/Readability.php b/inc/3rdparty/libraries/MOBIClass/readability/Readability.php new file mode 100644 index 00000000..91554243 --- /dev/null +++ b/inc/3rdparty/libraries/MOBIClass/readability/Readability.php | |||
@@ -0,0 +1,1069 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Arc90's Readability ported to PHP for FiveFilters.org | ||
4 | * Based on readability.js version 1.7.1 (without multi-page support) | ||
5 | * ------------------------------------------------------ | ||
6 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | ||
7 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | ||
8 | * JS Source: http://code.google.com/p/arc90labs-readability | ||
9 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | ||
10 | * More information: http://fivefilters.org/content-only/ | ||
11 | * License: Apache License, Version 2.0 | ||
12 | * Requires: PHP5 | ||
13 | * Date: 2010-10-29 | ||
14 | * | ||
15 | * Differences between the PHP port and the original | ||
16 | * ------------------------------------------------------ | ||
17 | * Arc90's Readability is designed to run in the browser. It works on the DOM | ||
18 | * tree (the parsed HTML) after the page's CSS styles have been applied and | ||
19 | * Javascript code executed. This PHP port does not run inside a browser. | ||
20 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | ||
21 | * rely on CSS or Javascript support. As such, the results will not always | ||
22 | * match Arc90's Readability. (For example, if a web page contains CSS style | ||
23 | * rules or Javascript code which hide certain HTML elements from display, | ||
24 | * Arc90's Readability will dismiss those from consideration but our PHP port, | ||
25 | * unable to understand CSS or Javascript, will not know any better.) | ||
26 | * | ||
27 | * Another significant difference is that the aim of Arc90's Readability is | ||
28 | * to re-present the main content block of a given web page so users can | ||
29 | * read it more easily in their browsers. Correct identification, clean up, | ||
30 | * and separation of the content block is only a part of this process. | ||
31 | * This PHP port is only concerned with this part, it does not include code | ||
32 | * that relates to presentation in the browser - Arc90 already do | ||
33 | * that extremely well, and for PDF output there's FiveFilters.org's | ||
34 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | ||
35 | * | ||
36 | * Finally, this class contains methods that might be useful for developers | ||
37 | * working on HTML document fragments. So without deviating too much from | ||
38 | * the original code (which I don't want to do because it makes debugging | ||
39 | * and updating more difficult), I've tried to make it a little more | ||
40 | * developer friendly. You should be able to use the methods here on | ||
41 | * existing DOMElement objects without passing an entire HTML document to | ||
42 | * be parsed. | ||
43 | */ | ||
44 | |||
45 | // This class allows us to do JavaScript-like assignments to innerHTML | ||
46 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | ||
47 | |||
48 | // Alternative usage (for testing only!) | ||
49 | // uncomment the lines below and call Readability.php in your browser | ||
50 | // passing it the URL of the page you'd like content from, e.g.: | ||
51 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | ||
52 | |||
53 | /* | ||
54 | if (!isset($_GET['url']) || $_GET['url'] == '') { | ||
55 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | ||
56 | } | ||
57 | $url = $_GET['url']; | ||
58 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | ||
59 | $html = file_get_contents($url); | ||
60 | $r = new Readability($html, $url); | ||
61 | $r->init(); | ||
62 | echo $r->articleContent->innerHTML; | ||
63 | */ | ||
64 | |||
65 | class Readability | ||
66 | { | ||
/** Version of readability.js this port is based on. */
public $version = '1.7.1-without-multi-page';
/** When true, postProcessContent() converts inline links to footnotes. */
public $convertLinksToFootnotes = false;
/** When true, prepArticle() turns 'readability-styled' paragraphs back into text nodes. */
public $revertForcedParagraphElements = true;
/** Article title element; assigned by init(). */
public $articleTitle;
/** Article content element; assigned by init(). */
public $articleContent;
/** DOMDocument built from the HTML passed to the constructor. */
public $dom;
public $url = null; // optional - URL where HTML was retrieved
/** When true, dbg() echoes diagnostic messages. */
public $debug = false;
protected $body = null; // the document's body element, resolved in init()/prepDocument()
protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
protected $success = false; // indicates whether we were able to extract or not

/**
 * All of the regular expressions in use within readability.
 * Defined up here so we don't instantiate them repeatedly in loops.
 **/
public $regexps = array(
    'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
    'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
    'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
    'negative' => '/combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
    'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
    'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
    'replaceFonts' => '/<(\/?)font[^>]*>/i',
    // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
    'normalize' => '/\s{2,}/',
    // NOTE(review): the second alternative had degraded to a bare (possibly
    // non-breaking) space followed by '?', which looks like an HTML-decoded
    // '&nbsp;'. Restored to the entity form used by readability.js - confirm
    // against upstream.
    'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
    // accept both http and https video hosts
    'video' => '/https?:\/\/(www\.)?(youtube|vimeo)\.com/i',
    'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);

/* constants - bit flags combined in $flags */
const FLAG_STRIP_UNLIKELYS = 1;
const FLAG_WEIGHT_CLASSES = 2;
const FLAG_CLEAN_CONDITIONALLY = 4;
103 | |||
/**
 * Create instance of Readability.
 *
 * The raw markup is pre-processed before parsing: runs of two or more <br>
 * tags become paragraph breaks and <font> tags become <span> tags. The
 * result is parsed into $this->dom with JSLikeHTMLElement registered as the
 * element class, so every element supports the innerHTML property.
 *
 * @param string $html UTF-8 encoded HTML
 * @param string $url  (optional) URL associated with HTML (used for footnotes)
 */
function __construct($html, $url=null)
{
    /* Turn all double br's into p's, and font tags into spans */
    /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
    $rewrites = array(
        'replaceBrs' => '</p><p>',
        'replaceFonts' => '<$1span>',
    );
    foreach ($rewrites as $pattern => $replacement) {
        $rewritten = preg_replace($this->regexps[$pattern], $replacement, $html);
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
        // keep the previous markup rather than throwing the document away
        if ($rewritten !== null) {
            $html = $rewritten;
        }
    }

    // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
    // kept as-is so parsing results stay unchanged
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");

    $this->dom = new DOMDocument();
    $this->dom->preserveWhiteSpace = false;
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');

    // On PHP 8 loadHTML('') throws a ValueError, which @ cannot suppress.
    // Skip parsing for empty input: the document then has no root element
    // and prepDocument() reports that with its "No document element" exception.
    if ((string) $html !== '') {
        @$this->dom->loadHTML($html);
    }

    $this->url = $url;
}
122 | |||
/**
 * Accessor for the article title.
 *
 * Returns whatever is currently stored in $articleTitle; init() assigns
 * the generated H1 element to it.
 *
 * @return DOMElement
 */
public function getTitle()
{
    return $this->articleTitle;
}
130 | |||
/**
 * Accessor for the article content.
 *
 * Returns whatever is currently stored in $articleContent; init() assigns
 * the extracted content element to it.
 *
 * @return DOMElement
 */
public function getContent()
{
    return $this->articleContent;
}
138 | |||
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Grab the article title and content from the current DOM tree.
 *  3. Build readability's wrapper structure (overlay > inner > title + content).
 *  4. Replace the body's children with that structure.
 *  5. Read peacefully.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    $this->removeScripts($this->dom);

    // Optimistic default; flipped below if no article can be extracted
    $this->success = true;

    // Remember the first <body> and its markup, unless already set
    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Extract the pieces we want to keep */
    $titleNode = $this->getArticleTitle();
    $contentNode = $this->grabArticle();

    if (!$contentNode) {
        // Nothing usable found: report failure and substitute a placeholder
        $this->success = false;
        $contentNode = $this->dom->createElement('div');
        $contentNode->setAttribute('id', 'readability-content');
        $contentNode->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Build readability's DOM tree and glue it together */
    $inner = $this->dom->createElement('div');
    $inner->setAttribute('id', 'readInner');
    $inner->appendChild($titleNode);
    $inner->appendChild($contentNode);

    $wrapper = $this->dom->createElement('div');
    $wrapper->setAttribute('id', 'readOverlay');
    $wrapper->appendChild($inner);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($wrapper);
    $this->body->removeAttribute('style');

    $this->postProcessContent($contentNode);

    // Expose the results through the public properties
    $this->articleTitle = $titleNode;
    $this->articleContent = $contentNode;

    return $this->success;
}
209 | |||
/**
 * Echo a diagnostic message, but only while $this->debug is enabled.
 *
 * @param string $msg text to print
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }
    echo '* ' . $msg . '<br />' . "\n";
}
216 | |||
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only adds footnotes, and only when
 * $convertLinksToFootnotes is enabled and the source URL does not
 * contain "wikipedia.org".
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    // @ because $this->url may be null
    if (preg_match('/wikipedia\.org/', @$this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
228 | |||
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title> element and tries to
 * strip site-name decoration:
 *  - "Foo | Bar" / "Foo - Bar": keep the part before the last separator,
 *    or the part after the first one if that leaves fewer than 3 words;
 *  - "Foo: Bar": keep the part after the last colon, or after the first
 *    colon if that leaves fewer than 3 words;
 *  - otherwise, if the title is longer than 150 or shorter than 15
 *    characters and the document has exactly one <h1>, use that heading.
 * If the result has 4 words or fewer, the original title is used instead.
 *
 * @return DOMElement a new, detached <h1> holding the title as plain text
 */
protected function getArticleTitle() {
    $curTitle = '';
    $origTitle = '';

    // A document without <title> simply yields an empty title
    $titleElement = $this->dom->getElementsByTagName('title')->item(0);
    if ($titleElement !== null) {
        try {
            $curTitle = $origTitle = $this->getInnerText($titleElement);
        } catch (Exception $e) {
            // best effort: fall through with an empty title
        }
    }

    if (preg_match('/ [\|\-] /', $curTitle))
    {
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    }
    else if (strpos($curTitle, ': ') !== false)
    {
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    }
    else
    {
        // Count characters rather than bytes so multi-byte (UTF-8) titles
        // are measured against the 15/150 thresholds correctly
        $titleLength = mb_strlen($curTitle, 'UTF-8');
        if ($titleLength > 150 || $titleLength < 15)
        {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1)
            {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    // Insert the title as a text node instead of through innerHTML, so that
    // characters such as '&' or '<' are kept literally and not parsed as markup.
    // NOTE(review): assumes getInnerText() returns plain, entity-decoded text - confirm.
    if ((string) $curTitle !== '') {
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));
    }

    return $articleTitle;
}
278 | |||
/**
 * Prepare the HTML document for readability to scrape it.
 *
 * Ensures a body element exists, tags it with the id 'readabilityBody'
 * and removes every <style> element from the document.
 *
 * @throws Exception when the document has no root element
 * @return void
 **/
protected function prepDocument() {
    if ($this->dom->documentElement == null) {
        throw new Exception("No document element");
    }

    /**
     * In some cases a body element can't be found (if the HTML is totally hosed for example)
     * so we create a new body node and append it to the document.
     */
    if ($this->body == null) {
        $freshBody = $this->dom->createElement('body');
        $this->body = $freshBody;
        $this->dom->documentElement->appendChild($freshBody);
    }

    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags, walking the list backwards */
    $styles = $this->dom->getElementsByTagName('style');
    $pos = $styles->length;
    while ($pos-- > 0) {
        $style = $styles->item($pos);
        $style->parentNode->removeChild($style);
    }

    /* The double-br => p and font => span rewrites that readability.js performs
       at this point are done in the constructor for PHP, as that's when we have
       the raw HTML - before it is parsed into a DOM tree. */
}
314 | |||
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * For every link in $articleContent (except those carrying the class
 * 'readability-DoNotFootnote' or whose text matches the 'skipFootnoteLink'
 * pattern) a numbered superscript reference is inserted right after the
 * link, and a matching entry is added to an ordered "References" list.
 * The list is appended to $articleContent only if at least one footnote
 * was created.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent) {
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below show up in it too, and
    // are skipped thanks to their 'readability-DoNotFootnote' class.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++)
    {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        // Decide whether to skip before allocating any nodes for this link
        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        // Copy the link (attributes only) before its style/name are altered below
        $footnoteLink = $articleLink->cloneNode(false);

        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so the
        // "link is the last child" case needs no separate branch
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Label the footnote with the link's title, falling back to its text.
        // Added as a text node so '&', '<' etc. are not parsed as markup.
        $footnoteLabel = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        if ((string) $footnoteLabel !== '') {
            $footnoteLink->appendChild($this->dom->createTextNode($footnoteLabel));
        }
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            // " (host)" suffix, again inserted as text rather than markup
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
381 | |||
/**
 * Replaces every descendant P element whose class attribute is exactly
 * 'readability-styled' with a plain text node holding the paragraph's
 * text content.
 *
 * @param DOMElement $articleContent element whose descendants are processed
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // walk the result set from the end towards the start
    $position = $styled->length;
    while (--$position >= 0) {
        $paragraph = $styled->item($position);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
401 | |||
/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * The order of the clean-up steps matters: the conditional clean-ups of
 * tables, lists and divs run after the unconditional ones because the
 * earlier steps may remove junk that affects them.
 *
 * @param DOMElement $articleContent
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    foreach (array('object', 'h1') as $junkTag) {
        $this->clean($articleContent, $junkTag);
    }

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     ***/
    $subHeadings = $articleContent->getElementsByTagName('h2');
    if ($subHeadings->length == 1) {
        $this->clean($articleContent, 'h2');
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    foreach (array('table', 'ul', 'div') as $containerTag) {
        $this->cleanConditionally($articleContent, $containerTag);
    }

    /* Remove extra paragraphs: no image, embed or object inside, and no text */
    $paragraphs = $articleContent->getElementsByTagName('p');
    $index = $paragraphs->length;
    while (--$index >= 0) {
        $paragraph = $paragraphs->item($index);

        $hasMedia = $paragraph->getElementsByTagName('img')->length !== 0
            || $paragraph->getElementsByTagName('embed')->length !== 0
            || $paragraph->getElementsByTagName('object')->length !== 0;
        if ($hasMedia) {
            continue;
        }

        if ($this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    /* Drop line breaks that sit directly in front of a paragraph */
    try {
        $withoutBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        $articleContent->innerHTML = $withoutBreaks;
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
459 | |||
/**
 * Initialize a node with the readability attribute, which holds its
 * content score. The starting score depends on the tag name and is then
 * adjusted by the class/id weight from getClassWeight().
 *
 * @param DOMElement $node
 * @return void
 **/
protected function initializeNode($node) {
    // starting score per tag; tags not listed here start at 0
    $tagScores = array(
        'DIV' => 5,

        'PRE' => 3,
        'TD' => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS' => -3,
        'OL' => -3,
        'UL' => -3,
        'DL' => -3,
        'DD' => -3,
        'DT' => -3,
        'LI' => -3,
        'FORM' => -3,

        'H1' => -5,
        'H2' => -5,
        'H3' => -5,
        'H4' => -5,
        'H5' => -5,
        'H6' => -5,
        'TH' => -5,
    );

    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // this is our contentScore
    $node->setAttributeNode($scoreAttr);

    // unsure if strtoupper is needed, but using it just in case
    $tag = strtoupper($node->tagName);
    if (isset($tagScores[$tag])) {
        $scoreAttr->value += $tagScores[$tag];
    }

    $scoreAttr->value += $this->getClassWeight($node);
}
506 | |||
507 | /*** | ||
508 | * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is | ||
509 | * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | ||
510 | * | ||
511 | * @return DOMElement | ||
512 | **/ | ||
513 | protected function grabArticle($page=null) { | ||
514 | $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS); | ||
515 | if (!$page) $page = $this->dom; | ||
516 | $allElements = $page->getElementsByTagName('*'); | ||
517 | /** | ||
518 | * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs | ||
519 | * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) | ||
520 | * | ||
521 | * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 | ||
522 | * TODO: Shouldn't this be a reverse traversal? | ||
523 | **/ | ||
524 | $node = null; | ||
525 | $nodesToScore = array(); | ||
526 | for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) { | ||
527 | //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) { | ||
528 | //$node = $targetList->item($nodeIndex); | ||
529 | $tagName = strtoupper($node->tagName); | ||
530 | /* Remove unlikely candidates */ | ||
531 | if ($stripUnlikelyCandidates) { | ||
532 | $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id'); | ||
533 | if ( | ||
534 | preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) && | ||
535 | !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) && | ||
536 | $tagName != 'BODY' | ||
537 | ) | ||
538 | { | ||
539 | $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString); | ||
540 | //$nodesToRemove[] = $node; | ||
541 | $node->parentNode->removeChild($node); | ||
542 | $nodeIndex--; | ||
543 | continue; | ||
544 | } | ||
545 | } | ||
546 | |||
547 | if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') { | ||
548 | $nodesToScore[] = $node; | ||
549 | } | ||
550 | |||
551 | /* Turn all divs that don't have children block level elements into p's */ | ||
552 | if ($tagName == 'DIV') { | ||
553 | if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) { | ||
554 | //$this->dbg('Altering div to p'); | ||
555 | $newNode = $this->dom->createElement('p'); | ||
556 | try { | ||
557 | $newNode->innerHTML = $node->innerHTML; | ||
558 | //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node); | ||
559 | $node->parentNode->replaceChild($newNode, $node); | ||
560 | $nodeIndex--; | ||
561 | $nodesToScore[] = $node; // or $newNode? | ||
562 | } | ||
563 | catch(Exception $e) { | ||
564 | $this->dbg('Could not alter div to p, reverting back to div.: ' . $e); | ||
565 | } | ||
566 | } | ||
567 | else | ||
568 | { | ||
569 | /* EXPERIMENTAL */ | ||
570 | // TODO: change these p elements back to text nodes after processing | ||
571 | for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) { | ||
572 | $childNode = $node->childNodes->item($i); | ||
573 | if ($childNode->nodeType == 3) { // XML_TEXT_NODE | ||
574 | //$this->dbg('replacing text node with a p tag with the same content.'); | ||
575 | $p = $this->dom->createElement('p'); | ||
576 | $p->innerHTML = $childNode->nodeValue; | ||
577 | $p->setAttribute('style', 'display: inline;'); | ||
578 | $p->setAttribute('class', 'readability-styled'); | ||
579 | $childNode->parentNode->replaceChild($p, $childNode); | ||
580 | } | ||
581 | } | ||
582 | } | ||
583 | } | ||
584 | } | ||
585 | |||
586 | /** | ||
587 | * Loop through all paragraphs, and assign a score to them based on how content-y they look. | ||
588 | * Then add their score to their parent node. | ||
589 | * | ||
590 | * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. | ||
591 | **/ | ||
592 | $candidates = array(); | ||
593 | for ($pt=0; $pt < count($nodesToScore); $pt++) { | ||
594 | $parentNode = $nodesToScore[$pt]->parentNode; | ||
595 | // $grandParentNode = $parentNode ? $parentNode->parentNode : null; | ||
596 | $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null); | ||
597 | $innerText = $this->getInnerText($nodesToScore[$pt]); | ||
598 | |||
599 | if (!$parentNode || !isset($parentNode->tagName)) { | ||
600 | continue; | ||
601 | } | ||
602 | |||
603 | /* If this paragraph is less than 25 characters, don't even count it. */ | ||
604 | if(strlen($innerText) < 25) { | ||
605 | continue; | ||
606 | } | ||
607 | |||
608 | /* Initialize readability data for the parent. */ | ||
609 | if (!$parentNode->hasAttribute('readability')) | ||
610 | { | ||
611 | $this->initializeNode($parentNode); | ||
612 | $candidates[] = $parentNode; | ||
613 | } | ||
614 | |||
615 | /* Initialize readability data for the grandparent. */ | ||
616 | if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName)) | ||
617 | { | ||
618 | $this->initializeNode($grandParentNode); | ||
619 | $candidates[] = $grandParentNode; | ||
620 | } | ||
621 | |||
622 | $contentScore = 0; | ||
623 | |||
624 | /* Add a point for the paragraph itself as a base. */ | ||
625 | $contentScore++; | ||
626 | |||
627 | /* Add points for any commas within this paragraph */ | ||
628 | $contentScore += count(explode(',', $innerText)); | ||
629 | |||
630 | /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ | ||
631 | $contentScore += min(floor(strlen($innerText) / 100), 3); | ||
632 | |||
633 | /* Add the score to the parent. The grandparent gets half. */ | ||
634 | $parentNode->getAttributeNode('readability')->value += $contentScore; | ||
635 | |||
636 | if ($grandParentNode) { | ||
637 | $grandParentNode->getAttributeNode('readability')->value += $contentScore/2; | ||
638 | } | ||
639 | } | ||
640 | |||
641 | /** | ||
642 | * After we've calculated scores, loop through all of the possible candidate nodes we found | ||
643 | * and find the one with the highest score. | ||
644 | **/ | ||
645 | $topCandidate = null; | ||
646 | for ($c=0, $cl=count($candidates); $c < $cl; $c++) | ||
647 | { | ||
648 | /** | ||
649 | * Scale the final candidates score based on link density. Good content should have a | ||
650 | * relatively small link density (5% or less) and be mostly unaffected by this operation. | ||
651 | **/ | ||
652 | $readability = $candidates[$c]->getAttributeNode('readability'); | ||
653 | $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c])); | ||
654 | |||
655 | $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value); | ||
656 | |||
657 | if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) { | ||
658 | $topCandidate = $candidates[$c]; | ||
659 | } | ||
660 | } | ||
661 | |||
662 | /** | ||
663 | * If we still have no top candidate, just use the body as a last resort. | ||
664 | * We also have to copy the body node so it is something we can modify. | ||
665 | **/ | ||
666 | if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY') | ||
667 | { | ||
668 | $topCandidate = $this->dom->createElement('div'); | ||
669 | $topCandidate->innerHTML = ($page instanceof DOMDocument) ? $page->saveXML($page->documentElement) : $page->innerHTML; | ||
670 | $page->innerHTML = ''; | ||
671 | $page->appendChild($topCandidate); | ||
672 | $this->initializeNode($topCandidate); | ||
673 | } | ||
674 | |||
675 | /** | ||
676 | * Now that we have the top candidate, look through its siblings for content that might also be related. | ||
677 | * Things like preambles, content split by ads that we removed, etc. | ||
678 | **/ | ||
679 | $articleContent = $this->dom->createElement('div'); | ||
680 | $articleContent->setAttribute('id', 'readability-content'); | ||
681 | $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2); | ||
682 | $siblingNodes = $topCandidate->parentNode->childNodes; | ||
683 | |||
684 | for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++) | ||
685 | { | ||
686 | $siblingNode = $siblingNodes->item($s); | ||
687 | $append = false; | ||
688 | |||
689 | $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : '')); | ||
690 | |||
691 | //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown')); | ||
692 | |||
693 | if ($siblingNode === $topCandidate) | ||
694 | // or if ($siblingNode->isSameNode($topCandidate)) | ||
695 | { | ||
696 | $append = true; | ||
697 | } | ||
698 | |||
699 | $contentBonus = 0; | ||
700 | /* Give a bonus if sibling nodes and top candidates have the example same classname */ | ||
701 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') { | ||
702 | $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2; | ||
703 | } | ||
704 | |||
705 | if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold) | ||
706 | { | ||
707 | $append = true; | ||
708 | } | ||
709 | |||
710 | if (strtoupper($siblingNode->nodeName) == 'P') { | ||
711 | $linkDensity = $this->getLinkDensity($siblingNode); | ||
712 | $nodeContent = $this->getInnerText($siblingNode); | ||
713 | $nodeLength = strlen($nodeContent); | ||
714 | |||
715 | if ($nodeLength > 80 && $linkDensity < 0.25) | ||
716 | { | ||
717 | $append = true; | ||
718 | } | ||
719 | else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) | ||
720 | { | ||
721 | $append = true; | ||
722 | } | ||
723 | } | ||
724 | |||
725 | if ($append) | ||
726 | { | ||
727 | $this->dbg('Appending node: ' . $siblingNode->nodeName); | ||
728 | |||
729 | $nodeToAppend = null; | ||
730 | $sibNodeName = strtoupper($siblingNode->nodeName); | ||
731 | if ($sibNodeName != 'DIV' && $sibNodeName != 'P') { | ||
732 | /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ | ||
733 | |||
734 | $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.'); | ||
735 | $nodeToAppend = $this->dom->createElement('div'); | ||
736 | try { | ||
737 | $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id')); | ||
738 | $nodeToAppend->innerHTML = $siblingNode->innerHTML; | ||
739 | } | ||
740 | catch(Exception $e) | ||
741 | { | ||
742 | $this->dbg('Could not alter siblingNode to div, reverting back to original.'); | ||
743 | $nodeToAppend = $siblingNode; | ||
744 | $s--; | ||
745 | $sl--; | ||
746 | } | ||
747 | } else { | ||
748 | $nodeToAppend = $siblingNode; | ||
749 | $s--; | ||
750 | $sl--; | ||
751 | } | ||
752 | |||
753 | /* To ensure a node does not interfere with readability styles, remove its classnames */ | ||
754 | $nodeToAppend->removeAttribute('class'); | ||
755 | |||
756 | /* Append sibling and subtract from our list because it removes the node when you append to another node */ | ||
757 | $articleContent->appendChild($nodeToAppend); | ||
758 | } | ||
759 | } | ||
760 | |||
761 | /** | ||
762 | * So we have all of the content that we need. Now we clean it up for presentation. | ||
763 | **/ | ||
764 | $this->prepArticle($articleContent); | ||
765 | |||
766 | /** | ||
767 | * Now that we've gone through the full algorithm, check to see if we got any meaningful content. | ||
768 | * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher | ||
769 | * likelihood of finding the content, and the sieve approach gives us a higher likelihood of | ||
770 | * finding the -right- content. | ||
771 | **/ | ||
772 | if (strlen($this->getInnerText($articleContent, false)) < 250) | ||
773 | { | ||
774 | $this->body->innerHTML = $this->bodyCache; | ||
775 | |||
776 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | ||
777 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | ||
778 | return $this->grabArticle($this->body); | ||
779 | } | ||
780 | else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) { | ||
781 | $this->removeFlag(self::FLAG_WEIGHT_CLASSES); | ||
782 | return $this->grabArticle($this->body); | ||
783 | } | ||
784 | else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { | ||
785 | $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY); | ||
786 | return $this->grabArticle($this->body); | ||
787 | } | ||
788 | else { | ||
789 | return false; | ||
790 | } | ||
791 | } | ||
792 | return $articleContent; | ||
793 | } | ||
794 | |||
/**
 * Strip every <script> element found beneath the given node.
 *
 * @param DOMDocument|DOMElement $doc any DOM node exposing getElementsByTagName()
 * @return void
 */
public function removeScripts($doc) {
    $scriptTags = $doc->getElementsByTagName('script');

    // Walk the list from its tail so that detaching a node never shifts
    // the position of an entry we have not visited yet.
    $position = $scriptTags->length;
    while (--$position >= 0) {
        $script = $scriptTags->item($position);
        $script->parentNode->removeChild($script);
    }
}
808 | |||
/**
 * Return the trimmed text content of a node.
 *
 * When $normalizeSpaces is true, every match of the class's 'normalize'
 * pattern is additionally replaced by a single space (the pattern itself
 * is defined elsewhere in the class; presumably it targets runs of
 * whitespace - confirm there).
 *
 * @param DOMElement $e node to read; anything without a non-empty textContent yields ''
 * @param boolean $normalizeSpaces (default: true)
 * @return string
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
832 | |||
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * The text is obtained through getInnerText() with its default
 * space normalisation.
 *
 * @param DOMElement $e
 * @param string $s what to count. Default is ","
 * @return number (integer)
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);

    return substr_count($text, $s);
}
843 | |||
/**
 * Drop the style attribute from every element nested under $e.
 *
 * Only descendants are visited; $e itself is left untouched.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanStyles($e) {
    $descendants = $e->getElementsByTagName('*');

    // Removing an attribute does not change list membership, so a plain
    // forward index walk is safe here.
    for ($idx = 0, $total = $descendants->length; $idx < $total; $idx++) {
        $descendants->item($idx)->removeAttribute('style');
    }
}
856 | |||
/**
 * Compute the share of a node's text that sits inside <a> elements.
 *
 * The result is the summed inner-text length of all descendant links
 * divided by the inner-text length of the node itself. Lengths are
 * measured with strlen(), i.e. in bytes.
 *
 * @param DOMElement $e
 * @return number the ratio, or integer 0 when the node has no text
 */
public function getLinkDensity($e) {
    $totalLength = strlen($this->getInnerText($e));

    $anchoredLength = 0;
    foreach ($e->getElementsByTagName('a') as $anchor) {
        $anchoredLength += strlen($this->getInnerText($anchor));
    }

    return ($totalLength > 0) ? $anchoredLength / $totalLength : 0;
}
878 | |||
/**
 * Score an element by its class and id attributes.
 *
 * Each of the two attributes, when present and non-empty, is tested
 * against the class's 'negative' and 'positive' patterns: a negative
 * match subtracts 25, a positive match adds 25 (both may apply to the
 * same value). Returns 0 straight away when FLAG_WEIGHT_CLASSES is off.
 *
 * @param DOMElement $e
 * @return number (Integer)
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $score = 0;

    /* The class name is inspected first, then the ID, with identical rules. */
    foreach (array('class', 'id') as $attrName) {
        if (!$e->hasAttribute($attrName)) {
            continue;
        }

        $attrValue = $e->getAttribute($attrName);
        if ($attrValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attrValue)) {
            $score -= 25;
        }
        if (preg_match($this->regexps['positive'], $attrValue)) {
            $score += 25;
        }
    }

    return $score;
}
916 | |||
/**
 * Collapse extraneous break tags inside a node.
 *
 * Every match of the class's 'killBreaks' pattern in the node's markup
 * is replaced by a single '<br />', and the result is written back
 * through the node's innerHTML property.
 *
 * @param DOMElement $node
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
928 | |||
/**
 * Remove all descendant elements of type $tag from a node.
 *
 * For 'object' and 'embed' tags an element is spared when either its
 * attribute values or its inner markup match the class's 'video'
 * pattern (per the original notes: YouTube/Vimeo players).
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function clean($e, $tag) {
    $matches = $e->getElementsByTagName($tag);
    $mayBeVideo = ($tag == 'object' || $tag == 'embed');

    /* Go through the list tail-first so removals do not disturb pending entries. */
    $position = $matches->length;
    while (--$position >= 0) {
        $candidate = $matches->item($position);

        if ($mayBeVideo) {
            /* Glue all attribute values together, each followed by a '|'. */
            $joinedValues = '';
            foreach ($candidate->attributes as $attribute) {
                $joinedValues .= $attribute->value . '|';
            }

            /* Keep the element if its attributes, or failing that its contents, look like a video. */
            if (preg_match($this->regexps['video'], $joinedValues)
                || preg_match($this->regexps['video'], $candidate->innerHTML)) {
                continue;
            }
        }

        $candidate->parentNode->removeChild($candidate);
    }
}
962 | |||
/**
 * Remove descendant elements of type $tag that look like non-content.
 *
 * An element is dropped when its class weight plus stored 'readability'
 * score is negative. Otherwise, if it contains fewer than 10 commas, it
 * is dropped when any of the heuristics below fires (image/paragraph
 * ratio, list items, form inputs, text length, link density, embeds).
 * Does nothing when FLAG_CLEAN_CONDITIONALLY is off.
 *
 * @param DOMElement $e
 * @param string $tag
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so nodes can be removed on the fly without
     * affecting the rest of the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($position = $tagsList->length - 1; $position >= 0; $position--) {
        $node = $tagsList->item($position);

        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        /* Comma-rich elements are treated as real content and kept as they are. */
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        /* The offset of 100 means only very long lists can outnumber the paragraphs. */
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        /* Only embeds whose src matches the 'video' pattern are counted. */
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embed) {
            if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                $embedCount++;
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        $toRemove = ($img > $p)
            || ($li > $p && $tag != 'ul' && $tag != 'ol')
            || ($input > floor($p/3))
            || ($contentLength < 25 && ($img === 0 || $img > 2))
            || ($weight < 25 && $linkDensity > 0.2)
            || ($weight >= 25 && $linkDensity > 0.5)
            || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);

        if ($toRemove) {
            $node->parentNode->removeChild($node);
        }
    }
}
1039 | |||
/**
 * Remove spurious h1 and h2 headers from an element.
 *
 * A header is removed when its class weight is negative or when its
 * link density exceeds 0.33.
 *
 * @param DOMElement $e
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        /* Tail-first, so removing a header leaves the pending entries in place. */
        $position = $headers->length;
        while (--$position >= 0) {
            $header = $headers->item($position);
            if ($this->getClassWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1056 | |||
public function flagIsActive($flag) {
    // True when the current flags and $flag share at least one set bit.
    $overlap = $this->flags & $flag;

    return $overlap > 0;
}
1060 | |||
public function addFlag($flag) {
    // Switch on the bits of $flag; bits that are already set stay set.
    $this->flags |= $flag;
}
1064 | |||
public function removeFlag($flag) {
    // Switch off the bits of $flag; all other bits are left as they are.
    $this->flags &= ~$flag;
}
1068 | } | ||
1069 | ?> \ No newline at end of file | ||