diff options
author | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-07 10:41:26 -0700 |
---|---|---|
committer | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-07 10:41:26 -0700 |
commit | 01c0e050ad8eca54f115dfa21db99e4f61ab7ca7 (patch) | |
tree | e1bdacb68b3a56644f4525974844dd954d6e3c6b /inc/3rdparty | |
parent | da2c5d6fc33587c775a7d8a738c2c18de41f83b2 (diff) | |
parent | 339d510fda0a43b08981309f7540acedf3a4976c (diff) | |
download | wallabag-01c0e050ad8eca54f115dfa21db99e4f61ab7ca7.tar.gz wallabag-01c0e050ad8eca54f115dfa21db99e4f61ab7ca7.tar.zst wallabag-01c0e050ad8eca54f115dfa21db99e4f61ab7ca7.zip |
Merge pull request #104 from inthepoche/twig
Twig version on dev branch
Diffstat (limited to 'inc/3rdparty')
-rw-r--r-- | inc/3rdparty/Encoding.php | 262 | ||||
-rw-r--r-- | inc/3rdparty/JSLikeHTMLElement.php | 109 | ||||
-rw-r--r-- | inc/3rdparty/Readability.php | 1137 | ||||
-rw-r--r-- | inc/3rdparty/Session.class.php | 136 | ||||
-rwxr-xr-x | inc/3rdparty/class.messages.php | 231 | ||||
-rw-r--r-- | inc/3rdparty/paginator.php | 202 | ||||
-rw-r--r-- | inc/3rdparty/simple_html_dom.php | 1722 |
7 files changed, 3799 insertions, 0 deletions
diff --git a/inc/3rdparty/Encoding.php b/inc/3rdparty/Encoding.php new file mode 100644 index 00000000..577763b4 --- /dev/null +++ b/inc/3rdparty/Encoding.php | |||
@@ -0,0 +1,262 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * @author "Sebastián Grignoli" <grignoli@framework2.com.ar> | ||
4 | * @package Encoding | ||
5 | * @version 1.1 | ||
6 | * @link http://www.framework2.com.ar/dzone/forceUTF8-es/ | ||
7 | * @example http://www.framework2.com.ar/dzone/forceUTF8-es/ | ||
8 | */ | ||
9 | |||
class Encoding {

    /**
     * Windows-1252 byte values (128-159) that differ from ISO 8859-1,
     * mapped to the UTF-8 encoding of the character they represent.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",

        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",

        142 => "\xc5\xbd",


        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",

        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * UTF-8 sequences produced when Windows-1252 bytes 0x80-0x9F were
     * converted as if they were ISO 8859-1, mapped to the intended UTF-8.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",

        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",

        "\xc2\x8e" => "\xc5\xbd",


        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",

        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * UTF-8 sequences of the Windows-1252 specific characters, mapped back
     * to their single Windows-1252 byte.
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",

        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92" => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86" => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0" => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92" => "\x8c",

        "\xc5\xbd" => "\x8e",


        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c" => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1" => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93" => "\x9c",

        "\xc5\xbe" => "\x9e",
        "\xc5\xb8" => "\x9f"
    );

    /**
     * Function Encoding::toUTF8
     *
     * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
     *
     * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
     *
     * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
     *
     * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these: ("group B")
     *                                  ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
     *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
     *    is also a valid unicode character, and will be left unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
     * 3) when any of these: ðñòó are followed by THREE chars from group B.
     *
     * Arrays are converted element by element (recursively); values that are
     * neither arrays nor strings are returned untouched.
     *
     * @name toUTF8
     * @param string|array|mixed $text Any string, or an array of strings.
     * @return string|array|mixed The same value, UTF8 encoded
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';
        for ($i = 0; $i < $max; $i++) {
            $c1 = $text[$i];
            $o1 = ord($c1);

            if ($o1 >= 0xC0) { // Should be converted to UTF8, if it's not UTF8 already
                // Look ahead up to three bytes; past the end we use NUL, which
                // is never a continuation byte.
                $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
                $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];

                if ($o1 <= 0xDF) { // looks like 2 bytes UTF8
                    if (self::isContinuationByte($c2)) { // almost sure it's UTF8 already
                        $buf .= $c1 . $c2;
                        $i++;
                    } else { // not valid UTF8. Convert it.
                        $buf .= self::singleByteToUtf8($c1);
                    }
                } elseif ($o1 <= 0xEF) { // looks like 3 bytes UTF8
                    if (self::isContinuationByte($c2) && self::isContinuationByte($c3)) {
                        $buf .= $c1 . $c2 . $c3;
                        $i += 2;
                    } else { // not valid UTF8. Convert it.
                        $buf .= self::singleByteToUtf8($c1);
                    }
                } elseif ($o1 <= 0xF7) { // looks like 4 bytes UTF8
                    if (self::isContinuationByte($c2) && self::isContinuationByte($c3) && self::isContinuationByte($c4)) {
                        // Keep all four bytes and skip the three continuation
                        // bytes; dropping $c4 here would let the next iteration
                        // re-encode it as a stray Windows-1252 byte.
                        $buf .= $c1 . $c2 . $c3 . $c4;
                        $i += 3;
                    } else { // not valid UTF8. Convert it.
                        $buf .= self::singleByteToUtf8($c1);
                    }
                } else { // doesn't look like UTF8, but should be converted
                    $buf .= self::singleByteToUtf8($c1);
                }
            } elseif (($o1 & 0xC0) === 0x80) { // 0x80-0xBF outside a sequence: needs conversion
                if (isset(self::$win1252ToUtf8[$o1])) { // found in Windows-1252 special cases
                    $buf .= self::$win1252ToUtf8[$o1];
                } else {
                    $buf .= self::singleByteToUtf8($c1);
                }
            } else { // plain ASCII, it doesn't need conversion
                $buf .= $c1;
            }
        }

        return $buf;
    }

    /**
     * Convert text to Windows-1252 bytes. The input is first normalised with
     * toUTF8(); arrays are converted recursively and non-string values are
     * returned untouched.
     *
     * @param string|array|mixed $text
     * @return string|array|mixed
     */
    public static function toWin1252($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v);
            }
            return $text;
        }

        if (is_string($text)) {
            return self::utf8ToSingleByte(self::toUTF8($text));
        }

        return $text;
    }

    /**
     * Alias of toWin1252().
     *
     * @param string|array|mixed $text
     * @return string|array|mixed
     */
    public static function toISO8859($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Alias of toWin1252().
     *
     * @param string|array|mixed $text
     * @return string|array|mixed
     */
    public static function toLatin1($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Repeatedly decode to a single-byte encoding and re-encode with toUTF8()
     * until the text stops changing, then apply one final pass. Arrays are
     * processed recursively.
     *
     * @param string|array $text
     * @return string|array
     */
    public static function fixUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v);
            }
            return $text;
        }

        $last = '';
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(self::utf8ToSingleByte($text));
        }
        $text = self::toUTF8(self::utf8ToSingleByte($text));

        return $text;
    }

    /**
     * If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
     * (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param string|array $text
     * @return string|array
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strip a leading UTF-8 byte order mark (EF BB BF) if present.
     *
     * @param string $str
     * @return string
     */
    public static function removeBOM($str = "")
    {
        if (strncmp($str, "\xef\xbb\xbf", 3) === 0) {
            $str = substr($str, 3);
        }
        return $str;
    }

    /**
     * Tell whether a single byte is a UTF-8 continuation byte (0x80-0xBF).
     *
     * @param string $byte One-byte string.
     * @return bool
     */
    protected static function isContinuationByte($byte)
    {
        return (ord($byte) & 0xC0) === 0x80;
    }

    /**
     * Encode one byte >= 0x80 as the two-byte UTF-8 sequence of the code
     * point with the same value (i.e. treat it as ISO 8859-1).
     *
     * Uses integer shifts instead of chr(ord() / 64) so that no float is
     * ever passed to chr().
     *
     * @param string $byte One-byte string.
     * @return string Two-byte UTF-8 sequence.
     */
    protected static function singleByteToUtf8($byte)
    {
        $code = ord($byte);
        return chr(0xC0 | ($code >> 6)) . chr(0x80 | ($code & 0x3F));
    }

    /**
     * Map the Windows-1252 specific characters back to their single bytes and
     * decode the remainder with utf8_decode().
     *
     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2; it is kept here
     * so the decoding behaviour stays exactly what callers already rely on.
     *
     * @param string $utf8
     * @return string
     */
    protected static function utf8ToSingleByte($utf8)
    {
        return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $utf8));
    }
}
diff --git a/inc/3rdparty/JSLikeHTMLElement.php b/inc/3rdparty/JSLikeHTMLElement.php new file mode 100644 index 00000000..238ba8a8 --- /dev/null +++ b/inc/3rdparty/JSLikeHTMLElement.php | |||
@@ -0,0 +1,109 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * JavaScript-like HTML DOM Element | ||
4 | * | ||
5 | * This class extends PHP's DOMElement to allow | ||
6 | * users to get and set the innerHTML property of | ||
7 | * HTML elements in the same way it's done in | ||
8 | * JavaScript. | ||
9 | * | ||
10 | * Example usage: | ||
11 | * @code | ||
12 | * require_once 'JSLikeHTMLElement.php'; | ||
13 | * header('Content-Type: text/plain'); | ||
14 | * $doc = new DOMDocument(); | ||
15 | * $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); | ||
16 | * $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>'); | ||
17 | * $elem = $doc->getElementsByTagName('div')->item(0); | ||
18 | * | ||
19 | * // print innerHTML | ||
20 | * echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>' | ||
21 | * echo "\n\n"; | ||
22 | * | ||
23 | * // set innerHTML | ||
24 | * $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>'; | ||
25 | * echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>' | ||
26 | * echo "\n\n"; | ||
27 | * | ||
28 | * // print document (with our changes) | ||
29 | * echo $doc->saveXML(); | ||
30 | * @endcode | ||
31 | * | ||
32 | * @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net | ||
33 | * @see http://fivefilters.org (the project this was written for) | ||
34 | */ | ||
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * All existing children are removed first. The new markup is parsed as
     * XML; if that fails it is re-parsed with the lenient HTML parser. If
     * both attempts fail the element is left empty. Any other property name
     * raises an E_USER_NOTICE.
     *
     * @param string $name  Property being assigned.
     * @param mixed  $value New inner markup (cast to string).
     * @return void
     */
    public function __set($name, $value)
    {
        if ($name !== 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // first, empty the element
        for ($x = $this->childNodes->length - 1; $x >= 0; $x--) {
            $this->removeChild($this->childNodes->item($x));
        }

        // Cast once so that null/false mean "empty" and numbers are treated
        // as their textual form regardless of the PHP version's loose
        // comparison rules.
        $value = (string) $value;
        if ($value === '') {
            return;
        }

        $fragment = $this->ownerDocument->createDocumentFragment();
        // appendXML() expects well-formed markup (XHTML)
        if (@$fragment->appendXML($value)) { // @ to suppress PHP warnings
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }
            return;
        }

        // $value is probably ill-formed
        $htmlDoc = new DOMDocument();
        // Turn every non-ASCII character into a numeric entity so the HTML
        // parser does not have to guess the encoding. This replaces the
        // 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding().
        $encoded = mb_encode_numericentity($value, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        // Using <htmlfragment> will generate a warning, but so will bad HTML
        // (and by this point, bad HTML is what we've got).
        // We use it (and suppress the warning) because an HTML fragment will
        // be wrapped around <html><body> tags which we don't really want to keep.
        // Note: despite the warning, if loadHTML succeeds it will return true.
        if (!@$htmlDoc->loadHTML('<htmlfragment>'.$encoded.'</htmlfragment>')) {
            // oh well, we tried, we really did. :(
            // this element is now empty
            return;
        }

        $wrapper = $htmlDoc->getElementsByTagName('htmlfragment')->item(0);
        if ($wrapper === null) {
            // The parser dropped our wrapper element; nothing to import.
            return;
        }

        foreach ($wrapper->childNodes as $child) {
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * Any other property name raises an E_USER_NOTICE and yields null.
     *
     * @param string $name Property being read.
     * @return string|null Serialized child nodes, or null for unknown properties.
     */
    public function __get($name)
    {
        if ($name === 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }
            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        return null;
    }

    /**
     * @return string The tag name wrapped in square brackets, e.g. "[div]".
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
diff --git a/inc/3rdparty/Readability.php b/inc/3rdparty/Readability.php new file mode 100644 index 00000000..e1e8738b --- /dev/null +++ b/inc/3rdparty/Readability.php | |||
@@ -0,0 +1,1137 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Arc90's Readability ported to PHP for FiveFilters.org | ||
4 | * Based on readability.js version 1.7.1 (without multi-page support) | ||
5 | * Updated to allow HTML5 parsing with html5lib | ||
6 | * Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds | ||
7 | * ------------------------------------------------------ | ||
8 | * Original URL: http://lab.arc90.com/experiments/readability/js/readability.js | ||
9 | * Arc90's project URL: http://lab.arc90.com/experiments/readability/ | ||
10 | * JS Source: http://code.google.com/p/arc90labs-readability | ||
11 | * Ported by: Keyvan Minoukadeh, http://www.keyvan.net | ||
12 | * More information: http://fivefilters.org/content-only/ | ||
13 | * License: Apache License, Version 2.0 | ||
14 | * Requires: PHP5 | ||
15 | * Date: 2012-09-19 | ||
16 | * | ||
17 | * Differences between the PHP port and the original | ||
18 | * ------------------------------------------------------ | ||
19 | * Arc90's Readability is designed to run in the browser. It works on the DOM | ||
20 | * tree (the parsed HTML) after the page's CSS styles have been applied and | ||
21 | * Javascript code executed. This PHP port does not run inside a browser. | ||
22 | * We use PHP's ability to parse HTML to build our DOM tree, but we cannot | ||
23 | * rely on CSS or Javascript support. As such, the results will not always | ||
24 | * match Arc90's Readability. (For example, if a web page contains CSS style | ||
25 | * rules or Javascript code which hide certain HTML elements from display, | ||
26 | * Arc90's Readability will dismiss those from consideration but our PHP port, | ||
27 | * unable to understand CSS or Javascript, will not know any better.) | ||
28 | * | ||
29 | * Another significant difference is that the aim of Arc90's Readability is | ||
30 | * to re-present the main content block of a given web page so users can | ||
31 | * read it more easily in their browsers. Correct identification, clean up, | ||
32 | * and separation of the content block is only a part of this process. | ||
33 | * This PHP port is only concerned with this part, it does not include code | ||
34 | * that relates to presentation in the browser - Arc90 already do | ||
35 | * that extremely well, and for PDF output there's FiveFilters.org's | ||
36 | * PDF Newspaper: http://fivefilters.org/pdf-newspaper/. | ||
37 | * | ||
38 | * Finally, this class contains methods that might be useful for developers | ||
39 | * working on HTML document fragments. So without deviating too much from | ||
40 | * the original code (which I don't want to do because it makes debugging | ||
41 | * and updating more difficult), I've tried to make it a little more | ||
42 | * developer friendly. You should be able to use the methods here on | ||
43 | * existing DOMElement objects without passing an entire HTML document to | ||
44 | * be parsed. | ||
45 | */ | ||
46 | |||
47 | // This class allows us to do JavaScript-like assignments to innerHTML | ||
48 | require_once(dirname(__FILE__).'/JSLikeHTMLElement.php'); | ||
49 | |||
50 | // Alternative usage (for testing only!) | ||
51 | // uncomment the lines below and call Readability.php in your browser | ||
52 | // passing it the URL of the page you'd like content from, e.g.: | ||
53 | // Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php | ||
54 | |||
55 | /* | ||
56 | if (!isset($_GET['url']) || $_GET['url'] == '') { | ||
57 | die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html'); | ||
58 | } | ||
59 | $url = $_GET['url']; | ||
60 | if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url; | ||
61 | $html = file_get_contents($url); | ||
62 | $r = new Readability($html, $url); | ||
63 | $r->init(); | ||
64 | echo $r->articleContent->innerHTML; | ||
65 | */ | ||
66 | |||
67 | class Readability | ||
68 | { | ||
69 | public $version = '1.7.1-without-multi-page'; | ||
70 | public $convertLinksToFootnotes = false; | ||
71 | public $revertForcedParagraphElements = true; | ||
72 | public $articleTitle; | ||
73 | public $articleContent; | ||
74 | public $dom; | ||
75 | public $url = null; // optional - URL where HTML was retrieved | ||
76 | public $debug = false; | ||
77 | public $lightClean = true; // preserves more content (experimental) added 2012-09-19 | ||
78 | protected $body = null; // | ||
79 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | ||
80 | protected $flags = 7; // 1 | 2 | 4; // Start with all flags set. | ||
81 | protected $success = false; // indicates whether we were able to extract or not | ||
82 | |||
83 | /** | ||
84 | * All of the regular expressions in use within readability. | ||
85 | * Defined up here so we don't instantiate them repeatedly in loops. | ||
86 | **/ | ||
87 | public $regexps = array( | ||
88 | 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i', | ||
89 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | ||
90 | 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i', | ||
91 | 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | ||
92 | 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i', | ||
93 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | ||
94 | 'replaceFonts' => '/<(\/?)font[^>]*>/i', | ||
95 | // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() | ||
96 | 'normalize' => '/\s{2,}/', | ||
97 | 'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/', | ||
98 | 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i', | ||
99 | 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' | ||
100 | ); | ||
101 | |||
102 | /* constants */ | ||
103 | const FLAG_STRIP_UNLIKELYS = 1; | ||
104 | const FLAG_WEIGHT_CLASSES = 2; | ||
105 | const FLAG_CLEAN_CONDITIONALLY = 4; | ||
106 | |||
/**
 * Create instance of Readability.
 *
 * The raw HTML is pre-processed (runs of <br> become paragraph breaks,
 * <font> tags become <span>), non-ASCII characters are turned into numeric
 * entities, and the result is parsed into $this->dom.
 *
 * @param string $html   UTF-8 encoded string
 * @param string $url    (optional) URL associated with HTML (used for footnotes)
 * @param string $parser which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 */
public function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;
    /* Turn all double br's into p's */
    $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    // preg_replace() yields null on a PCRE error, hence the cast. Numeric
    // entities replace the 'HTML-ENTITIES' pseudo-encoding of
    // mb_convert_encoding(); the parser decodes both forms identically.
    $html = mb_encode_numericentity((string) $html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
    if (trim($html) == '') $html = '<html></html>';
    if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
        // all good
    } else {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html); // @ to silence warnings about sloppy markup
    }
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130 | |||
/**
 * Accessor for the article title element.
 *
 * @return DOMElement|null the value of $this->articleTitle, which init() assigns
 */
public function getTitle()
{
    $titleElement = $this->articleTitle;

    return $titleElement;
}
138 | |||
/**
 * Accessor for the article content element.
 *
 * @return DOMElement|null the value of $this->articleContent, which init() assigns
 */
public function getContent()
{
    $contentElement = $this->articleContent;

    return $contentElement;
}
146 | |||
/**
 * Runs readability.
 *
 * Workflow:
 *  1. Prep the document by removing script tags, css, etc.
 *  2. Build readability's DOM tree.
 *  3. Grab the article content from the current dom tree.
 *  4. Replace the current DOM tree with the new one.
 *  5. Read peacefully.
 *
 * On completion $this->articleTitle and $this->articleContent are set. When
 * no content could be grabbed, a placeholder "Sorry..." block is used instead.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Optimistic default; flipped below if no article could be grabbed.
    $this->success = true;

    // Remember the first <body> (and its markup) unless already known.
    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    $titleNode = $this->getArticleTitle();
    $contentNode = $this->grabArticle();

    if (!$contentNode) {
        $this->success = false;
        $contentNode = $this->dom->createElement('div');
        $contentNode->setAttribute('id', 'readability-content');
        $contentNode->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    /* Build readability's DOM tree: overlay > inner > (title, content) */
    $inner = $this->dom->createElement('div');
    $inner->setAttribute('id', 'readInner');
    $inner->appendChild($titleNode);
    $inner->appendChild($contentNode);

    $wrapper = $this->dom->createElement('div');
    $wrapper->setAttribute('id', 'readOverlay');
    $wrapper->appendChild($inner);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($wrapper);
    $this->body->removeAttribute('style');

    $this->postProcessContent($contentNode);

    // Expose the results through the public properties
    $this->articleTitle = $titleNode;
    $this->articleContent = $contentNode;

    return $this->success;
}
219 | |||
/**
 * Print a debug message (prefixed with "* ") when $this->debug is enabled.
 *
 * @param string $msg
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }

    echo '* ', $msg, "\n";
}
226 | |||
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only adds footnotes, and only when
 * $this->convertLinksToFootnotes is enabled and the source URL does not
 * contain "wikipedia.org".
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }

    // $this->url defaults to null; cast it instead of hiding the resulting
    // diagnostics behind the @ operator.
    if (preg_match('/wikipedia\.org/', (string) $this->url)) {
        return;
    }

    $this->addFootnotes($articleContent);
}
238 | |||
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title>, tries to strip a site
 * name separated by " | ", " - " or ": ", and falls back to the sole <h1>
 * when the title is implausibly long or short. If the result has four words
 * or fewer, the original <title> text is used instead.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    $curTitle = '';
    $origTitle = '';

    try {
        $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch (Exception $e) {
        // Best effort: carry on with an empty title.
    }

    if (preg_match('/ [\|\-] /', $curTitle)) {
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    } else {
        // The thresholds are character counts, so measure characters rather
        // than bytes; otherwise short multi-byte titles look long enough.
        $titleLength = mb_strlen($curTitle, 'UTF-8');
        if ($titleLength > 150 || $titleLength < 15) {
            $hOnes = $this->dom->getElementsByTagName('h1');
            if ($hOnes->length == 1) {
                $curTitle = $this->getInnerText($hOnes->item(0));
            }
        }
    }

    $curTitle = trim($curTitle);

    // Too few words left after stripping: fall back to the untouched title.
    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    $articleTitle->innerHTML = $curTitle;

    return $articleTitle;
}
288 | |||
/**
 * Prepare the HTML document for readability to scrape it.
 * Makes sure a body element exists, tags it with id "readabilityBody" and
 * removes every <style> element from the document.
 *
 * @return void
 **/
protected function prepDocument()
{
    // Totally hosed HTML may have no body at all; create one and attach it
    // to the document element so the rest of the code has something to use.
    if ($this->body == null) {
        $freshBody = $this->dom->createElement('body');
        $this->body = $freshBody;
        $this->dom->documentElement->appendChild($freshBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    // Drop all style tags, walking the list from the end towards the start.
    $styles = $this->dom->getElementsByTagName('style');
    $idx = $styles->length;
    while ($idx-- > 0) {
        $tag = $styles->item($idx);
        $tag->parentNode->removeChild($tag);
    }

    // The JS original also turns double <br>s into <p>s and <font> into
    // <span> here by rewriting document.body.innerHTML. In this port that is
    // done in the constructor, on the raw HTML, before it is parsed.
}
320 | |||
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * Every link in $articleContent (except those whose class contains
 * 'readability-DoNotFootnote' or whose text matches the skipFootnoteLink
 * pattern) gets a numbered superscript reference inserted right after it and
 * a matching entry in a "References" list appended to $articleContent.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below show up in it too, and
    // are skipped through their 'readability-DoNotFootnote' class.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++) {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        // Decide whether to skip before cloning or creating any nodes.
        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $footnoteLink = $articleLink->cloneNode(true);
        $refLink = $this->dom->createElement('a');
        $footnote = $this->dom->createElement('li');
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        $linkCount++;

        /** Add a superscript reference after the article link */
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() appends when the reference node is null, so this
        // single call also covers the "link is the last child" case without
        // comparing DOM objects with a loose ==.
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Prefer the link's title attribute as the footnote text.
        $footnoteLink->innerHTML = ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . $linkDomain . ')</small>';

        $articleFootnotes->appendChild($footnote);
    }

    // Only attach the references section when at least one footnote exists.
    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387 | |||
/**
 * Turn every <p class="readability-styled"> below $articleContent back into
 * a plain text node carrying the paragraph's text content.
 *
 * @param DOMElement $articleContent subtree to process (modified in place)
 * @return void
 */
function revertReadabilityStyledElements($articleContent) {
    $document = $articleContent->ownerDocument;
    $finder = new DOMXPath($document);
    $styledParagraphs = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Walk from the last match to the first.
    $position = $styledParagraphs->length;
    while (--$position >= 0) {
        $paragraph = $styledParagraphs->item($position);
        $plainText = $document->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407 | |||
/**
 * Tidy the extracted article node before it is shown: strip inline styles,
 * collapse breaks, drop forms/objects/h1/iframes and other junk, and remove
 * paragraphs that hold neither text nor media.
 *
 * @param DOMElement $articleContent article node (modified in place)
 * @return void
 */
function prepArticle($articleContent) {
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Junk removal */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    /* A lone h2 is most likely the page title rather than a sub-heading, so it goes too. */
    if (!$this->lightClean) {
        $h2Count = $articleContent->getElementsByTagName('h2')->length;
        if ($h2Count == 1) {
            $this->clean($articleContent, 'h2');
        }
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* These run last because the steps above may have removed content that affects them. */
    foreach (array('table', 'ul', 'div') as $conditionalTag) {
        $this->cleanConditionally($articleContent, $conditionalTag);
    }

    /* Drop paragraphs that contain no media and no text; iterate backwards since nodes are removed. */
    $paragraphs = $articleContent->getElementsByTagName('p');
    $index = $paragraphs->length;
    while (--$index >= 0) {
        $paragraph = $paragraphs->item($index);

        $holdsMedia = false;
        foreach (array('img', 'embed', 'object', 'iframe') as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $holdsMedia = true;
                break;
            }
        }
        if ($holdsMedia) {
            continue;
        }

        if ($this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    /* Remove a <br> that sits directly in front of a <p>. */
    try {
        $withoutLeadingBreaks = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
        $articleContent->innerHTML = $withoutLeadingBreaks;
    }
    catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466 | |||
/**
 * Attach a 'readability' attribute (the content score) to a node and seed it
 * from the node's tag name plus its class/id weight.
 *
 * @param DOMElement $node element to initialise (modified in place)
 * @return void
 **/
protected function initializeNode($node) {
    $scoreAttr = $this->dom->createAttribute('readability');
    $scoreAttr->value = 0; // starting content score
    $node->setAttributeNode($scoreAttr);

    /* Score adjustment per (upper-cased) tag name; tags not listed get none. */
    $tagAdjustments = array(
        'DIV'        => 5,

        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,

        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,

        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $upperTag = strtoupper($node->tagName);
    if (isset($tagAdjustments[$upperTag])) {
        $scoreAttr->value += $tagAdjustments[$upperTag];
    }

    $scoreAttr->value += $this->getClassWeight($node);
}
513 | |||
/***
 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 *
 * Steps:
 *  1. Walk every element: drop unlikely candidates (when that flag is active), collect P/TD/PRE
 *     nodes for scoring, and turn DIVs without block-level children into P elements.
 *  2. Score each collected node and credit the score to its parent (full) and grandparent (half).
 *  3. Scale candidate scores by link density and pick the highest one.
 *  4. Collect the top candidate and related siblings into a new div and clean it with prepArticle().
 *  5. If fewer than 250 characters of text result, restore the cached body and retry with one
 *     more flag switched off; give up once no flag is left.
 *
 * Readability scores are fractional, so they are always compared as floats.
 *
 * @param DOMNode|null $page node to search; defaults to the whole document
 * @return DOMElement|false the article wrapper, or false when nothing meaningful was found
 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;
    $allElements = $page->getElementsByTagName('*');
    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * $allElements is a live list, so whenever the current node is removed or replaced the index
     * is stepped back to look at whatever now occupies that position.
     **/
    $node = null;
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);
        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    $nodeIndex--;
                    // The replaced div is detached now and is skipped by the scoring loop below;
                    // the new <p> is picked up when the rewound index visits it.
                    $nodesToScore[] = $node;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL: wrap bare text children in inline <p class="readability-styled"> */
                // TODO: change these p elements back to text nodes after processing
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        $p->innerHTML = $childNode->nodeValue;
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    $candidates = array();
    for ($pt=0; $pt < count($nodesToScore); $pt++) {
        $parentNode = $nodesToScore[$pt]->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodesToScore[$pt]);

        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if(strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        $contentScore = 0;

        /* Add a point for the paragraph itself as a base. */
        $contentScore++;

        /* Add one point per comma-separated segment of this paragraph (commas + 1) */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    for ($c=0, $cl=count($candidates); $c < $cl; $c++)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidates[$c]->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));

        $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);

        // Compare as floats: truncating to int would let e.g. 12.5 beat 12.9.
        if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidates[$c];
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // No document element at all: the new div stays empty and detached.
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $topScore = (float)$topCandidate->getAttribute('readability');
    $siblingScoreThreshold = max(10, $topScore * 0.2);

    // The fallback div above may be detached (no parent); then there are no siblings to inspect.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $sl = $siblingNodes ? $siblingNodes->length : 0;

    for ($s=0; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += $topScore * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((float)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    $s--;
                    $sl--;
                }
            } else {
                // Moving the sibling itself shrinks the live sibling list, so step back.
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
        $this->body->innerHTML = $this->bodyCache;

        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
818 | |||
/**
 * Delete every <script> element found below the given node.
 *
 * @param DOMNode $doc document or element to strip (modified in place)
 * @return void
 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');
    // Go from the end of the live list to the start so removals do not shift pending indexes.
    $remaining = $scriptNodes->length;
    while ($remaining-- > 0) {
        $script = $scriptNodes->item($remaining);
        $script->parentNode->removeChild($script);
    }
}
832 | |||
/**
 * Return the trimmed text content of a node, optionally passing it through
 * the 'normalize' regular expression (each match is replaced by one space).
 *
 * @param DOMNode $e node to read
 * @param boolean $normalizeSpaces apply the 'normalize' replacement (default: true)
 * @return string empty string when the node has no text content
 **/
public function getInnerText($e, $normalizeSpaces=true) {
    if (!isset($e->textContent) || $e->textContent == '') {
        return '';
    }

    $trimmed = trim($e->textContent);

    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856 | |||
/**
 * Count how often the string $s occurs in the inner text of node $e.
 *
 * @param DOMElement $e node whose text is searched
 * @param string $s what to count (default: ",")
 * @return integer number of occurrences
 **/
public function getCharCount($e, $s=',') {
    $text = $this->getInnerText($e);
    return substr_count($text, $s);
}
867 | |||
/**
 * Strip the style attribute from every element below $e.
 * Does nothing when $e is not an object.
 *
 * @param DOMElement $e root of the subtree to clean
 * @return void
 */
public function cleanStyles($e) {
    if (!is_object($e)) {
        return;
    }

    $descendants = $e->getElementsByTagName('*');
    for ($idx = 0; $idx < $descendants->length; $idx++) {
        $descendants->item($idx)->removeAttribute('style');
    }
}
881 | |||
/**
 * Compute the share of a node's text that sits inside links:
 * length of all <a> text divided by the length of the node's whole text.
 *
 * @param DOMElement $e node to measure
 * @return number ratio of link text to total text; 0 when the node has no text
 */
public function getLinkDensity($e) {
    $anchors = $e->getElementsByTagName('a');
    $totalLength = strlen($this->getInnerText($e));

    if ($totalLength <= 0) {
        return 0;
    }

    $anchorLength = 0;
    foreach ($anchors as $anchor) {
        $anchorLength += strlen($this->getInnerText($anchor));
    }

    return $anchorLength / $totalLength;
}
903 | |||
/**
 * Weigh an element by its class and id: each of the two attributes loses 25
 * points when it matches the 'negative' pattern and gains 25 when it matches
 * the 'positive' pattern. Returns 0 when FLAG_WEIGHT_CLASSES is not active.
 *
 * @param DOMElement $e element to weigh
 * @return integer resulting weight
 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    /* The class name and the ID are judged by the same rules */
    foreach (array('class', 'id') as $attributeName) {
        if (!$e->hasAttribute($attributeName)) {
            continue;
        }

        $attributeValue = $e->getAttribute($attributeName);
        if ($attributeValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attributeValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attributeValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941 | |||
/**
 * Replace every match of the 'killBreaks' pattern in the node's markup
 * with a single <br />.
 *
 * @param DOMElement $node node whose innerHTML is rewritten
 * @return void
 */
public function killBreaks($node) {
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953 | |||
/**
 * Remove every element of type $tag below $e.
 *
 * For iframe, object and embed tags an element is kept when any of its
 * attribute values, or its inner markup, matches the 'video' pattern.
 *
 * @param DOMElement $e root of the subtree to clean
 * @param string $tag tag name to remove
 * @return void
 */
public function clean($e, $tag) {
    $targetList = $e->getElementsByTagName($tag);
    $isEmbed = in_array($tag, array('iframe', 'object', 'embed'));

    /* Backwards, because elements are removed from the live list while iterating */
    for ($y = $targetList->length; $y-- > 0; ) {
        $target = $targetList->item($y);

        if ($isEmbed) {
            /* Join all attribute values so one pattern test covers them */
            $attributeValues = '';
            foreach ($target->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            $videoInAttributes = preg_match($this->regexps['video'], $attributeValues);
            if ($videoInAttributes || preg_match($this->regexps['video'], $target->innerHTML)) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
989 | |||
/**
 * Clean an element of all tags of type "tag" if they look fishy.
 * "Fishy" is an algorithm based on content length, classnames,
 * link density, number of images & embeds, etc.
 *
 * Does nothing unless FLAG_CLEAN_CONDITIONALLY is active. An element is
 * removed outright when its class weight plus readability score is negative;
 * otherwise, when it holds fewer than 10 commas, it is removed if one of the
 * light-clean or standard-clean rules below matches.
 *
 * The readability score is read as a float so that fractional negative
 * scores are not truncated to zero.
 *
 * @param DOMElement $e root of the subtree to clean
 * @param string $tag tag name to examine
 * @return void
 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);
    $curTagsLength = $tagsList->length;

    /**
     * Gather counts for other typical elements embedded within.
     * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i=$curTagsLength-1; $i >= 0; $i--) {
        $node = $tagsList->item($i);
        $hasScore = $node->hasAttribute('readability');
        $weight = $this->getClassWeight($node);
        $contentScore = $hasScore ? (float)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
        }
        else if ( $this->getCharCount($node, ',') < 10) {
            /**
             * If there are not very many commas, and the number of
             * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
             **/
            $p = $node->getElementsByTagName('p')->length;
            $img = $node->getElementsByTagName('img')->length;
            // The list-item count is offset by 100 before it is compared with the paragraph count.
            $li = $node->getElementsByTagName('li')->length-100;
            $input = $node->getElementsByTagName('input')->length;
            $a = $node->getElementsByTagName('a')->length;

            /* Count only embeds/iframes whose src matches the 'video' pattern */
            $embedCount = 0;
            foreach (array('embed', 'iframe') as $embedTag) {
                $embeds = $node->getElementsByTagName($embedTag);
                for ($ei=0, $il=$embeds->length; $ei < $il; $ei++) {
                    if (preg_match($this->regexps['video'], $embeds->item($ei)->getAttribute('src'))) {
                        $embedCount++;
                    }
                }
            }

            $linkDensity = $this->getLinkDensity($node);
            $contentLength = strlen($this->getInnerText($node));
            $toRemove = false;

            if ($this->lightClean) {
                $this->dbg('Light clean...');
                if ( ($img > $p) && ($img > 4) ) {
                    $this->dbg(' more than 4 images and more image elements than paragraph elements');
                    $toRemove = true;
                } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                    $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
                    $toRemove = true;
                } else if ( $input > floor($p/3) ) {
                    $this->dbg(' too many <input> elements');
                    $toRemove = true;
                } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                    $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images');
                    $toRemove = true;
                } else if($weight < 25 && $linkDensity > 0.2) {
                    $this->dbg(' weight smaller than 25 and link density above 0.2');
                    $toRemove = true;
                } else if($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                    $this->dbg(' more than 2 links and weight above 25 but link density greater than 0.5');
                    $toRemove = true;
                } else if($embedCount > 3) {
                    $this->dbg(' more than 3 embeds');
                    $toRemove = true;
                }
            } else {
                $this->dbg('Standard clean...');
                if ( $img > $p ) {
                    $this->dbg(' more image elements than paragraph elements');
                    $toRemove = true;
                } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                    $this->dbg(' too many <li> elements, and parent is not <ul> or <ol>');
                    $toRemove = true;
                } else if ( $input > floor($p/3) ) {
                    $this->dbg(' too many <input> elements');
                    $toRemove = true;
                } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
                    $this->dbg(' content length less than 25 chars and 0 images, or more than 2 images');
                    $toRemove = true;
                } else if($weight < 25 && $linkDensity > 0.2) {
                    $this->dbg(' weight smaller than 25 and link density above 0.2');
                    $toRemove = true;
                } else if($weight >= 25 && $linkDensity > 0.5) {
                    $this->dbg(' weight above 25 but link density greater than 0.5');
                    $toRemove = true;
                } else if(($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                    $this->dbg(' 1 embed and content length smaller than 75 chars, or more than one embed');
                    $toRemove = true;
                }
            }

            if ($toRemove) {
                $node->parentNode->removeChild($node);
            }
        }
    }
}
1108 | |||
/**
 * Remove h1 and h2 headers below $e that look spurious: those with a
 * negative class weight or a link density above 0.33.
 *
 * @param DOMElement $e root of the subtree to clean
 * @return void
 */
public function cleanHeaders($e) {
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        /* Backwards, since headers are removed from the live list */
        $position = $headers->length;
        while ($position-- > 0) {
            $header = $headers->item($position);
            if ($this->getClassWeight($header) < 0 || $this->getLinkDensity($header) > 0.33) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1125 | |||
/**
 * Tell whether the given flag bit(s) are set in $this->flags.
 *
 * @param integer $flag bit mask to test
 * @return boolean true when the bitwise AND of flags and mask is greater than zero
 */
public function flagIsActive($flag) {
    $matchingBits = $this->flags & $flag;
    return $matchingBits > 0;
}
1129 | |||
/**
 * Switch the given flag bit(s) on in $this->flags.
 *
 * @param integer $flag bit mask to set
 * @return void
 */
public function addFlag($flag) {
    $this->flags |= $flag;
}
1133 | |||
/**
 * Switch the given flag bit(s) off in $this->flags.
 *
 * @param integer $flag bit mask to clear
 * @return void
 */
public function removeFlag($flag) {
    $this->flags &= ~$flag;
}
1137 | } \ No newline at end of file | ||
diff --git a/inc/3rdparty/Session.class.php b/inc/3rdparty/Session.class.php new file mode 100644 index 00000000..3162f507 --- /dev/null +++ b/inc/3rdparty/Session.class.php | |||
@@ -0,0 +1,136 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Session management class | ||
4 | * http://www.developpez.net/forums/d51943/php/langage/sessions/ | ||
5 | * http://sebsauvage.net/wiki/doku.php?id=php:session | ||
6 | * http://sebsauvage.net/wiki/doku.php?id=php:shaarli | ||
7 | * | ||
8 | * Features: | ||
9 | * - Everything is stored on server-side (we do not trust client-side data, | ||
10 | * such as cookie expiration) | ||
11 | * - IP addresses + user agent are checked on each access to prevent session | ||
12 | * cookie hijacking (such as Firesheep) | ||
13 | * - Session expires on user inactivity (the session expiration date is | ||
14 | * automatically updated every time the user accesses a page.) | ||
15 | * - A unique secret key is generated on server-side for this session | ||
16 | * (and never sent over the wire) which can be used | ||
17 | * to sign forms (HMAC) (See $_SESSION['uid'] ) | ||
18 | * - Token management to prevent XSRF attacks. | ||
19 | * | ||
20 | * TODO: | ||
21 | * - log login fail | ||
22 | * - prevent brute force (ban IP) | ||
23 | * | ||
24 | * HOWTOUSE: | ||
25 | * - Just call Session::init(); to initialize session and | ||
26 | * check if connected with Session::isLogged() | ||
27 | */ | ||
28 | |||
/**
 * Static helper around the PHP session: starts it, binds it to a client
 * fingerprint, expires it on inactivity and issues one-time form tokens.
 */
class Session
{
    // If the user does not access any page within this time,
    // his/her session is considered expired (in seconds).
    public static $inactivity_timeout = 3600;
    private static $_instance;

    /**
     * Configure cookie-only sessions and start the session if none is active.
     * Private: use Session::init().
     */
    private function __construct()
    {
        // Use cookies to store session.
        ini_set('session.use_cookies', 1);
        // Force cookies for session (phpsessionID forbidden in URL)
        ini_set('session.use_only_cookies', 1);
        if (!session_id()) {
            // Prevent php to use sessionID in URL if cookies are disabled.
            ini_set('session.use_trans_sid', false);
            // session_start() does not take a session name; the name has to be
            // set through session_name() before the session is started.
            session_name('poche');
            session_start();
        }
    }

    /**
     * Initialize the session once per request.
     *
     * @return void
     */
    public static function init()
    {
        if (!isset(self::$_instance)) {
            self::$_instance = new Session();
        }
    }

    /**
     * Read a $_SERVER entry, falling back to an empty string when it is absent
     * (some clients omit headers such as User-Agent or Accept-Language).
     *
     * @param string $key
     * @return string
     */
    private static function _serverValue($key)
    {
        return isset($_SERVER[$key]) ? (string) $_SERVER[$key] : '';
    }

    /**
     * Build a fingerprint (SHA-1) from the client IP address, forwarding
     * headers, user agent and accepted language.
     * (Used to prevent session cookie hijacking.)
     *
     * @return string 40-character hexadecimal digest
     */
    private static function _allInfos()
    {
        $infos = self::_serverValue('REMOTE_ADDR');
        // No separator before X-Forwarded-For: kept as is so that fingerprints
        // stored in existing sessions still match.
        if (isset($_SERVER['HTTP_X_FORWARDED_FOR'])) {
            $infos .= $_SERVER['HTTP_X_FORWARDED_FOR'];
        }
        if (isset($_SERVER['HTTP_CLIENT_IP'])) {
            $infos .= '_' . $_SERVER['HTTP_CLIENT_IP'];
        }
        $infos .= '_' . self::_serverValue('HTTP_USER_AGENT');
        $infos .= '_' . self::_serverValue('HTTP_ACCEPT_LANGUAGE');
        return sha1($infos);
    }

    /**
     * Check that user/password is correct and init some SESSION variables.
     *
     * @param string $login         expected login
     * @param string $password      expected password (or password hash)
     * @param string $login_test    login supplied by the client
     * @param string $password_test password (or hash) supplied by the client
     * @param array  $pValues       extra key/value pairs stored in $_SESSION
     *                              once the credentials have been accepted
     * @return bool true when the credentials match
     */
    public static function login($login,$password,$login_test,$password_test,
                                 $pValues = array())
    {
        // Compare as strings with ===: a loose == treats numeric-looking
        // strings such as "0e1234" and "0e5678" as equal.
        $loginOk = (string) $login === (string) $login_test;
        $passwordOk = (string) $password === (string) $password_test;
        if (!$loginOk || !$passwordOk) {
            return false;
        }

        // Extra values are stored only for an authenticated user.
        foreach ($pValues as $key => $value) {
            $_SESSION[$key] = $value;
        }
        // generate unique random number to sign forms (HMAC)
        $_SESSION['uid'] = sha1(uniqid('',true).'_'.mt_rand());
        $_SESSION['info'] = Session::_allInfos();
        $_SESSION['username'] = $login;
        // Set session expiration.
        $_SESSION['expires_on'] = time() + Session::$inactivity_timeout;
        return true;
    }

    /**
     * Force logout by removing every authentication-related session entry.
     *
     * @return void
     */
    public static function logout()
    {
        unset(
            $_SESSION['uid'],
            $_SESSION['info'],
            $_SESSION['expires_on'],
            $_SESSION['tokens'],
            $_SESSION['login'],
            $_SESSION['pass'],
            $_SESSION['poche_user'],
            $_SESSION['username'] // set by login(), so cleared here as well
        );
    }

    /**
     * Make sure user is logged in; on success the inactivity timer is reset.
     *
     * @return bool
     */
    public static function isLogged()
    {
        if (!isset($_SESSION['uid'], $_SESSION['info'], $_SESSION['expires_on'])
            || $_SESSION['info'] !== Session::_allInfos()
            || time() >= $_SESSION['expires_on']) {
            Session::logout();
            return false;
        }
        // User accessed a page : Update his/her session expiration date.
        $_SESSION['expires_on'] = time() + Session::$inactivity_timeout;
        return true;
    }

    /**
     * Create a one-time token and remember it on the server side.
     *
     * @return string
     */
    public static function getToken()
    {
        if (!isset($_SESSION['tokens'])) {
            $_SESSION['tokens'] = array();
        }
        // We generate a random string and store it on the server side.
        $rnd = sha1(uniqid('',true).'_'.mt_rand());
        $_SESSION['tokens'][$rnd] = 1;
        return $rnd;
    }

    /**
     * Tells if a token is ok. Using this function will destroy the token.
     *
     * @param mixed $token value received from the client
     * @return bool true if token is ok
     */
    public static function isToken($token)
    {
        // Request data may be an array; only a string can be a valid token.
        if (is_string($token) && isset($_SESSION['tokens'][$token])) {
            unset($_SESSION['tokens'][$token]); // Token is used: destroy it.
            return true; // Token is ok.
        }
        return false; // Wrong token, or already used.
    }
}
diff --git a/inc/3rdparty/class.messages.php b/inc/3rdparty/class.messages.php new file mode 100755 index 00000000..e60bd3a1 --- /dev/null +++ b/inc/3rdparty/class.messages.php | |||
@@ -0,0 +1,231 @@ | |||
1 | <?php | ||
2 | //-------------------------------------------------------------------------------------------------- | ||
3 | // Session-Based Flash Messages v1.0 | ||
4 | // Copyright 2012 Mike Everhart (http://mikeeverhart.net) | ||
5 | // | ||
6 | // Licensed under the Apache License, Version 2.0 (the "License"); | ||
7 | // you may not use this file except in compliance with the License. | ||
8 | // You may obtain a copy of the License at | ||
9 | // | ||
10 | // http://www.apache.org/licenses/LICENSE-2.0 | ||
11 | // | ||
12 | // Unless required by applicable law or agreed to in writing, software | ||
13 | // distributed under the License is distributed on an "AS IS" BASIS, | ||
14 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
15 | // See the License for the specific language governing permissions and | ||
16 | // limitations under the License. | ||
17 | // | ||
18 | //------------------------------------------------------------------------------ | ||
19 | // Description: | ||
20 | //------------------------------------------------------------------------------ | ||
21 | // | ||
22 | // Stores messages in Session data to be easily retrieved later on. | ||
23 | // This class includes four different types of messages: | ||
24 | // - Success | ||
25 | // - Error | ||
26 | // - Warning | ||
27 | // - Information | ||
28 | // | ||
29 | // See README for basic usage instructions, or see samples/index.php for more advanced samples | ||
30 | // | ||
31 | //-------------------------------------------------------------------------------------------------- | ||
32 | // Changelog | ||
33 | //-------------------------------------------------------------------------------------------------- | ||
34 | // | ||
35 | // 2011-05-15 - v1.0 - Initial Version | ||
36 | // | ||
37 | //-------------------------------------------------------------------------------------------------- | ||
38 | |||
/**
 * Session-backed flash messages: queue messages by type in
 * $_SESSION['flash_messages'] and render them later as HTML.
 */
class Messages {

    //-----------------------------------------------------------------------------------------------
    // Class Variables
    //-----------------------------------------------------------------------------------------------
    public $msgId;
    public $msgTypes = array( 'help', 'info', 'warning', 'success', 'error' );
    public $msgClass = 'messages';
    public $msgWrapper = "<div class='%s %s'><a href='#' class='closeMessage'>X</a>\n%s</div>\n";
    public $msgBefore = '<p>';
    public $msgAfter = "</p>\n";


    /**
     * Constructor
     * @author Mike Everhart
     */
    public function __construct() {

        // Generate a unique ID for this user and session
        $this->msgId = md5(uniqid());

        // Create the session array if it doesnt already exist
        if( !isset($_SESSION['flash_messages']) ) $_SESSION['flash_messages'] = array();

    }

    /**
     * Add a message to the queue
     *
     * @author Mike Everhart
     *
     * @param  string   $type           The type of message to add: one of $msgTypes, a one-letter
     *                                  shorthand (h, i, w, e, s) or the legacy name 'information'
     * @param  string   $message        The message (stored as given, it is not escaped)
     * @param  string   $redirect_to    (optional) If set, the user will be redirected to this URL
     * @return bool     false when $type or $message is missing; the script is terminated
     *                  when $type is not a known message type
     *
     */
    public function add($type, $message, $redirect_to=null) {

        if( !isset($type) || !isset($message[0]) ) return false;

        // clear('all') removes the whole queue, so make sure it exists again.
        if( !isset($_SESSION['flash_messages']) || !is_array($_SESSION['flash_messages']) ) {
            $_SESSION['flash_messages'] = array();
        }

        // Replace any shorthand codes with their full version.
        // A lookup table is used because chained replacements rewrite their
        // own output ('h' -> 'help' -> 'herrorlp').
        $shorthand = array(
            'h' => 'help',
            'i' => 'info',
            'w' => 'warning',
            'e' => 'error',
            's' => 'success',
        );
        if( is_string($type) && strlen(trim($type)) == 1 ) {
            $code = trim($type);
            if( isset($shorthand[$code]) ) $type = $shorthand[$code];

        // Backwards compatibility...
        } elseif( $type === 'information' ) {
            $type = 'info';
        }

        // Make sure it's a valid message type
        if( !is_string($type) || !in_array($type, $this->msgTypes, true) ) {
            $shown = is_scalar($type) ? strip_tags((string) $type) : gettype($type);
            die('"' . $shown . '" is not a valid message type!' );
        }

        // If the session array doesn't exist, create it
        if( !isset($_SESSION['flash_messages'][$type]) ) $_SESSION['flash_messages'][$type] = array();

        $_SESSION['flash_messages'][$type][] = $message;

        if( !is_null($redirect_to) ) {
            header("Location: $redirect_to");
            exit();
        }

        return true;

    }

    /**
     * Wrap a list of messages of one type in the configured HTML.
     *
     * @param  string $type      message type, used as a CSS class
     * @param  array  $msgArray  messages to render
     * @return string empty string when there is nothing to render
     */
    private function render($type, $msgArray) {
        if( empty($msgArray) || !is_array($msgArray) ) return '';

        $messages = '';
        foreach( $msgArray as $msg ) {
            $messages .= $this->msgBefore . $msg . $this->msgAfter;
        }
        return sprintf($this->msgWrapper, $this->msgClass, $type, $messages);
    }

    /**
     * Display the queued messages and remove the displayed ones from the queue
     *
     * @author Mike Everhart
     *
     * @param  string   $type     Which messages to display: a message type, 'all',
     *                            or 'g' / 'growl'
     * @param  bool     $print    True  = print the messages on the screen
     * @return mixed    the HTML when $print is false; nothing when it was printed;
     *                  false when there is no queue, the type is invalid or growl
     *                  output is not available; true after growl output
     *
     */
    public function display($type='all', $print=true) {
        $data = '';

        if( !isset($_SESSION['flash_messages']) ) return false;

        if( $type === 'g' || $type === 'growl' ) {
            // This class does not define displayGrowlMessages(); only call it
            // when a subclass provides it.
            if( !method_exists($this, 'displayGrowlMessages') ) return false;
            $this->displayGrowlMessages();
            return true;
        }

        // Print a certain type of message?
        if( in_array($type, $this->msgTypes, true) ) {
            if( !empty($_SESSION['flash_messages'][$type]) ) {
                $data .= $this->render($type, $_SESSION['flash_messages'][$type]);
            }

            // Clear the viewed messages
            $this->clear($type);

        // Print ALL queued messages
        } elseif( $type === 'all' ) {
            foreach( $_SESSION['flash_messages'] as $queuedType => $msgArray ) {
                $data .= $this->render($queuedType, $msgArray);
            }

            // Clear ALL of the messages
            $this->clear();

        // Invalid Message Type?
        } else {
            return false;
        }

        // Print everything to the screen or return the data
        if( $print ) {
            echo $data;
        } else {
            return $data;
        }
    }


    /**
     * Check to see if there are any queued error messages
     *
     * @author Mike Everhart
     *
     * @return bool  true  = There ARE error messages
     *               false = There are NOT any error messages
     *
     */
    public function hasErrors() {
        return !empty($_SESSION['flash_messages']['error']);
    }

    /**
     * Check to see if there are any ($type) messages queued
     *
     * @author Mike Everhart
     *
     * @param  string   $type     The type of messages to check for; null checks every known type
     * @return mixed    with a $type: the (non-empty) array of queued messages, or false;
     *                  without a $type: true or false
     *
     */
    public function hasMessages($type=null) {
        if( !is_null($type) ) {
            if( !empty($_SESSION['flash_messages'][$type]) ) return $_SESSION['flash_messages'][$type];
            return false;
        }

        foreach( $this->msgTypes as $known ) {
            if( !empty($_SESSION['flash_messages'][$known]) ) return true;
        }
        return false;
    }

    /**
     * Clear messages from the session data
     *
     * @author Mike Everhart
     *
     * @param  string   $type     The type of messages to clear, or 'all'
     * @return bool
     *
     */
    public function clear($type='all') {
        if( $type === 'all' ) {
            unset($_SESSION['flash_messages']);
        } else {
            unset($_SESSION['flash_messages'][$type]);
        }
        return true;
    }

    /**
     * String form of "are there queued messages": '1' when there are, '' otherwise.
     * (__toString must return a string, not a bool.)
     */
    public function __toString() { return $this->hasMessages() ? '1' : ''; }

    public function __destruct() {
        // Intentionally empty: queued messages must survive until displayed.
    }


} // end class
231 | ?> \ No newline at end of file | ||
diff --git a/inc/3rdparty/paginator.php b/inc/3rdparty/paginator.php new file mode 100644 index 00000000..306756c0 --- /dev/null +++ b/inc/3rdparty/paginator.php | |||
@@ -0,0 +1,202 @@ | |||
1 | <?php | ||
2 | /* | ||
3 | * PHP Pagination Class | ||
4 | * | ||
5 | * @author David Carr - dave@daveismyname.com - http://www.daveismyname.com | ||
6 | * @version 1.0 | ||
7 | * @date October 20, 2013 | ||
8 | */ | ||
/**
 * Computes the SQL LIMIT clause and the HTML navigation links for a paged list.
 * The current page number is read from $_GET using the configured parameter name.
 */
class Paginator{

    /**
     * number of items per page.
     *
     * @var int
     */
    private $_perPage;

    /**
     * name of the GET parameter that carries the page number
     *
     * @var string
     */
    private $_instance;

    /**
     * current page number (always >= 1).
     *
     * @var int
     */
    private $_page;

    /**
     * limit for the data source (not used by any method of this class)
     *
     * @var string
     */
    private $_limit;

    /**
     * total number of records/items.
     *
     * @var int
     */
    private $_totalRows = 0;



    /**
     * __construct
     *
     * pass values when class is instantiated
     *
     * @param int    $perPage  number of items per page
     * @param string $instance name of the GET parameter holding the page number
     */
    public function __construct($perPage,$instance){
        $this->_instance = $instance;
        $this->_perPage = (int) $perPage;
        $this->set_instance();
    }

    /**
     * get_start
     *
     * creates the starting point (zero-based offset) for limiting the dataset
     * @return int
     */
    private function get_start(){
        return ($this->_page - 1) * $this->_perPage;
    }

    /**
     * set_instance
     *
     * reads the page number from the GET parameter; a missing, zero or
     * negative value falls back to page 1 so the offset is never negative
     *
     * @return void
     */
    private function set_instance(){
        $requested = isset($_GET[$this->_instance]) ? (int) $_GET[$this->_instance] : 1;
        $this->_page = ($requested < 1) ? 1 : $requested;
    }

    /**
     * set_total
     *
     * collects a numeric value and assigns it to the totalRows
     *
     * @param int $_totalRows total number of items in the dataset
     * @return void
     */
    public function set_total($_totalRows){
        $this->_totalRows = $_totalRows;
    }

    /**
     * get_limit
     *
     * returns the LIMIT clause for the data source; the "LIMIT n OFFSET m"
     * form is used when the STORAGE constant equals 'postgres', the
     * "LIMIT m,n" form otherwise (including when STORAGE is not defined)
     *
     * @return string
     */
    public function get_limit(){
        $perPage = (int) $this->_perPage;
        $start = (int) $this->get_start();
        if (defined('STORAGE') && STORAGE == 'postgres') {
            return "LIMIT ".$perPage." OFFSET ".$start;
        }
        return "LIMIT ".$start.",".$perPage;
    }

    /**
     * Build one anchor pointing at the given page.
     *
     * The URL is escaped for use inside a single-quoted HTML attribute;
     * entities that are already present are not encoded a second time.
     *
     * @param string $path  link prefix
     * @param string $ext   extra GET parameters appended after the page number
     * @param int    $page  target page number
     * @param string $label link text (inserted as is)
     * @return string
     */
    private function page_link($path, $ext, $page, $label)
    {
        $url = $path . $this->_instance . '=' . $page . $ext;
        $href = htmlspecialchars($url, ENT_QUOTES, 'UTF-8', false);
        return "<a href='" . $href . "'>" . $label . "</a>";
    }

    /**
     * Build the links for pages $from..$to, marking the current page.
     *
     * @param string $path
     * @param string $ext
     * @param int    $from first page number (inclusive)
     * @param int    $to   last page number (inclusive)
     * @return string
     */
    private function numbered_links($path, $ext, $from, $to)
    {
        $html = "";
        for ($n = $from; $n <= $to; $n++) {
            if ($n == $this->_page)
                $html .= "<span class='current'>$n</span>";
            else
                $html .= $this->page_link($path, $ext, $n, $n);
        }
        return $html;
    }

    /**
     * page_links
     *
     * create the html links for navigating through the dataset
     *
     * @param string $path optionally set the path for the link
     * @param string $ext  optionally pass in extra parameters to the GET
     * @return string returns the html menu, or an empty string when there is
     *                at most one page
     */
    public function page_links($path='?',$ext=null)
    {
        $adjacents = 2;
        $page = $this->_page;

        // Nothing to paginate (also avoids a division by zero).
        $perPage = (int) $this->_perPage;
        if ($perPage < 1) {
            return "";
        }
        $lastpage = (int) ceil(((int) $this->_totalRows) / $perPage);
        if ($lastpage <= 1) {
            return "";
        }
        $lpm1 = $lastpage - 1;

        $pagination = "<div class='pagination'>";
        if ($page > 1)
            $pagination .= $this->page_link($path, $ext, $page - 1, "« previous");
        else
            $pagination .= "<span class='disabled'>« previous</span>";

        if ($lastpage < 7 + ($adjacents * 2))
        {
            // Few pages: list them all.
            $pagination .= $this->numbered_links($path, $ext, 1, $lastpage);
        }
        elseif ($page < 1 + ($adjacents * 2))
        {
            // Near the beginning: first pages, then the last two.
            $pagination .= $this->numbered_links($path, $ext, 1, 3 + ($adjacents * 2));
            $pagination .= "...";
            $pagination .= $this->page_link($path, $ext, $lpm1, $lpm1);
            $pagination .= $this->page_link($path, $ext, $lastpage, $lastpage);
        }
        elseif ($lastpage - ($adjacents * 2) > $page && $page > ($adjacents * 2))
        {
            // In the middle: first two, a window around the current page, last two.
            $pagination .= $this->page_link($path, $ext, 1, 1);
            $pagination .= $this->page_link($path, $ext, 2, 2);
            $pagination .= "...";
            $pagination .= $this->numbered_links($path, $ext, $page - $adjacents, $page + $adjacents);
            $pagination .= "..";
            $pagination .= $this->page_link($path, $ext, $lpm1, $lpm1);
            $pagination .= $this->page_link($path, $ext, $lastpage, $lastpage);
        }
        else
        {
            // Near the end: first two, then the last pages.
            $pagination .= $this->page_link($path, $ext, 1, 1);
            $pagination .= $this->page_link($path, $ext, 2, 2);
            $pagination .= "..";
            $pagination .= $this->numbered_links($path, $ext, $lastpage - (2 + ($adjacents * 2)), $lastpage);
        }

        // "next" is available while the current page is before the last one.
        if ($page < $lastpage)
            $pagination .= $this->page_link($path, $ext, $page + 1, "next »");
        else
            $pagination .= "<span class='disabled'>next »</span>";
        $pagination .= "</div>\n";

        return $pagination;
    }
}
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php new file mode 100644 index 00000000..43b94e57 --- /dev/null +++ b/inc/3rdparty/simple_html_dom.php | |||
@@ -0,0 +1,1722 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Website: http://sourceforge.net/projects/simplehtmldom/ | ||
4 | * Additional projects that may be used: http://sourceforge.net/projects/debugobject/ | ||
5 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) | ||
6 | * Contributions by: | ||
7 | * Yousuke Kumakura (Attribute filters) | ||
8 | * Vadim Voituk (Negative indexes supports of "find" method) | ||
9 | * Antcs (Constructor with automatically load contents either text or file/url) | ||
10 | * | ||
11 | * all affected sections have comments starting with "PaperG" | ||
12 | * | ||
13 | * Paperg - Added case insensitive testing of the value of the selector. | ||
14 | * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately. | ||
15 | * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source, | ||
16 | * it will almost always be smaller by some amount. | ||
17 | * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from. | ||
18 | * but for most purposes, it's a really good estimation. | ||
19 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. | ||
20 | * Allow the user to tell us how much they trust the html. | ||
21 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. | ||
22 | * This allows for us to find tags based on the text they contain. | ||
23 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. | ||
24 | * Paperg: added parse_charset so that we know about the character set of the source document. | ||
25 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the | ||
26 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. | ||
27 | * | ||
28 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. | ||
29 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. | ||
30 | * | ||
31 | * Licensed under The MIT License | ||
32 | * Redistributions of files must retain the above copyright notice. | ||
33 | * | ||
34 | * @author S.C. Chen <me578022@gmail.com> | ||
35 | * @author John Schlick | ||
36 | * @author Rus Carroll | ||
37 | * @version 1.5 ($Rev: 202 $) | ||
38 | * @package PlaceLocalInclude | ||
39 | * @subpackage simple_html_dom | ||
40 | */ | ||
41 | |||
/**
 * Global constants shared by the helper functions and classes of this file.
 * @author S.C. Chen <me578022@gmail.com>
 */

// Node types.
const HDOM_TYPE_ELEMENT = 1;
const HDOM_TYPE_COMMENT = 2;
const HDOM_TYPE_TEXT = 3;
const HDOM_TYPE_ENDTAG = 4;
const HDOM_TYPE_ROOT = 5;
const HDOM_TYPE_UNKNOWN = 6;

// Attribute quoting styles.
const HDOM_QUOTE_DOUBLE = 0;
const HDOM_QUOTE_SINGLE = 1;
const HDOM_QUOTE_NO = 3;

// Keys of a node's "info" array ($node->_).
const HDOM_INFO_BEGIN = 0;
const HDOM_INFO_END = 1;
const HDOM_INFO_QUOTE = 2;
const HDOM_INFO_SPACE = 3;
const HDOM_INFO_TEXT = 4;
const HDOM_INFO_INNER = 5;
const HDOM_INFO_OUTER = 6;
const HDOM_INFO_ENDSPACE = 7;

// Defaults used by the helper functions.
const DEFAULT_TARGET_CHARSET = 'UTF-8';
const DEFAULT_BR_TEXT = "\r\n";
const DEFAULT_SPAN_TEXT = " ";
const MAX_FILE_SIZE = 600000;
67 | // helper functions | ||
68 | // ----------------------------------------------------------------------------- | ||
/**
 * Get an html dom from a file or URL.
 *
 * @param string   $url              file name or URL handed to file_get_contents()
 * @param bool     $use_include_path forwarded to file_get_contents()
 * @param resource $context          stream context, or null
 * @param int      $offset           where reading starts; a negative value (the default)
 *                                   means "from the beginning"
 * @param int      $maxLen           maximum number of bytes to read; a negative value
 *                                   (the default) means "no limit"
 * @param bool     $lowercase        forwarded to simple_html_dom
 * @param bool     $forceTagsClosed  forwarded to simple_html_dom
 * @param string   $target_charset   forwarded to simple_html_dom
 * @param bool     $stripRN          forwarded to simple_html_dom
 * @param string   $defaultBRText    forwarded to simple_html_dom
 * @param string   $defaultSpanText  forwarded to simple_html_dom
 * @return simple_html_dom|false false when nothing could be read or the content is
 *                               larger than MAX_FILE_SIZE bytes
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // Recent PHP versions interpret a negative offset as "count from the end
    // of the stream", so the -1 default must not reach file_get_contents().
    $start = ($offset === null || $offset < 0) ? 0 : (int) $offset;

    // Only pass a length when the caller asked for one; a negative length is
    // not accepted by file_get_contents().
    if ($maxLen !== null && $maxLen >= 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $start, (int) $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $start);
    }

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Same clean-up as str_get_html() on failure.
        $dom->clear();
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87 | |||
/**
 * Get an html dom from a string.
 *
 * Returns false (after clearing the freshly created dom) when $str is empty
 * or longer than MAX_FILE_SIZE bytes; otherwise the loaded dom.
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($usable)
    {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    $dom->clear();
    return false;
}
100 | |||
/**
 * Dump an html dom tree by delegating to the node's own dump() method.
 *
 * @param object $node      node whose tree is printed
 * @param bool   $show_attr whether attributes are printed
 * @param int    $deep      starting depth (indentation level)
 * @return void
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the options; passing $node itself would always enable
    // attributes and ignore the requested depth.
    $node->dump($show_attr, $deep);
}
106 | |||
107 | |||
108 | /** | ||
109 | * simple html dom node | ||
110 | * PaperG - added ability for "find" routine to lowercase the value of the selector. | ||
111 | * PaperG - added $tag_start to track the start position of the tag in the total byte index | ||
112 | * | ||
113 | * @package PlaceLocalInclude | ||
114 | */ | ||
115 | class simple_html_dom_node | ||
116 | { | ||
117 | public $nodetype = HDOM_TYPE_TEXT; | ||
118 | public $tag = 'text'; | ||
119 | public $attr = array(); | ||
120 | public $children = array(); | ||
121 | public $nodes = array(); | ||
122 | public $parent = null; | ||
123 | // The "info" array - see HDOM_INFO_... for what each element contains. | ||
124 | public $_ = array(); | ||
125 | public $tag_start = 0; | ||
126 | private $dom = null; | ||
127 | |||
/**
 * Register the new node in $dom's node list and remember $dom as its owner.
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133 | |||
/**
 * Drop the node's references (see clear()) when the object is destroyed.
 */
function __destruct()
{
    $this->clear();
}
138 | |||
/**
 * String form of the node: whatever outertext() returns.
 */
function __toString()
{
    $markup = $this->outertext();
    return $markup;
}
143 | |||
/**
 * Clean up memory due to the php5 circular references memory leak:
 * null out every link to the dom, the parent and the child lists.
 */
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
152 | |||
/**
 * Dump the node's tree: print the tag (optionally with its attributes) on one
 * line, then every entry of $this->nodes one level deeper.
 */
function dump($show_attr=true, $deep=0)
{
    echo str_repeat(' ', $deep) . $this->tag;

    $withAttributes = $show_attr && count($this->attr) > 0;
    if ($withAttributes)
    {
        echo '(';
        foreach (array_keys($this->attr) as $name)
        {
            echo "[$name]=>\"" . $this->$name . '", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes)
    {
        return;
    }
    $nextDepth = $deep + 1;
    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $nextDepth);
    }
}
176 | |||
177 | |||
/**
 * Debugging function to dump a single dom node with a bunch of information about it:
 * tag, attributes, the "info" array, text, inner info, child/node counts and tag_start.
 *
 * @param bool $echo true = print the description, false = return it
 * @return string|null the description when $echo is false
 */
function dump_node($echo=true)
{

    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // The inner info belongs to this node; there is no $node variable in
    // this method, so reading it always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    else
    {
        return $string;
    }
}
242 | |||
/**
 * Returns the parent of the node.
 * When a node is passed in, it becomes the new parent and this node is
 * appended to that parent's nodes and children lists.
 *
 * Known limitation (noted by the original authors): the node is not removed
 * from the lists of its previous parent first.
 */
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $this->parent;
}
258 | |||
/**
 * Verify that the node has children.
 *
 * @return bool
 */
function has_child()
{
    if (empty($this->children))
    {
        return false;
    }
    return true;
}
264 | |||
/**
 * Returns the children of the node.
 *
 * With the default index (-1) the whole children list is returned; otherwise
 * the child stored at $idx, or null when there is none.
 */
function children($idx=-1)
{
    if ($idx !== -1)
    {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
275 | |||
/**
 * Returns the first child of this node.
 *
 * @return mixed The child at index 0, or null when there are no children.
 */
function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
285 | |||
/**
 * Returns the last child of this node.
 *
 * @return mixed The child at index count-1, or null when there are no children.
 */
function last_child()
{
    $total = count($this->children);
    if ($total > 0)
    {
        $lastIndex = $total - 1;
        return $this->children[$lastIndex];
    }
    return null;
}
295 | |||
/**
 * Returns the sibling that follows this node in its parent's children list.
 *
 * @return mixed The next sibling, or null when there is no parent, this node
 *               is the last child, or this node is not in the children list.
 */
function next_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate our own position; $position ends at $total when we are not listed.
    for ($position = 0; $position < $total; ++$position)
    {
        if ($this === $siblings[$position]) break;
    }

    $position += 1;
    return ($position >= $total) ? null : $siblings[$position];
}
316 | |||
/**
 * Returns the sibling that precedes this node in its parent's children list.
 *
 * @return mixed The previous sibling, or null when there is no parent, this
 *               node is the first child, or this node is not in the parent's
 *               children list at all.
 */
function prev_sibling()
{
    if ($this->parent === null) return null;

    $siblings = $this->parent->children;
    $count = count($siblings);

    $idx = 0;
    while ($idx<$count && $this!==$siblings[$idx])
        ++$idx;

    // $idx === $count means we were not found: without this check the
    // decrement below would wrongly yield the parent's last child
    // (or index -1 on an empty list).
    if ($idx>=$count || $idx===0) return null;

    return $siblings[$idx-1];
}
328 | |||
/**
 * Locates the nearest node with the given tag on the path from this node to
 * the root. The search starts with this node itself.
 *
 * @param string $tag Tag name to look for (compared with ==).
 * @return mixed The matching node, or null when the parent chain ends without a match.
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            return $candidate;
        }
    }

    // Ran off the top of the tree without finding the tag.
    return null;
}
350 | |||
/**
 * Returns the inner html of this node.
 *
 * An explicitly stored inner value wins; otherwise a stored text value is
 * returned with noise restored; otherwise the outertext of every entry in
 * $this->nodes is concatenated.
 *
 * @return mixed
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $childNode)
    {
        $html .= $childNode->outertext();
    }
    return $html;
}
362 | |||
/**
 * Returns the outer text of this node (its markup including the tag itself).
 *
 * Order of precedence: the root node delegates to innertext(); a stored
 * outer value or stored text value is returned as-is (text with noise
 * restored); otherwise the begin tag, the inner content and, when the node
 * has a non-zero end marker, the closing tag are assembled.
 * The dom callback, if one is set, is invoked with this node before rendering.
 *
 * @return mixed
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debugLog(1, 'Outertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    // Guarded with isset/empty so a node without a begin index (or whose
    // index is missing from the dom's node list) renders no begin tag
    // instead of raising undefined-index notices.
    $ret = "";
    if ($this->dom && isset($this->_[HDOM_INFO_BEGIN]) && !empty($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
422 | |||
/**
 * Returns the plain text of this node.
 *
 * A stored inner value is returned directly. Text nodes return their text
 * with noise restored; comment and unknown nodes, and script/style tags,
 * yield an empty string. Otherwise the converted text of every entry in
 * $this->nodes is concatenated.
 *
 * @return mixed
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // $this->nodes has been observed to be NULL for some element nodes
    // (cause unknown), so there is nothing to walk in that case.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $childNode)
    {
        $plain .= $this->convert_text($childNode->text());
    }

    // Spans get the dom's configured span text appended so that adjacent
    // spans do not run into each other in the plain-text output.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
457 | |||
/**
 * Returns the inner text with CDATA wrappers removed: every '<![CDATA['
 * (matched case-insensitively) and every ']]>' is stripped.
 *
 * @return string
 */
function xmltext()
{
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $withoutOpeners);
}
465 | |||
/**
 * Builds the opening tag text of this node, including its attributes.
 *
 * Text-like nodes (those with a stored text value) return that text with
 * noise restored. Attributes whose value is null or false are skipped,
 * a value of true renders the bare attribute name, and every other value is
 * rendered as name=value using the spacing and quote style recorded for
 * that attribute's position.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<'.$this->tag;
    $index = 0;

    foreach ($this->attr as $name=>$value)
    {
        // Spacing/quote info is stored by attribute position, so the
        // position advances even for attributes that are skipped.
        $slot = $index++;

        // skip removed attribute
        if ($value===null || $value===false)
        {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$slot][0];

        // no value attr: nowrap, checked selected...
        if ($value===true)
        {
            $markup .= $name;
            continue;
        }

        $quoteStyle = $this->_[HDOM_INFO_QUOTE][$slot];
        if ($quoteStyle == HDOM_QUOTE_DOUBLE)
        {
            $quote = '"';
        }
        elseif ($quoteStyle == HDOM_QUOTE_SINGLE)
        {
            $quote = '\'';
        }
        else
        {
            $quote = '';
        }

        $markup .= $name.$this->_[HDOM_INFO_SPACE][$slot][1].'='.$this->_[HDOM_INFO_SPACE][$slot][2].$quote.$value.$quote;
    }

    $markup = $this->dom->restore_noise($markup);
    return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
500 | |||
/**
 * Finds elements below this node by css selector.
 *
 * @param string   $selector  Selector string; comma separated groups are searched independently.
 * @param int|null $idx       null returns every match as an array; otherwise the match at that
 *                            position is returned (negative values count from the end).
 * @param bool     $lowercase Passed through to seek() to request case-insensitive
 *                            comparison of selector values.
 * @return mixed Array of nodes, a single node, or null when $idx does not exist.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups)===0) return array();

    $matchedKeys = array();

    // find each selector
    foreach ($groups as $group)
    {
        $depth = count($group);
        if ($depth===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        // Keys are node indexes in the dom; the search starts from this node.
        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors level by level, without recursion
        for ($level=0; $level<$depth; ++$level)
        {
            $hits = array();
            foreach ($frontier as $nodeIndex=>$unused)
            {
                $start = ($nodeIndex===-1) ? $this->dom->root : $this->dom->nodes[$nodeIndex];
                $start->seek($group[$level], $hits, $lowercase);
            }
            $frontier = $hits;
        }

        foreach ($frontier as $nodeIndex=>$unused)
        {
            $matchedKeys[$nodeIndex] = 1;
        }
    }

    // sort keys
    ksort($matchedKeys);

    $found = array();
    foreach ($matchedKeys as $nodeIndex=>$unused)
    {
        $found[] = $this->dom->nodes[$nodeIndex];
    }

    // return nth-element or array
    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
551 | |||
/**
 * Collects the nodes below this node that satisfy one parsed selector step.
 *
 * @param array $selector  One selector step as produced by parse_selector():
 *                         array($tag, $key, $val, $exp, $no_key).
 * @param array $ret       Passed by reference; the dom index of every matching node
 *                         is added as a key (value 1). This is the function's real output.
 * @param bool  $lowercase When true, selector values are compared case-insensitively.
 * @return void
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: a numeric key selects the n-th child with the given tag
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Work out the index at which this node's range ends. If the node has no
    // end of its own, borrow it from the nearest ancestor that has one.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        // Test for null first so a missing ancestor is never dereferenced.
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // No ancestor with an end marker: $end stays <= 0 and nothing is scanned.
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // bare '*' matches direct children only
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // A "plaintext" key searches the plain text of the node rather than an attribute.
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Leading, trailing or double spaces produce blank tokens; skip those.
                    // Compared against '' (not empty()) so that a class named "0" is still tested.
                    if ($k !== '') {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
}
650 | |||
/**
 * Tests a node value against a selector value using the selector operator.
 *
 * '=' and '!=' compare strictly; '^=' and '$=' test for a quoted prefix or
 * suffix; '*=' treats a pattern starting with '/' as a complete regular
 * expression and otherwise wraps the pattern, unquoted, in a
 * case-insensitive regular expression.
 *
 * @param string $exp     Operator: '=', '!=', '^=', '$=' or '*='.
 * @param string $pattern Value taken from the selector.
 * @param string $value   Value taken from the node.
 * @return mixed Boolean for '=' / '!=', the preg_match() result for the other
 *               operators, false for an unknown operator.
 */
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        $regex = "/^".preg_quote($pattern,'/')."/";
        return preg_match($regex, $value);
    }
    if ($exp == '$=') {
        $regex = "/".preg_quote($pattern,'/')."$/";
        return preg_match($regex, $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
672 | |||
/**
 * Parses a css-like selector string into groups of selector steps.
 *
 * @param string $selector_string Selector text; ',' separates independent groups,
 *                                blanks and '/' separate the steps within a group.
 * @return array List of groups; every group is a list of steps shaped
 *               array($tag, $key, $val, $exp, $no_key).
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    // Pattern of CSS selectors, modified from mootools.
    // Paperg: the colon is allowed in attribute names so that <tag attr:ibute="something"> is found.
    // Note: such an attribute must be read with getAttribute(), since $dom->x:y is not valid PHP.
    // The "\[@?" part lets an attribute specifier start with an '@' sign that is not captured;
    // whether that should be documented or removed is still undecided.
    // The hyphens in "[\w\-:\*]" and "[\w\-:]" are escaped on purpose: an unescaped "\w-:" is
    // rejected by PCRE2 (PHP 7.3+) with "invalid range in character class".
    $pattern = "/([\w\-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        // (the cast keeps the previous result for a missing key, an empty string,
        // without passing null to strtolower())
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // a comma closes the current group
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
720 | |||
/**
 * Magic getter: exposes attributes and the virtual text properties.
 *
 * A set attribute is returned (charset-converted) first. Otherwise
 * 'outertext', 'innertext', 'plaintext' and 'xmltext' map to the
 * corresponding method. Any other name yields a boolean telling whether the
 * attribute key exists.
 *
 * @param string $name
 * @return mixed
 */
function __get($name) {
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }
    return array_key_exists($name, $this->attr);
}
734 | |||
/**
 * Magic setter: writes the virtual text properties or an attribute.
 *
 * 'outertext' stores the outer value; 'innertext' overwrites the stored text
 * when the node has one and the inner value otherwise. Any other name is
 * stored as an attribute; an attribute that is not currently set also gets
 * default spacing and double-quote info appended.
 *
 * @param string $name
 * @param mixed  $value
 * @return mixed The assigned value for 'outertext'/'innertext', nothing otherwise.
 */
function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNewAttribute = !isset($this->attr[$name]);
    if ($isNewAttribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
748 | |||
/**
 * Magic isset: the virtual properties 'outertext', 'innertext' and
 * 'plaintext' always exist; anything else exists when it is a key of the
 * attribute list (which also covers value-less attributes such as nowrap,
 * checked or selected).
 *
 * @param string $name
 * @return bool
 */
function __isset($name) {
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }
    if (array_key_exists($name, $this->attr)) {
        return true;
    }
    return isset($this->attr[$name]);
}
758 | |||
/**
 * Magic unset: removes the attribute when it is currently set.
 *
 * @param string $name
 */
function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }
    unset($this->attr[$name]);
}
763 | |||
/**
 * PaperG - Converts text from the dom's source charset to its target charset
 * when both are known and differ.
 *
 * When the target is UTF-8 and the text already passes is_utf8(), the text
 * is left unconverted on the assumption that the reported source charset was
 * wrong. For a UTF-8 target a byte order mark at the start and/or end of the
 * result is stripped.
 *
 * @param string $text
 * @return mixed The converted text (whatever iconv() returned when a conversion ran).
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    $from = "";
    $to = "";
    if ($this->dom)
    {
        $from = strtoupper($this->dom->_charset);
        $to = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $from . " target charaset: " . $to);}

    $result = $text;

    $charsetsDiffer = !empty($from) && !empty($to) && (strcasecmp($from, $to) != 0);
    if ($charsetsDiffer)
    {
        // The reported encoding may be wrong and the text already UTF-8.
        $alreadyUtf8 = (strcasecmp($to, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8)
        {
            $result = iconv($from, $to, $text);
        }
    }

    // Never hand out UTF-8 text that still carries a BOM.
    if ($to == 'UTF-8')
    {
        $bom = "\xef\xbb\xbf";
        if (substr($result, 0, 3) == $bom)
        {
            $result = substr($result, 3);
        }
        if (substr($result, -3) == $bom)
        {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
810 | |||
/**
 * Returns true if $str is structurally well-formed UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte followed by the right number of
 * continuation bytes (0x80-0xBF). This is a structural check only: the
 * legacy 5- and 6-byte lead forms are still accepted and overlong encodings
 * are not rejected.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    $len=strlen($str);
    for($i=0; $i<$len; $i++)
    {
        $c=ord($str[$i]);
        // Plain ASCII byte. Note that 128 (0x80) itself is NOT ASCII: it is a
        // continuation byte and must be rejected when it appears as a lead.
        if($c < 128) continue;

        if($c >= 254) return false;
        elseif($c >= 252) $bits=6;
        elseif($c >= 248) $bits=5;
        elseif($c >= 240) $bits=4;
        elseif($c >= 224) $bits=3;
        elseif($c >= 192) $bits=2;
        else return false; // 128..191: continuation byte without a lead byte

        // sequence would run past the end of the string
        if(($i+$bits) > $len) return false;

        // every remaining byte of the sequence must be a continuation byte
        while($bits > 1)
        {
            $i++;
            $b=ord($str[$i]);
            if($b < 128 || $b > 191) return false;
            $bits--;
        }
    }
    return true;
}
846 | /* | ||
847 | function is_utf8($string) | ||
848 | { | ||
849 | //this is buggy | ||
850 | return (utf8_encode(utf8_decode($string)) == $string); | ||
851 | } | ||
852 | */ | ||
853 | |||
/**
 * Tries to determine the displayed size of an image from its own markup.
 * Only works on an IMG tag; every other tag yields false.
 *
 * The width/height attributes are used when present. For a dimension that is
 * still unknown, the inline style is consulted: its value is used when it
 * ends in "px" and the part before "px" passes integer validation.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return mixed false for non-img tags, otherwise array('height' => ..., 'width' => ...)
 *               where a dimension that could not be determined is -1.
 */
function get_display_size()
{
    global $debug_object;

    if ($this->tag !== 'img')
    {
        return false;
    }

    $dimensions = array('width', 'height');
    $size = array('width' => -1, 'height' => -1);

    // Explicit width/height attributes on the tag itself come first.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Then fall back to the inline style for whatever is still unknown.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $declaration) {
            $declarations[$declaration[1]] = $declaration[2];
        }

        foreach ($dimensions as $dimension)
        {
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // only pixel values are understood
            if (strtolower(substr($declarations[$dimension], -2)) != 'px')
            {
                continue;
            }
            $candidate = substr($declarations[$dimension], 0, -2);
            // make sure it is an integer and not something else
            if (filter_var($candidate, FILTER_VALIDATE_INT))
            {
                $size[$dimension] = $candidate;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement:
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector
    // that specifies a height or width. In that case the class or id needs the img subselector to apply here.

    // Ridiculously far future development:
    // If the class or id is specified in a SEPARATE css file that is not on the page, fetch it and do the same.

    return array('height' => $size['height'],
                 'width' => $size['width']);
}
941 | |||
// camel naming conventions: DOM-style aliases that forward to the methods of this class

// returns the raw attribute list
function getAllAttributes()
{
    return $this->attr;
}

// reads an attribute (or virtual property) through __get
function getAttribute($name)
{
    return $this->__get($name);
}

// writes an attribute (or virtual property) through __set
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

// tells whether the attribute exists, through __isset
function hasAttribute($name)
{
    return $this->__isset($name);
}

// removes an attribute by setting it to null (makeup() skips null values)
function removeAttribute($name)
{
    $this->__set($name, null);
}

// first element matching "#$id"
function getElementById($id)
{
    return $this->find("#$id", 0);
}

// elements matching "#$id": all of them, or the one at $idx
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

// first element matching the selector $name
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

// elements matching the selector $name: all of them, or the one at $idx
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

function parentNode()
{
    return $this->parent();
}

function childNodes($idx=-1)
{
    return $this->children($idx);
}

function firstChild()
{
    return $this->first_child();
}

function lastChild()
{
    return $this->last_child();
}

function nextSibling()
{
    return $this->next_sibling();
}

function previousSibling()
{
    return $this->prev_sibling();
}

function hasChildNodes()
{
    return $this->has_child();
}

function nodeName()
{
    return $this->tag;
}

// makes this node the parent of $node and hands $node back
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
961 | |||
962 | } | ||
963 | |||
964 | /** | ||
965 | * simple html dom parser | ||
966 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. | ||
967 | * Paperg - change $size from protected to public so we can easily access it | ||
968 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. | ||
969 | * | ||
970 | * @package PlaceLocalInclude | ||
971 | */ | ||
972 | class simple_html_dom | ||
973 | { | ||
974 | public $root = null; | ||
975 | public $nodes = array(); | ||
976 | public $callback = null; | ||
977 | public $lowercase = false; | ||
978 | // Used to keep track of how large the text was when we started. | ||
979 | public $original_size; | ||
980 | public $size; | ||
981 | protected $pos; | ||
982 | protected $doc; | ||
983 | protected $char; | ||
984 | protected $cursor; | ||
985 | protected $parent; | ||
986 | protected $noise = array(); | ||
987 | protected $token_blank = " \t\r\n"; | ||
988 | protected $token_equal = ' =/>'; | ||
989 | protected $token_slash = " />\r\n\t"; | ||
990 | protected $token_attr = ' >'; | ||
991 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. | ||
992 | public $_charset = ''; | ||
993 | public $_target_charset = ''; | ||
994 | protected $default_br_text = ""; | ||
995 | public $default_span_text = ""; | ||
996 | |||
997 | // use isset instead of in_array, performance boost about 30%... | ||
998 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); | ||
999 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); | ||
1000 | // Known sourceforge issue #2977341 | ||
1001 | // B tags that are not closed cause us to return everything to the end of the document. | ||
1002 | protected $optional_closing_tags = array( | ||
1003 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), | ||
1004 | 'th'=>array('th'=>1), | ||
1005 | 'td'=>array('td'=>1), | ||
1006 | 'li'=>array('li'=>1), | ||
1007 | 'dt'=>array('dt'=>1, 'dd'=>1), | ||
1008 | 'dd'=>array('dd'=>1, 'dt'=>1), | ||
1009 | 'dl'=>array('dd'=>1, 'dt'=>1), | ||
1010 | 'p'=>array('p'=>1), | ||
1011 | 'nobr'=>array('nobr'=>1), | ||
1012 | 'b'=>array('b'=>1), | ||
1013 | 'option'=>array('option'=>1), | ||
1014 | ); | ||
1015 | |||
/**
 * Creates the dom and, when $str is given, loads it right away.
 *
 * @param string|null $str             Html text, an http(s) url, or a path to an existing file.
 *                                     Urls and files are read through load_file().
 * @param bool        $lowercase       Passed to load() for html text.
 * @param bool        $forceTagsClosed When false, the list of optional closing tags is emptied,
 *                                     i.e. the html is trusted as written.
 * @param string      $target_charset  Charset stored as the target charset of this dom.
 * @param bool        $stripRN         Passed to load() for html text.
 * @param string      $defaultBRText   Passed to load() for html text.
 * @param string      $defaultSpanText Passed to load() for html text.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen BEFORE anything is loaded, and it has to target the
    // declared $optional_closing_tags property, otherwise the flag has no effect.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        // http and https urls as well as existing files are read from their source
        if (preg_match("/^https?:\/\//i",$str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1035 | |||
// on destruction, release everything this dom holds by running clear()
function __destruct()
{
    $this->clear();
}
1040 | |||
/**
 * Loads html from a string and parses it.
 *
 * After prepare(), a fixed sequence of noise patterns is removed
 * (comments, cdata, script, style, code, server side and smarty blocks),
 * then the document is parsed and its charset detected.
 *
 * @param string $str             Html text.
 * @param bool   $lowercase       Passed to prepare().
 * @param bool   $stripRN         Passed to prepare().
 * @param string $defaultBRText   Passed to prepare().
 * @param string $defaultSpanText Passed to prepare().
 * @return object $this, so that calls can be chained.
 */
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Noise patterns in removal order; the flag tells whether the original
    // call passed true as second argument to remove_noise().
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tag removal precedes style tag removal.
    $noisePatterns = array(
        // comments
        "'<!--(.*?)-->'is" => false,
        // cdata
        "'<!\[CDATA\[(.*?)\]\]>'is" => true,
        // <script> tags
        "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is" => false,
        "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is" => false,
        // <style> tags
        "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is" => false,
        "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is" => false,
        // preformatted tags
        "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is" => false,
        // server side scripts
        "'(<\?)(.*?)(\?>)'s" => true,
        // smarty scripts
        "'(\{\w)(.*?)(\})'s" => true,
    );
    foreach ($noisePatterns as $noisePattern => $withFlag)
    {
        if ($withFlag)
        {
            $this->remove_noise($noisePattern, true);
        }
        else
        {
            $this->remove_noise($noisePattern);
        }
    }

    // parsing: keep going for as long as parse() reports more input
    do
    {
        $more = $this->parse();
    }
    while ($more);

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1077 | |||
/**
 * Loads html from a file or url.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * @return mixed false when the content could not be read (the dom is cleared
 *               in that case), nothing on success.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Judge success by the return value of this very read. error_get_last()
    // would also report unrelated, older errors and wipe a good document.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1089 | |||
/**
 * Sets the callback stored on this dom.
 *
 * @param mixed $function_name The callback to store.
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1095 | |||
/**
 * Removes the callback stored on this dom by resetting it to null.
 */
function remove_callback()
{
    $this->callback = null;
}
1101 | |||
/**
 * Renders the dom as a string and optionally writes it to a file.
 *
 * @param string $filepath When not '', the rendered text is also written to
 *                         this path under an exclusive lock.
 * @return mixed The rendered text (the root node's innertext).
 */
function save($filepath='')
{
    $rendered = $this->root->innertext();
    if ($filepath === '')
    {
        return $rendered;
    }
    file_put_contents($filepath, $rendered, LOCK_EX);
    return $rendered;
}
1109 | |||
// Look up dom nodes by css selector; the search is delegated to the root node.
// Paperg - $lowercase requests case insensitive testing of the value of the selector.
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1116 | |||
// Release everything this dom holds, to work around the php5 circular references memory leak.
function clear()
{
    foreach ($this->nodes as $node)
    {
        $node->clear();
    }
    unset($node);

    // Clearing children as well is documented in the sourceforge repository (2977248)
    // as a fix for memory leaks that occur even with the use of clear.
    if (isset($this->children))
    {
        foreach ($this->children as $child)
        {
            $child->clear();
        }
        unset($child);
    }

    if (isset($this->parent))
    {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root))
    {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
1128 | |||
// Dump the tree by delegating to the root node's dump().
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1133 | |||
// Reset the parser state for a new HTML string and create the root node.
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Keep the size of the html exactly as it was handed to us. It might be useful to someone.
    $this->original_size = strlen($str);

    // Before the string becomes the doc, turn every \r and \n into a space if we are told to.
    if ($stripRN)
    {
        $str = str_replace(array("\r", "\n"), " ", $str);
    }

    // Length of the content that will actually be parsed.
    $this->size = strlen($str);

    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->root = $root;
    $this->parent = $root;

    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1168 | |||
// Parse the next piece of html content: either a run of text up to the next '<', or a tag.
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '')
    {
        // Nothing before the next '<' (or nothing left): let read_tag() decide.
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);
    return true;
}
1184 | |||
// PAPERG - dkchou - try to identify the character set of the page we have just parsed so we know better how to spit it out later.
// NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
// Lookup order: content-type header, <meta http-equiv=Content-Type>, mb_detect_encoding (when available), then a UTF-8 default.
// The result is stored in $this->_charset and returned.
protected function parse_charset()
{
    global $debug_object;

    // "charset=value": case-insensitive, tolerates blanks and quotes around the value,
    // and stops at whitespace or ';' so trailing parameters are not captured.
    $charsetPattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match($charsetPattern, $contentTypeHeader, $matches);
        if ($success)
        {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset))
    {
        // Third argument asks find() for case insensitive matching of the selector value,
        // so http-equiv="content-type" is found as well.
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
        if (!empty($el))
        {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue))
            {
                $success = preg_match($charsetPattern, $fullvalue, $matches);
                if ($success)
                {
                    $charset = $matches[1];
                }
                else
                {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset))
    {
        $charset = false;

        // mbstring is an optional extension; without this guard the call is a fatal error.
        if (function_exists('mb_detect_encoding'))
        {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ));
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}
        }

        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true))
    {
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1257 | |||
// Read one tag at the current position and attach the resulting node to the tree.
// Handles closing tags, "<!" declarations and comments, malformed tags (kept as text)
// and opening tags together with their attributes.
// Returns false only when the current character is not '<' (the root's end index is
// recorded first); every other path returns true.
protected function read_tag()
{
    if ($this->char!=='<')
    {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/')
    {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The closing tag does not match the currently open element.
        if ($parent_lower!==$tag_lower)
        {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up looking for the element this tag closes.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower)
                {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text
    // (the original assigned the boolean comparison result to an unused $pos; only the test matters)
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep the whole thing as text.
    // The hyphen sits last in the class so it is a literal: "[\w-:]" is rejected by
    // PCRE2 (PHP 7.3+) as an invalid range, which made preg_match() fail for every tag.
    if (!preg_match("/^[\w:-]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower]) )
    {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
        {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do
    {
        if ($this->char!==null && $space[0]==='')
        {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos)
        {
            // No progress since the previous pass: step one character forward.
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/')
    {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else
    {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br")
    {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1482 | |||
// Parse the value of attribute $name at the current position and store it on $node.
// $space[2] receives the blanks skipped between '=' and the value.
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    if (isset($node->attr[$name]))
    {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $quote = $this->char;
    if ($quote === '"' || $quote === '\'')
    {
        // Quoted value: step over the opening quote, copy up to the matching one, step over it.
        $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->restore_noise($this->copy_until_char_escape($quote));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }
    else
    {
        // Unquoted value: copy up to the first character listed in token_attr.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $value = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class")
    {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
1519 | |||
// Attach $node to the current parent: always as one of its nodes, and also as a child when $is_child is set.
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;
    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child)
    {
        return;
    }
    $owner->children[] = $node;
}
1530 | |||
// Keep a closing tag as a text node ("</tag>"), then move on to the next character.
protected function as_text_node($tag)
{
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($textNode, false);

    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    return true;
}
1541 | |||
// Advance past the leading run of characters contained in $chars and refresh the current character.
protected function skip($chars)
{
    $span = strspn($this->doc, $chars, $this->pos);
    $this->pos += $span;

    if ($this->pos < $this->size)
    {
        $this->char = $this->doc[$this->pos]; // next
    }
    else
    {
        $this->char = null;
    }
}
1547 | |||
// Like skip(), but returns the run of $chars that was skipped ('' when there was none).
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    $this->pos = $start + $length;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return ($length === 0) ? '' : substr($this->doc, $start, $length);
}
1557 | |||
// Return everything from the current position up to the first character contained in $chars, and stop there.
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $length;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $length);
}
1566 | |||
// Return everything from the current position up to the next occurrence of $char and stop on it.
// When $char does not occur again, the rest of the document is returned and the position moves to the end.
protected function copy_until_char($char)
{
    if ($this->char === null)
    {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false)
    {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    // Already standing on $char: nothing to copy, nothing to move.
    if ($hit === $from)
    {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
1584 | |||
// Same as copy_until_char(), except that an occurrence of $char directly preceded by a backslash is skipped.
protected function copy_until_char_escape($char)
{
    if ($this->char === null)
    {
        return '';
    }

    $origin = $this->pos;   // where copying starts; unchanged while searching
    $searchFrom = $origin;  // where the next strpos() begins

    for (;;)
    {
        $hit = strpos($this->doc, $char, $searchFrom);

        if ($hit === false)
        {
            $rest = substr($this->doc, $origin, $this->size - $origin);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $origin)
        {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\')
        {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $origin, $hit - $origin);
        }

        // Escaped occurrence: keep looking right after it.
        $searchFrom = $hit + 1;
    }
}
1613 | |||
// Replace every match of $pattern in the document with a placeholder key and
// remember the removed text in the $this->noise array under that key.
// With $remove_tag the whole match is replaced, otherwise only capture group 1.
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $total = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = ($remove_tag) ? 0 : 1;

    // Walk the matches back to front so the offsets of earlier matches stay valid.
    for ($i = $total - 1; $i >= 0; --$i)
    {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }

        $fragment = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1639 | |||
// Put removed noise back into $text: every '___noise___' marker followed by its
// 5 character number is replaced by the text stored under that key in $this->noise.
// Restored text is scanned again, so noise that itself contains markers is expanded too.
// Returns the resulting string.
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    // Where the next search starts. It only moves forward past text that must not be
    // examined again (the "undefined key" message, which itself contains the marker).
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            $key = '___noise___'.substr($text, $pos+11, 5);
            if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                $offset = $pos;
            }
            else
            {
                // The message repeats the key, marker included. Resume the search after it;
                // searching from the start again would find the same key forever.
                $message = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$message.substr($text, $pos+16);
                $offset = $pos + strlen($message);
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
            $offset = $pos;
        }
    }
    return $text;
}
1672 | |||
// Sometimes we NEED one of the noise elements.
// Returns the first stored noise element that contains $text, or null when none does.
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    foreach ($this->noise as $fragment)
    {
        if (strpos($fragment, $text) === false)
        {
            continue;
        }
        return $fragment;
    }

    return null;
}
// String conversion yields the root's inner text.
function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
1691 | |||
// Virtual read-only properties: outertext, innertext, plaintext, charset and target_charset.
// Any other name yields null.
function __get($name)
{
    // For the document as a whole, outer and inner text are the same thing.
    if ($name === 'outertext' || $name === 'innertext')
    {
        return $this->root->innertext();
    }
    if ($name === 'plaintext')
    {
        return $this->root->text();
    }
    if ($name === 'charset')
    {
        return $this->_charset;
    }
    if ($name === 'target_charset')
    {
        return $this->_target_charset;
    }
}
1708 | |||
// camel naming conventions: the methods from here on are camelCase wrappers.
// childNodes() forwards to the root node's childNodes().
function childNodes($idx=-1)
{
    $root = $this->root;
    return $root->childNodes($idx);
}
// camelCase wrapper for the root node's first_child().
function firstChild()
{
    $root = $this->root;
    return $root->first_child();
}
// camelCase wrapper for the root node's last_child().
function lastChild()
{
    $root = $this->root;
    return $root->last_child();
}
// Build a <$name> element with $value as its content by parsing a small html
// snippet, and return the first child of the parsed snippet.
function createElement($name, $value=null)
{
    return @str_get_html("<{$name}>{$value}</{$name}>")->first_child();
}
// Parse $value with str_get_html() and return the last node of the result.
// Returns null when str_get_html() does not hand back a dom object, instead of
// relying on error suppression (a non-array passed to end() is a TypeError on PHP 8).
function createTextNode($value)
{
    $dom = @str_get_html($value);
    if (!is_object($dom))
    {
        return null;
    }

    // end() needs a real variable, so copy the node list first.
    $nodes = $dom->nodes;
    return end($nodes);
}
// First node matching the selector "#<id>".
function getElementById($id)
{
    return $this->find('#' . $id, 0);
}
// Nodes matching the selector "#<id>"; $idx is passed through to find().
function getElementsById($id, $idx=null)
{
    return $this->find('#' . $id, $idx);
}
// First node matching the selector $name.
function getElementByTagName($name)
{
    $selector = $name;
    return $this->find($selector, 0);
}
// Nodes matching the selector $name; $idx is passed through to find().
function getElementsByTagName($name, $idx=-1)
{
    $selector = $name;
    return $this->find($selector, $idx);
}
// camelCase wrapper for load_file(): every argument is forwarded as-is and the
// result of load_file() is returned.
function loadFile()
{
    $args = func_get_args();

    // Spread the arguments. Passing $args itself would hand load_file() a single
    // array, which it would then forward to file_get_contents() as the filename.
    return call_user_func_array(array($this, 'load_file'), $args);
}
1720 | } | ||
1721 | |||
1722 | ?> \ No newline at end of file | ||