aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty
diff options
context:
space:
mode:
author Nicolas Lœuillet <nicolas.loeuillet@gmail.com> 2013-08-02 22:40:51 +0200
committer Nicolas Lœuillet <nicolas.loeuillet@gmail.com> 2013-08-02 22:40:51 +0200
commit a4565e88edbc8e3bd092a475469769c86a4c350c (patch)
tree a6a3c935b03a23ff87575c8c315cf8ba78fe68c2 /inc/3rdparty
parent f6c9baab3efeec1d0efa151e276fc08d5b58f9e9 (diff)
downloadwallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.gz
wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.zst
wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.zip
add Twig & refactor poche
Diffstat (limited to 'inc/3rdparty')
-rw-r--r-- inc/3rdparty/Encoding.php 262
-rw-r--r-- inc/3rdparty/JSLikeHTMLElement.php 109
-rw-r--r-- inc/3rdparty/Readability.php 1137
-rw-r--r-- inc/3rdparty/Session.class.php 136
-rw-r--r-- inc/3rdparty/simple_html_dom.php 1722
5 files changed, 3366 insertions, 0 deletions
diff --git a/inc/3rdparty/Encoding.php b/inc/3rdparty/Encoding.php
new file mode 100644
index 00000000..577763b4
--- /dev/null
+++ b/inc/3rdparty/Encoding.php
@@ -0,0 +1,262 @@
1<?php
2/**
3 * @author "Sebastián Grignoli" <grignoli@framework2.com.ar>
4 * @package Encoding
5 * @version 1.1
6 * @link http://www.framework2.com.ar/dzone/forceUTF8-es/
7 * @example http://www.framework2.com.ar/dzone/forceUTF8-es/
8 */
9
class Encoding
{
    /**
     * Windows-1252 bytes 0x80-0x9F, keyed by ordinal, mapped to their UTF-8
     * encoding. Ordinals missing from the table (129, 141, 143, 144, 157)
     * have no entry and fall back to the plain Latin-1 conversion.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",

        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",

        142 => "\xc5\xbd",


        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",

        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * Two-byte sequences 0xC2 0x80-0x9F (what a Windows-1252 byte becomes
     * when it is converted as if it were ISO-8859-1) mapped to the UTF-8
     * sequence of the character that was actually meant.
     */
    protected static $brokenUtf8ToUtf8 = array(
        "\xc2\x80" => "\xe2\x82\xac",

        "\xc2\x82" => "\xe2\x80\x9a",
        "\xc2\x83" => "\xc6\x92",
        "\xc2\x84" => "\xe2\x80\x9e",
        "\xc2\x85" => "\xe2\x80\xa6",
        "\xc2\x86" => "\xe2\x80\xa0",
        "\xc2\x87" => "\xe2\x80\xa1",
        "\xc2\x88" => "\xcb\x86",
        "\xc2\x89" => "\xe2\x80\xb0",
        "\xc2\x8a" => "\xc5\xa0",
        "\xc2\x8b" => "\xe2\x80\xb9",
        "\xc2\x8c" => "\xc5\x92",

        "\xc2\x8e" => "\xc5\xbd",


        "\xc2\x91" => "\xe2\x80\x98",
        "\xc2\x92" => "\xe2\x80\x99",
        "\xc2\x93" => "\xe2\x80\x9c",
        "\xc2\x94" => "\xe2\x80\x9d",
        "\xc2\x95" => "\xe2\x80\xa2",
        "\xc2\x96" => "\xe2\x80\x93",
        "\xc2\x97" => "\xe2\x80\x94",
        "\xc2\x98" => "\xcb\x9c",
        "\xc2\x99" => "\xe2\x84\xa2",
        "\xc2\x9a" => "\xc5\xa1",
        "\xc2\x9b" => "\xe2\x80\xba",
        "\xc2\x9c" => "\xc5\x93",

        "\xc2\x9e" => "\xc5\xbe",
        "\xc2\x9f" => "\xc5\xb8"
    );

    /**
     * Inverse of $win1252ToUtf8: UTF-8 sequence => single Windows-1252 byte.
     */
    protected static $utf8ToWin1252 = array(
        "\xe2\x82\xac" => "\x80",

        "\xe2\x80\x9a" => "\x82",
        "\xc6\x92" => "\x83",
        "\xe2\x80\x9e" => "\x84",
        "\xe2\x80\xa6" => "\x85",
        "\xe2\x80\xa0" => "\x86",
        "\xe2\x80\xa1" => "\x87",
        "\xcb\x86" => "\x88",
        "\xe2\x80\xb0" => "\x89",
        "\xc5\xa0" => "\x8a",
        "\xe2\x80\xb9" => "\x8b",
        "\xc5\x92" => "\x8c",

        "\xc5\xbd" => "\x8e",


        "\xe2\x80\x98" => "\x91",
        "\xe2\x80\x99" => "\x92",
        "\xe2\x80\x9c" => "\x93",
        "\xe2\x80\x9d" => "\x94",
        "\xe2\x80\xa2" => "\x95",
        "\xe2\x80\x93" => "\x96",
        "\xe2\x80\x94" => "\x97",
        "\xcb\x9c" => "\x98",
        "\xe2\x84\xa2" => "\x99",
        "\xc5\xa1" => "\x9a",
        "\xe2\x80\xba" => "\x9b",
        "\xc5\x93" => "\x9c",

        "\xc5\xbe" => "\x9e",
        "\xc5\xb8" => "\x9f"
    );

    /**
     * Leaves UTF-8 sequences alone while converting every other high byte
     * to UTF-8, assuming the source is Windows-1252 or ISO-8859-1.
     *
     * Because detection is purely structural, a Latin-1 lead character
     * (0xC0-0xF7) that happens to be followed by the right number of bytes
     * in 0x80-0xBF is taken for UTF-8 and left unchanged. For example in
     * %ABREPRESENT%C9%BB the "%C9%BB" pair is kept as-is.
     *
     * Arrays are converted element by element (recursively, keys kept);
     * values that are neither array nor string are returned untouched.
     *
     * @param  mixed $text Any string, or array of strings.
     * @return mixed The same value, UTF-8 encoded.
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }
            return $text;
        }
        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';
        for ($i = 0; $i < $max; $i++) {
            $byte = ord($text[$i]);

            if ($byte < 0x80) {
                // Plain ASCII: nothing to convert.
                $buf .= $text[$i];
                continue;
            }

            if ($byte < 0xc0) {
                // A continuation-range byte with no lead byte in front of it:
                // treat it as Windows-1252 / Latin-1.
                $buf .= isset(self::$win1252ToUtf8[$byte])
                    ? self::$win1252ToUtf8[$byte]
                    : self::latin1ByteToUtf8($byte);
                continue;
            }

            // Number of continuation bytes a UTF-8 lead byte of this value announces.
            if ($byte <= 0xdf) {
                $trail = 1;
            } elseif ($byte <= 0xef) {
                $trail = 2;
            } elseif ($byte <= 0xf7) {
                $trail = 3;
            } else {
                $trail = 0; // 0xF8-0xFF never start a UTF-8 sequence
            }

            if ($trail > 0 && self::hasContinuationBytes($text, $i + 1, $trail, $max)) {
                // Almost certainly UTF-8 already: copy the WHOLE sequence
                // (including the 4th byte of 4-byte sequences) and skip it.
                $buf .= substr($text, $i, $trail + 1);
                $i += $trail;
            } else {
                $buf .= self::latin1ByteToUtf8($byte);
            }
        }
        return $buf;
    }

    /**
     * Encodes one ISO-8859-1 byte (given as ordinal 0x80-0xFF) as its
     * two-byte UTF-8 sequence.
     *
     * @param  int $byte
     * @return string
     */
    private static function latin1ByteToUtf8($byte)
    {
        return chr(0xc0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3f));
    }

    /**
     * Tells whether $count bytes starting at $start all exist and lie in
     * the UTF-8 continuation range 0x80-0xBF.
     *
     * @param  string $text
     * @param  int    $start
     * @param  int    $count
     * @param  int    $max   strlen($text)
     * @return bool
     */
    private static function hasContinuationBytes($text, $start, $count, $max)
    {
        if ($start + $count > $max) {
            return false;
        }
        for ($k = $start; $k < $start + $count; $k++) {
            $o = ord($text[$k]);
            if ($o < 0x80 || $o > 0xbf) {
                return false;
            }
        }
        return true;
    }

    /**
     * Replaces the UTF-8 sequences listed in $utf8ToWin1252 by their
     * single Windows-1252 byte. Sequential str_replace() is kept on purpose
     * so results match the historical behaviour byte for byte.
     *
     * @param  mixed $text
     * @return mixed
     */
    private static function utf8SpecialsToWin1252($text)
    {
        return str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text);
    }

    /**
     * Converts a string (or array of strings) to Windows-1252.
     * The input is first normalised with toUTF8().
     *
     * NOTE: relies on utf8_decode(), which is deprecated as of PHP 8.2.
     *
     * @param  mixed $text
     * @return mixed
     */
    public static function toWin1252($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toWin1252($v);
            }
            return $text;
        }
        if (is_string($text)) {
            return utf8_decode(self::utf8SpecialsToWin1252(self::toUTF8($text)));
        }
        return $text;
    }

    /** Alias of toWin1252(). */
    public static function toISO8859($text)
    {
        return self::toWin1252($text);
    }

    /** Alias of toWin1252(). */
    public static function toLatin1($text)
    {
        return self::toWin1252($text);
    }

    /**
     * Repairs text that was UTF-8 encoded more than once by decoding and
     * re-encoding it until the result stops changing.
     *
     * @param  mixed $text String or array of strings.
     * @return mixed
     */
    public static function fixUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::fixUTF8($v);
            }
            return $text;
        }

        $last = '';
        while ($last !== $text) {
            $last = $text;
            $text = self::toUTF8(utf8_decode(self::utf8SpecialsToWin1252($text)));
        }
        // One extra pass, as the original implementation always performed.
        $text = self::toUTF8(utf8_decode(self::utf8SpecialsToWin1252($text)));
        return $text;
    }

    /**
     * Fixes a UTF-8 string that was converted from Windows-1252 as if it
     * were ISO-8859-1 (i.e. characters 0x80-0x9F were mis-mapped).
     * See: http://en.wikipedia.org/wiki/Windows-1252
     *
     * @param  mixed $text
     * @return mixed
     */
    public static function UTF8FixWin1252Chars($text)
    {
        return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
    }

    /**
     * Strips a leading UTF-8 byte order mark, if present.
     *
     * @param  string $str
     * @return string
     */
    public static function removeBOM($str = "")
    {
        if (substr($str, 0, 3) === "\xef\xbb\xbf") {
            $str = substr($str, 3);
        }
        return $str;
    }
}
diff --git a/inc/3rdparty/JSLikeHTMLElement.php b/inc/3rdparty/JSLikeHTMLElement.php
new file mode 100644
index 00000000..238ba8a8
--- /dev/null
+++ b/inc/3rdparty/JSLikeHTMLElement.php
@@ -0,0 +1,109 @@
1<?php
2/**
3* JavaScript-like HTML DOM Element
4*
5* This class extends PHP's DOMElement to allow
6* users to get and set the innerHTML property of
7* HTML elements in the same way it's done in
8* JavaScript.
9*
10* Example usage:
11* @code
12* require_once 'JSLikeHTMLElement.php';
13* header('Content-Type: text/plain');
14* $doc = new DOMDocument();
15* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
16* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
17* $elem = $doc->getElementsByTagName('div')->item(0);
18*
19* // print innerHTML
20* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
21* echo "\n\n";
22*
23* // set innerHTML
24* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
25* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
26* echo "\n\n";
27*
28* // print document (with our changes)
29* echo $doc->saveXML();
30* @endcode
31*
32* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
33* @see http://fivefilters.org (the project this was written for)
34*/
class JSLikeHTMLElement extends DOMElement
{
    /**
     * Used for setting innerHTML like it's done in JavaScript:
     * @code
     * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
     * @endcode
     *
     * The element is always emptied first. The new markup is parsed as XML
     * when it is well-formed; otherwise it is re-parsed leniently as HTML.
     * If both attempts fail the element is simply left empty.
     * Any other property name raises an E_USER_NOTICE.
     *
     * @param string $name  Property being written.
     * @param mixed  $value New inner markup (cast to string).
     */
    public function __set($name, $value)
    {
        if ($name !== 'innerHTML') {
            $trace = debug_backtrace();
            trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
            return;
        }

        // first, empty the element
        for ($x = $this->childNodes->length - 1; $x >= 0; $x--) {
            $this->removeChild($this->childNodes->item($x));
        }

        // Cast once so the emptiness test does not depend on PHP's
        // version-specific loose comparison rules (0 == '' changed in PHP 8).
        $value = (string) $value;
        if ($value === '') {
            return;
        }

        $fragment = $this->ownerDocument->createDocumentFragment();
        // appendXML() expects well-formed markup (XHTML)
        if (@$fragment->appendXML($value)) { // @ to suppress PHP warnings
            if ($fragment->hasChildNodes()) {
                $this->appendChild($fragment);
            }
            return;
        }

        // $value is probably ill-formed
        $doc = new DOMDocument();
        $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
        // Using <htmlfragment> will generate a warning, but so will bad HTML
        // (and by this point, bad HTML is what we've got).
        // We use it (and suppress the warning) because an HTML fragment will
        // be wrapped around <html><body> tags which we don't really want to keep.
        // Note: despite the warning, if loadHTML succeeds it will return true.
        if (!@$doc->loadHTML('<htmlfragment>'.$value.'</htmlfragment>')) {
            // oh well, we tried, we really did. :(
            // this element is now empty
            return;
        }

        $import = $doc->getElementsByTagName('htmlfragment')->item(0);
        if ($import === null) {
            // The parser dropped our wrapper; nothing we can safely import.
            return;
        }
        foreach ($import->childNodes as $child) {
            $this->appendChild($this->ownerDocument->importNode($child, true));
        }
    }

    /**
     * Used for getting innerHTML like it's done in JavaScript:
     * @code
     * $string = $div->innerHTML;
     * @endcode
     *
     * @param  string $name Property being read.
     * @return string|null  Serialised child nodes for 'innerHTML'; null
     *                      (after an E_USER_NOTICE) for any other name.
     */
    public function __get($name)
    {
        if ($name === 'innerHTML') {
            $inner = '';
            foreach ($this->childNodes as $child) {
                $inner .= $this->ownerDocument->saveXML($child);
            }
            return $inner;
        }

        $trace = debug_backtrace();
        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
        return null;
    }

    /**
     * @return string The tag name in square brackets, e.g. "[div]".
     */
    public function __toString()
    {
        return '['.$this->tagName.']';
    }
}
diff --git a/inc/3rdparty/Readability.php b/inc/3rdparty/Readability.php
new file mode 100644
index 00000000..e1e8738b
--- /dev/null
+++ b/inc/3rdparty/Readability.php
@@ -0,0 +1,1137 @@
1<?php
2/**
3* Arc90's Readability ported to PHP for FiveFilters.org
4* Based on readability.js version 1.7.1 (without multi-page support)
5* Updated to allow HTML5 parsing with html5lib
6* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
7* ------------------------------------------------------
8* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
9* Arc90's project URL: http://lab.arc90.com/experiments/readability/
10* JS Source: http://code.google.com/p/arc90labs-readability
11* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
12* More information: http://fivefilters.org/content-only/
13* License: Apache License, Version 2.0
14* Requires: PHP5
15* Date: 2012-09-19
16*
17* Differences between the PHP port and the original
18* ------------------------------------------------------
19* Arc90's Readability is designed to run in the browser. It works on the DOM
20* tree (the parsed HTML) after the page's CSS styles have been applied and
21* Javascript code executed. This PHP port does not run inside a browser.
22* We use PHP's ability to parse HTML to build our DOM tree, but we cannot
23* rely on CSS or Javascript support. As such, the results will not always
24* match Arc90's Readability. (For example, if a web page contains CSS style
25* rules or Javascript code which hide certain HTML elements from display,
26* Arc90's Readability will dismiss those from consideration but our PHP port,
27* unable to understand CSS or Javascript, will not know any better.)
28*
29* Another significant difference is that the aim of Arc90's Readability is
30* to re-present the main content block of a given web page so users can
31* read it more easily in their browsers. Correct identification, clean up,
32* and separation of the content block is only a part of this process.
33* This PHP port is only concerned with this part, it does not include code
34* that relates to presentation in the browser - Arc90 already do
35* that extremely well, and for PDF output there's FiveFilters.org's
36* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
37*
38* Finally, this class contains methods that might be useful for developers
39* working on HTML document fragments. So without deviating too much from
40* the original code (which I don't want to do because it makes debugging
41* and updating more difficult), I've tried to make it a little more
42* developer friendly. You should be able to use the methods here on
43* existing DOMElement objects without passing an entire HTML document to
44* be parsed.
45*/
46
47// This class allows us to do JavaScript-like assignments to innerHTML
48require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
49
50// Alternative usage (for testing only!)
51// uncomment the lines below and call Readability.php in your browser
52// passing it the URL of the page you'd like content from, e.g.:
53// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
54
55/*
56if (!isset($_GET['url']) || $_GET['url'] == '') {
57 die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
58}
59$url = $_GET['url'];
60if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
61$html = file_get_contents($url);
62$r = new Readability($html, $url);
63$r->init();
64echo $r->articleContent->innerHTML;
65*/
66
67class Readability
68{
69 public $version = '1.7.1-without-multi-page';
70 public $convertLinksToFootnotes = false;
71 public $revertForcedParagraphElements = true;
72 public $articleTitle;
73 public $articleContent;
74 public $dom;
75 public $url = null; // optional - URL where HTML was retrieved
76 public $debug = false;
77 public $lightClean = true; // preserves more content (experimental) added 2012-09-19
78 protected $body = null; //
79 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
80 protected $flags = 7; // 1 | 2 | 4; // Start with all flags set.
81 protected $success = false; // indicates whether we were able to extract or not
82
83 /**
84 * All of the regular expressions in use within readability.
85 * Defined up here so we don't instantiate them repeatedly in loops.
86 **/
87 public $regexps = array(
88 'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
89 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
90 'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
91 'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
92 'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
93 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
94 'replaceFonts' => '/<(\/?)font[^>]*>/i',
95 // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
96 'normalize' => '/\s{2,}/',
97 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
98 'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
99 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
100 );
101
102 /* constants */
103 const FLAG_STRIP_UNLIKELYS = 1;
104 const FLAG_WEIGHT_CLASSES = 2;
105 const FLAG_CLEAN_CONDITIONALLY = 4;
106
/**
 * Create instance of Readability
 *
 * Pre-processes the raw markup (double <br> to paragraph breaks, <font>
 * to <span>), converts it to HTML entities and parses it into $this->dom.
 *
 * @param string UTF-8 encoded string
 * @param string (optional) URL associated with HTML (used for footnotes)
 * @param string which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
 */
public function __construct($html, $url=null, $parser='libxml')
{
    $this->url = $url;

    /* Turn all double br's into p's, and font tags into spans. */
    // preg_replace() returns null when PCRE fails (e.g. backtrack limit on
    // a huge page); keep the unmodified markup rather than losing it all.
    $replaced = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }
    $replaced = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }

    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
    if (trim($html) == '') $html = '<html></html>';

    $this->dom = null;
    // Only attempt html5lib when its parser class is actually available;
    // otherwise fall through to libxml instead of dying on a missing class.
    if ($parser == 'html5lib' && class_exists('HTML5_Parser')) {
        $this->dom = HTML5_Parser::parse($html);
    }
    if (!$this->dom) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html); // real-world HTML is noisy; warnings are expected
    }
    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
130
/**
 * Accessor for the title element stored in $articleTitle
 * (assigned at the end of init()).
 *
 * @return DOMElement
 */
public function getTitle()
{
    return $this->articleTitle;
}
138
/**
 * Accessor for the content element stored in $articleContent
 * (assigned at the end of init()).
 *
 * @return DOMElement
 */
public function getContent()
{
    return $this->articleContent;
}
146
/**
 * Runs readability.
 *
 * Steps performed:
 *  1. Strip scripts and remember the <body> element and its original HTML.
 *  2. Prepare the document (prepDocument()).
 *  3. Extract the title and the article content.
 *  4. Replace the body's children with
 *     div#readOverlay > div#readInner > (title, content).
 *  5. Post-process the content and expose title/content on the instance.
 *
 * When no content can be extracted, a placeholder div#readability-content
 * with an apology message is used instead and false is returned.
 *
 * @return boolean true if we found content, false otherwise
 **/
public function init()
{
    if (!isset($this->dom->documentElement)) {
        return false;
    }
    $this->removeScripts($this->dom);

    // Optimistic default; flipped below when extraction fails.
    $this->success = true;

    $bodies = $this->dom->getElementsByTagName('body');
    if ($bodies->length > 0) {
        $firstBody = $bodies->item(0);
        if ($this->bodyCache == null) {
            $this->bodyCache = $firstBody->innerHTML;
        }
        if ($this->body == null) {
            $this->body = $firstBody;
        }
    }

    $this->prepDocument();

    /* Build readability's DOM tree */
    $overlay = $this->dom->createElement('div');
    $innerDiv = $this->dom->createElement('div');
    $title = $this->getArticleTitle();
    $content = $this->grabArticle();

    if (!$content) {
        $this->success = false;
        $content = $this->dom->createElement('div');
        $content->setAttribute('id', 'readability-content');
        $content->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
    }

    $overlay->setAttribute('id', 'readOverlay');
    $innerDiv->setAttribute('id', 'readInner');

    /* Glue the structure of our document together. */
    $innerDiv->appendChild($title);
    $innerDiv->appendChild($content);
    $overlay->appendChild($innerDiv);

    /* Clear the old HTML, insert the new content. */
    $this->body->innerHTML = '';
    $this->body->appendChild($overlay);
    $this->body->removeAttribute('style');

    $this->postProcessContent($content);

    // Publish the results.
    $this->articleTitle = $title;
    $this->articleContent = $content;

    return $this->success;
}
219
/**
 * Echo a debug line ("* <msg>\n") when $this->debug is enabled.
 *
 * @param string $msg
 * @return void
 */
protected function dbg($msg)
{
    if (!$this->debug) {
        return;
    }
    echo '* ' . $msg . "\n";
}
226
/**
 * Run any post-process modifications to article content as necessary.
 *
 * Currently this only converts links to footnotes, and only when
 * $convertLinksToFootnotes is enabled and the source URL does not
 * contain "wikipedia.org".
 *
 * @param DOMElement
 * @return void
 */
public function postProcessContent($articleContent)
{
    if (!$this->convertLinksToFootnotes) {
        return;
    }
    // $url is optional and may be null; cast so preg_match() always
    // receives a string (passing null is deprecated since PHP 8.1).
    if (preg_match('/wikipedia\.org/', (string) $this->url)) {
        return;
    }
    $this->addFootnotes($articleContent);
}
238
/**
 * Get the article title as an H1.
 *
 * Starts from the text of the document's <title> and tries to strip a
 * site name separated by " | ", " - " or ": ". If the <title> is very
 * short or very long, a lone <h1> in the document is used instead. When
 * the result has four words or fewer, the original <title> text is kept.
 *
 * The title is inserted as a text node, so characters such as "<" or "&"
 * in it are never interpreted as markup.
 *
 * @return DOMElement
 */
protected function getArticleTitle()
{
    $curTitle = '';
    $origTitle = '';

    try {
        $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
    } catch (Exception $e) {
        // No usable <title>; continue with empty strings.
    }

    if (preg_match('/ [\|\-] /', $curTitle)) {
        // "Headline | Site": keep what precedes the last separator...
        $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);

        // ...unless that leaves too little, then keep what follows the first one.
        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);

        if (count(explode(' ', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
        }
    } elseif (strlen($curTitle) > 150 || strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');
        if ($hOnes->length == 1) {
            $curTitle = $this->getInnerText($hOnes->item(0));
        }
    }

    $curTitle = trim($curTitle);

    if (count(explode(' ', $curTitle)) <= 4) {
        $curTitle = $origTitle;
    }

    $articleTitle = $this->dom->createElement('h1');
    $curTitle = (string) $curTitle;
    if ($curTitle !== '') {
        // Plain text, not markup: append as a text node rather than via innerHTML.
        $articleTitle->appendChild($this->dom->createTextNode($curTitle));
    }

    return $articleTitle;
}
288
/**
 * Prepare the HTML document for readability to scrape it:
 * guarantee a body element (tagged id="readabilityBody") and remove
 * every <style> element from the document.
 *
 * @return void
 **/
protected function prepDocument()
{
    // Totally broken HTML may yield no body at all; create one so later
    // steps always have something to work on.
    if (null == $this->body) {
        $newBody = $this->dom->createElement('body');
        $this->body = $newBody;
        $this->dom->documentElement->appendChild($newBody);
    }
    $this->body->setAttribute('id', 'readabilityBody');

    /* Remove all style tags */
    $styles = $this->dom->getElementsByTagName('style');
    $idx = $styles->length;
    while (--$idx >= 0) {
        // Walk backwards: the node list is live and shrinks as we remove.
        $style = $styles->item($idx);
        $style->parentNode->removeChild($style);
    }

    // The JS original also rewrites double <br>s and <font> tags here.
    // In this port that happens in the constructor, on the raw HTML,
    // before it is parsed into a DOM tree.
}
320
/**
 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 *
 * For every link that is not skipped, a superscript "[n]" reference is
 * inserted right after it and a matching list entry is added to a
 * "References" block, which is appended to the article when at least
 * one footnote was produced. Links carrying the class
 * 'readability-DoNotFootnote' or whose text matches the
 * 'skipFootnoteLink' pattern are skipped.
 *
 * Link labels and host names are inserted as text nodes so they can
 * never be interpreted as markup.
 *
 * @param DOMElement $articleContent
 * @return void
 **/
public function addFootnotes($articleContent)
{
    $footnotesWrapper = $this->dom->createElement('div');
    $footnotesWrapper->setAttribute('id', 'readability-footnotes');
    $footnotesWrapper->innerHTML = '<h3>References</h3>';

    $articleFootnotes = $this->dom->createElement('ol');
    $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
    $footnotesWrapper->appendChild($articleFootnotes);

    // Live list: the reference links inserted below show up in it too,
    // and are skipped thanks to their 'readability-DoNotFootnote' class.
    $articleLinks = $articleContent->getElementsByTagName('a');

    $linkCount = 0;
    for ($i = 0; $i < $articleLinks->length; $i++) {
        $articleLink = $articleLinks->item($i);
        $linkText = $this->getInnerText($articleLink);

        if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
            continue;
        }

        $linkCount++;

        $footnoteLink = $articleLink->cloneNode(true);
        $linkDomain = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
        if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);

        /** Add a superscript reference after the article link */
        $refLink = $this->dom->createElement('a');
        $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
        $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
        $refLink->setAttribute('class', 'readability-DoNotFootnote');
        $refLink->setAttribute('style', 'color: inherit;');

        // insertBefore() with a null reference node appends, so this covers
        // both "link is the last child" and "link has a next sibling".
        $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);

        $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
        $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);

        $footnote = $this->dom->createElement('li');
        $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';

        // Label: the link's title attribute when present, else its text.
        $label = (string) ($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText);
        $footnoteLink->innerHTML = ''; // drop the cloned children
        if ($label !== '') {
            $footnoteLink->appendChild($this->dom->createTextNode($label));
        }
        $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);

        $footnote->appendChild($footnoteLink);
        if ($linkDomain) {
            $domainNote = $this->dom->createElement('small');
            $domainNote->appendChild($this->dom->createTextNode(' (' . $linkDomain . ')'));
            $footnote->appendChild($domainNote);
        }

        $articleFootnotes->appendChild($footnote);
    }

    if ($linkCount > 0) {
        $articleContent->appendChild($footnotesWrapper);
    }
}
387
/**
 * Replaces every descendant <p class="readability-styled"> of the given
 * element with a text node holding that paragraph's text content.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function revertReadabilityStyledElements($articleContent)
{
    $doc = $articleContent->ownerDocument;
    $finder = new DOMXPath($doc);
    $styled = $finder->query('.//p[@class="readability-styled"]', $articleContent);

    // Process matches last-to-first.
    $index = $styled->length;
    while ($index-- > 0) {
        $paragraph = $styled->item($index);
        $plainText = $doc->createTextNode($paragraph->textContent);
        $paragraph->parentNode->replaceChild($plainText, $paragraph);
    }
}
407
/**
 * Prepare the article node for display: strip inline styles and
 * redundant breaks, remove or conditionally remove junk elements,
 * drop empty paragraphs and remove <br>s that directly precede a <p>.
 *
 * @param DOMElement $articleContent
 * @return void
 */
public function prepArticle($articleContent)
{
    $this->cleanStyles($articleContent);
    $this->killBreaks($articleContent);
    if ($this->revertForcedParagraphElements) {
        $this->revertReadabilityStyledElements($articleContent);
    }

    /* Clean out junk from the article content */
    $this->cleanConditionally($articleContent, 'form');
    $this->clean($articleContent, 'object');
    $this->clean($articleContent, 'h1');

    // A single h2 is probably acting as the page header, which we already
    // have - remove it, unless lightClean asks us to keep more content.
    if (!$this->lightClean) {
        if ($articleContent->getElementsByTagName('h2')->length == 1) {
            $this->clean($articleContent, 'h2');
        }
    }
    $this->clean($articleContent, 'iframe');

    $this->cleanHeaders($articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these */
    foreach (array('table', 'ul', 'div') as $tag) {
        $this->cleanConditionally($articleContent, $tag);
    }

    /* Remove extra paragraphs: no media inside and no text */
    $paragraphs = $articleContent->getElementsByTagName('p');
    for ($idx = $paragraphs->length - 1; $idx >= 0; $idx--) {
        $paragraph = $paragraphs->item($idx);

        $hasMedia = false;
        foreach (array('img', 'embed', 'object', 'iframe') as $mediaTag) {
            if ($paragraph->getElementsByTagName($mediaTag)->length !== 0) {
                $hasMedia = true;
                break;
            }
        }

        if (!$hasMedia && $this->getInnerText($paragraph, false) == '') {
            $paragraph->parentNode->removeChild($paragraph);
        }
    }

    try {
        $markup = $articleContent->innerHTML;
        $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $markup);
    } catch (Exception $e) {
        $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
    }
}
466
/**
 * Attach a 'readability' attribute (the content score) to a node.
 * The score starts at 0, is adjusted by a fixed amount depending on the
 * tag name, and then has the node's class weight added to it.
 *
 * @param Element $node
 * @return void
 **/
protected function initializeNode($node)
{
    // Per-tag starting adjustments.
    $tagBonus = array(
        'DIV'        => 5,
        'PRE'        => 3,
        'TD'         => 3,
        'BLOCKQUOTE' => 3,
        'ADDRESS'    => -3,
        'OL'         => -3,
        'UL'         => -3,
        'DL'         => -3,
        'DD'         => -3,
        'DT'         => -3,
        'LI'         => -3,
        'FORM'       => -3,
        'H1'         => -5,
        'H2'         => -5,
        'H3'         => -5,
        'H4'         => -5,
        'H5'         => -5,
        'H6'         => -5,
        'TH'         => -5,
    );

    $score = $this->dom->createAttribute('readability');
    $score->value = 0; // this is our contentScore
    $node->setAttributeNode($score);

    // Upper-cased just in case the parser reports lower-case tag names.
    $tag = strtoupper($node->tagName);
    if (isset($tagBonus[$tag])) {
        $score->value += $tagBonus[$tag];
    }

    $score->value += $this->getClassWeight($node);
}
513
514 /***
515 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
516 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
517 *
518 * @return DOMElement
519 **/
protected function grabArticle($page=null) {
    $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
    if (!$page) $page = $this->dom;

    // This is a live list: every removal/replacement below shifts its indexes,
    // which is why $nodeIndex is decremented after such a mutation.
    $allElements = $page->getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     **/
    $nodesToScore = array();
    for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
        $tagName = strtoupper($node->tagName);

        /* Remove unlikely candidates */
        if ($stripUnlikelyCandidates) {
            $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
            if (
                preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
                !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
                $tagName != 'BODY'
            )
            {
                $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
                $node->parentNode->removeChild($node);
                $nodeIndex--;
                continue;
            }
        }

        if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
            $nodesToScore[] = $node;
        }

        /* Turn all divs that don't have children block level elements into p's */
        if ($tagName == 'DIV') {
            if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
                $newNode = $this->dom->createElement('p');
                try {
                    $newNode->innerHTML = $node->innerHTML;
                    $node->parentNode->replaceChild($newNode, $node);
                    // Step back so the next iteration revisits this index, which now
                    // holds the new <p>; it is queued for scoring there. The old
                    // <div> is detached at this point, so it is not queued.
                    $nodeIndex--;
                }
                catch(Exception $e) {
                    $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
                }
            }
            else
            {
                /* EXPERIMENTAL */
                // TODO: change these p elements back to text nodes after processing
                for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
                    $childNode = $node->childNodes->item($i);
                    if ($childNode->nodeType == 3) { // XML_TEXT_NODE
                        $p = $this->dom->createElement('p');
                        // nodeValue is already entity-decoded text. Insert it as a text
                        // node so literal '<' or '&' characters are not re-parsed as markup.
                        $p->appendChild($this->dom->createTextNode($childNode->nodeValue));
                        $p->setAttribute('style', 'display: inline;');
                        $p->setAttribute('class', 'readability-styled');
                        $childNode->parentNode->replaceChild($p, $childNode);
                    }
                }
            }
        }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node (and half of it to the grandparent).
     **/
    $candidates = array();
    foreach ($nodesToScore as $nodeToScore) {
        $parentNode = $nodeToScore->parentNode;
        $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
        $innerText = $this->getInnerText($nodeToScore);

        // Skip nodes whose parent is missing or is not an element (e.g. the document itself).
        if (!$parentNode || !isset($parentNode->tagName)) {
            continue;
        }

        /* If this paragraph is less than 25 characters, don't even count it. */
        if (strlen($innerText) < 25) {
            continue;
        }

        /* Initialize readability data for the parent. */
        if (!$parentNode->hasAttribute('readability'))
        {
            $this->initializeNode($parentNode);
            $candidates[] = $parentNode;
        }

        /* Initialize readability data for the grandparent. */
        if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
        {
            $this->initializeNode($grandParentNode);
            $candidates[] = $grandParentNode;
        }

        /* Add a point for the paragraph itself as a base. */
        $contentScore = 1;

        /* One point per comma-separated segment, i.e. number of commas + 1. */
        $contentScore += count(explode(',', $innerText));

        /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
        $contentScore += min(floor(strlen($innerText) / 100), 3);

        /* Add the score to the parent. The grandparent gets half. */
        $parentNode->getAttributeNode('readability')->value += $contentScore;

        if ($grandParentNode) {
            $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
        }
    }

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    $topCandidate = null;
    foreach ($candidates as $candidate)
    {
        /**
         * Scale the final candidates score based on link density. Good content should have a
         * relatively small link density (5% or less) and be mostly unaffected by this operation.
         **/
        $readability = $candidate->getAttributeNode('readability');
        $readability->value = $readability->value * (1-$this->getLinkDensity($candidate));

        $this->dbg('Candidate: ' . $candidate->tagName . ' (' . $candidate->getAttribute('class') . ':' . $candidate->getAttribute('id') . ') with score ' . $readability->value);

        // Scores are fractional (half-points for grandparents, link-density scaling),
        // so they are compared as floats; an integer cast would let a lower score
        // displace the current leader (e.g. 10.5 > (int)10.9).
        if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
            $topCandidate = $candidate;
        }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
    {
        $topCandidate = $this->dom->createElement('div');
        if ($page instanceof DOMDocument) {
            if (!isset($page->documentElement)) {
                // we don't have a body either? what a mess! :)
            } else {
                $topCandidate->innerHTML = $page->documentElement->innerHTML;
                $page->documentElement->innerHTML = '';
                $page->documentElement->appendChild($topCandidate);
            }
        } else {
            $topCandidate->innerHTML = $page->innerHTML;
            $page->innerHTML = '';
            $page->appendChild($topCandidate);
        }
        $this->initializeNode($topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    $articleContent = $this->dom->createElement('div');
    $articleContent->setAttribute('id', 'readability-content');
    $topScore = (float)$topCandidate->getAttribute('readability');
    $siblingScoreThreshold = max(10, $topScore * 0.2);

    // The fallback <div> above may never have been attached (document without a
    // root element); in that case there are simply no siblings to inspect.
    $topParent = $topCandidate->parentNode;
    $siblingNodes = $topParent ? $topParent->childNodes : null;
    $sl = ($siblingNodes !== null) ? $siblingNodes->length : 0;

    for ($s=0; $s < $sl; $s++)
    {
        $siblingNode = $siblingNodes->item($s);
        $append = false;

        $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));

        if ($siblingNode === $topCandidate)
        {
            $append = true;
        }

        $contentBonus = 0;
        /* Give a bonus if sibling nodes and top candidates have the exact same classname */
        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
            $contentBonus += $topScore * 0.2;
        }

        if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((float)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
        {
            $append = true;
        }

        if (strtoupper($siblingNode->nodeName) == 'P') {
            $linkDensity = $this->getLinkDensity($siblingNode);
            $nodeContent = $this->getInnerText($siblingNode);
            $nodeLength = strlen($nodeContent);

            if ($nodeLength > 80 && $linkDensity < 0.25)
            {
                $append = true;
            }
            else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
            {
                $append = true;
            }
        }

        if ($append)
        {
            $this->dbg('Appending node: ' . $siblingNode->nodeName);

            $nodeToAppend = null;
            $sibNodeName = strtoupper($siblingNode->nodeName);
            if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
                /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
                $nodeToAppend = $this->dom->createElement('div');
                try {
                    $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
                    $nodeToAppend->innerHTML = $siblingNode->innerHTML;
                }
                catch(Exception $e)
                {
                    $this->dbg('Could not alter siblingNode to div, reverting back to original.');
                    $nodeToAppend = $siblingNode;
                    $s--;
                    $sl--;
                }
            } else {
                // Moving the sibling itself shrinks the live sibling list by one.
                $nodeToAppend = $siblingNode;
                $s--;
                $sl--;
            }

            /* To ensure a node does not interfere with readability styles, remove its classnames */
            $nodeToAppend->removeAttribute('class');

            /* Append sibling and subtract from our list because it removes the node when you append to another node */
            $articleContent->appendChild($nodeToAppend);
        }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    $this->prepArticle($articleContent);

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (strlen($this->getInnerText($articleContent, false)) < 250)
    {
        // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
        // in the meantime, we check and create an empty element if it's not there.
        if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
        $this->body->innerHTML = $this->bodyCache;

        // Relax one flag per retry, in a fixed order, until nothing is left to relax.
        if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
            $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
            $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
            return $this->grabArticle($this->body);
        }
        else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
            $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
            return $this->grabArticle($this->body);
        }
        else {
            return false;
        }
    }
    return $articleContent;
}
818
819 /**
820 * Remove script tags from document
821 *
822 * @param DOMElement
823 * @return void
824 */
public function removeScripts($doc) {
    $scriptNodes = $doc->getElementsByTagName('script');

    // Count down from the end: the list is live, so removing from the back
    // never disturbs the indexes that are still to be visited.
    $position = $scriptNodes->length;
    while (--$position >= 0) {
        $scriptNode = $scriptNodes->item($position);
        $scriptNode->parentNode->removeChild($scriptNode);
    }
}
832
833 /**
834 * Get the inner text of a node.
835 * This also strips out any excess whitespace to be found.
836 *
837 * @param DOMElement $
838 * @param boolean $normalizeSpaces (default: true)
839 * @return string
840 **/
public function getInnerText($e, $normalizeSpaces=true) {
    // Nodes without a textContent property, or with empty text, yield ''.
    $hasText = isset($e->textContent) && $e->textContent != '';
    if (!$hasText) {
        return '';
    }

    $trimmed = trim($e->textContent);

    // Optionally collapse whitespace using the class-wide 'normalize' pattern.
    return $normalizeSpaces
        ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
        : $trimmed;
}
856
857 /**
858 * Get the number of times a string $s appears in the node $e.
859 *
860 * @param DOMElement $e
861 * @param string - what to count. Default is ","
862 * @return number (integer)
863 **/
public function getCharCount($e, $s=',') {
    // Count against the normalized inner text of the node.
    $nodeText = $this->getInnerText($e);
    return substr_count($nodeText, $s);
}
867
868 /**
869 * Remove the style attribute on every $e and under.
870 *
871 * @param DOMElement $e
872 * @return void
873 */
public function cleanStyles($e) {
    if (is_object($e)) {
        // Only descendants are visited; $e's own style attribute is left as is.
        $descendants = $e->getElementsByTagName('*');
        for ($idx = 0, $total = $descendants->length; $idx < $total; $idx++) {
            $descendants->item($idx)->removeAttribute('style');
        }
    }
}
881
882 /**
883 * Get the density of links as a percentage of the content
884 * This is the amount of text that is inside a link divided by the total text in the node.
885 *
886 * @param DOMElement $e
887 * @return number (float)
888 */
public function getLinkDensity($e) {
    $textLength = strlen($this->getInnerText($e));

    // Sum the text length of every <a> below $e.
    $linkLength = 0;
    foreach ($e->getElementsByTagName('a') as $anchor) {
        $linkLength += strlen($this->getInnerText($anchor));
    }

    // No text at all: report integer zero rather than dividing by zero.
    if ($textLength <= 0) {
        return 0;
    }
    return $linkLength / $textLength;
}
903
904 /**
905 * Get an elements class/id weight. Uses regular expressions to tell if this
906 * element looks good or bad.
907 *
908 * @param DOMElement $e
909 * @return number (Integer)
910 */
public function getClassWeight($e) {
    if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
        return 0;
    }

    $weight = 0;

    // The class name and the id are judged the same way: 25 points off for a
    // 'negative' match, 25 points on for a 'positive' match (both may apply).
    foreach (array('class', 'id') as $attributeName) {
        if (!$e->hasAttribute($attributeName)) {
            continue;
        }
        $attributeValue = $e->getAttribute($attributeName);
        if ($attributeValue == '') {
            continue;
        }

        if (preg_match($this->regexps['negative'], $attributeValue)) {
            $weight -= 25;
        }
        if (preg_match($this->regexps['positive'], $attributeValue)) {
            $weight += 25;
        }
    }

    return $weight;
}
941
942 /**
943 * Remove extraneous break tags from a node.
944 *
945 * @param DOMElement $node
946 * @return void
947 */
public function killBreaks($node) {
    // Rewrite the node's markup with every 'killBreaks' match replaced by one <br />.
    $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
}
953
954 /**
955 * Clean a node of all elements of type "tag".
956 * (Unless it's a youtube/vimeo video. People love movies.)
957 *
958 * Updated 2012-09-18 to preserve youtube/vimeo iframes
959 *
960 * @param DOMElement $e
961 * @param string $tag
962 * @return void
963 */
public function clean($e, $tag) {
    $targetList = $e->getElementsByTagName($tag);
    $isEmbed = in_array($tag, array('iframe', 'object', 'embed'));

    // Walk backwards so removals do not shift the entries still to be visited.
    for ($y = $targetList->length - 1; $y >= 0; $y--) {
        $target = $targetList->item($y);

        /* Allow youtube and vimeo videos through as people usually want to see those. */
        if ($isEmbed) {
            $attributeValues = '';
            foreach ($target->attributes as $attribute) {
                $attributeValues .= $attribute->value . '|';
            }

            // Keep the element when either its own attributes or its inner markup
            // match the video pattern.
            $looksLikeVideo = preg_match($this->regexps['video'], $attributeValues)
                || preg_match($this->regexps['video'], $target->innerHTML);
            if ($looksLikeVideo) {
                continue;
            }
        }

        $target->parentNode->removeChild($target);
    }
}
989
990 /**
991 * Clean an element of all tags of type "tag" if they look fishy.
992 * "Fishy" is an algorithm based on content length, classnames,
993 * link density, number of images & embeds, etc.
994 *
995 * @param DOMElement $e
996 * @param string $tag
997 * @return void
998 */
public function cleanConditionally($e, $tag) {
    if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
        return;
    }

    $tagsList = $e->getElementsByTagName($tag);

    /**
     * Traverse backwards so nodes can be removed without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     */
    for ($i = $tagsList->length - 1; $i >= 0; $i--) {
        $node = $tagsList->item($i);
        $weight = $this->getClassWeight($node);
        $hasScore = $node->hasAttribute('readability');
        $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;

        $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));

        // A negative combined score is enough to drop the element outright.
        if ($weight + $contentScore < 0) {
            $node->parentNode->removeChild($node);
            continue;
        }

        // Elements with plenty of commas are kept without further inspection.
        if ($this->getCharCount($node, ',') >= 10) {
            continue;
        }

        /**
         * Not very many commas: gather counts for typical embedded elements and
         * remove the element if they look ominous.
         **/
        $p     = $node->getElementsByTagName('p')->length;
        $img   = $node->getElementsByTagName('img')->length;
        $li    = $node->getElementsByTagName('li')->length-100;
        $input = $node->getElementsByTagName('input')->length;
        $a     = $node->getElementsByTagName('a')->length;

        // Only <embed>/<iframe> elements whose src matches the video pattern are counted.
        $embedCount = 0;
        foreach (array('embed', 'iframe') as $embedTag) {
            foreach ($node->getElementsByTagName($embedTag) as $embed) {
                if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
                    $embedCount++;
                }
            }
        }

        $linkDensity = $this->getLinkDensity($node);
        $contentLength = strlen($this->getInnerText($node));

        // First matching rule wins; $reason stays null when the element is to be kept.
        $reason = null;
        if ($this->lightClean) {
            $this->dbg('Light clean...');
            if ( ($img > $p) && ($img > 4) ) {
                $reason = ' more than 4 images and more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ( $input > floor($p/3) ) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
                $reason = ' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
                $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
            } else if ($embedCount > 3) {
                $reason = ' more than 3 embeds';
            }
        } else {
            $this->dbg('Standard clean...');
            if ( $img > $p ) {
                $reason = ' more image elements than paragraph elements';
            } else if ($li > $p && $tag != 'ul' && $tag != 'ol') {
                $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
            } else if ( $input > floor($p/3) ) {
                $reason = ' too many <input> elements';
            } else if ($contentLength < 25 && ($img === 0 || $img > 2) ) {
                $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
            } else if ($weight < 25 && $linkDensity > 0.2) {
                $reason = ' weight smaller than 25 and link density above 0.2';
            } else if ($weight >= 25 && $linkDensity > 0.5) {
                $reason = ' weight above 25 but link density greater than 0.5';
            } else if (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
                $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
            }
        }

        if ($reason !== null) {
            $this->dbg($reason);
            $node->parentNode->removeChild($node);
        }
    }
}
1108
1109 /**
1110 * Clean out spurious headers from an Element. Checks things like classnames and link density.
1111 *
1112 * @param DOMElement $e
1113 * @return void
1114 */
public function cleanHeaders($e) {
    // Only first- and second-level headings are examined.
    foreach (array('h1', 'h2') as $headerTag) {
        $headers = $e->getElementsByTagName($headerTag);

        // Reverse order keeps the live list stable while headers are removed.
        for ($i = $headers->length - 1; $i >= 0; $i--) {
            $header = $headers->item($i);
            $looksSpurious = $this->getClassWeight($header) < 0
                || $this->getLinkDensity($header) > 0.33;
            if ($looksSpurious) {
                $header->parentNode->removeChild($header);
            }
        }
    }
}
1125
public function flagIsActive($flag) {
    // True when the masked bits of the current flag set are positive.
    $maskedBits = $this->flags & $flag;
    return $maskedBits > 0;
}
1129
public function addFlag($flag) {
    // Switch the given bit(s) on, leaving the others untouched.
    $this->flags |= $flag;
}
1133
public function removeFlag($flag) {
    // Switch the given bit(s) off, leaving the others untouched.
    $this->flags &= ~$flag;
}
1137} \ No newline at end of file
diff --git a/inc/3rdparty/Session.class.php b/inc/3rdparty/Session.class.php
new file mode 100644
index 00000000..eff924cc
--- /dev/null
+++ b/inc/3rdparty/Session.class.php
@@ -0,0 +1,136 @@
1<?php
2/**
3 * Session management class
4 * http://www.developpez.net/forums/d51943/php/langage/sessions/
5 * http://sebsauvage.net/wiki/doku.php?id=php:session
6 * http://sebsauvage.net/wiki/doku.php?id=php:shaarli
7 *
8 * Features:
9 * - Everything is stored on server-side (we do not trust client-side data,
10 * such as cookie expiration)
11 * - IP addresses + user agent are checked on each access to prevent session
12 * cookie hijacking (such as Firesheep)
13 * - Session expires on user inactivity (Session expiration date is
14 * automatically updated everytime the user accesses a page.)
15 * - A unique secret key is generated on server-side for this session
16 * (and never sent over the wire) which can be used
17 * to sign forms (HMAC) (See $_SESSION['uid'] )
18 * - Token management to prevent XSRF attacks.
19 *
20 * TODO:
21 * - log login fail
22 * - prevent brute force (ban IP)
23 *
24 * HOWTOUSE:
25 * - Just call Session::init(); to initialize session and
26 * check if connected with Session::isLogged()
27 */
28
class Session
{
    // If the user does not access any page within this time,
    // his/her session is considered expired (in seconds).
    public static $inactivity_timeout = 3600;
    private static $_instance;

    /**
     * Configure cookie-only sessions and start the session if none is active.
     */
    private function __construct()
    {
        // Use cookies to store session.
        ini_set('session.use_cookies', '1');
        // Force cookies for session (phpsessionID forbidden in URL)
        ini_set('session.use_only_cookies', '1');
        if (!session_id()) {
            // Prevent php to use sessionID in URL if cookies are disabled.
            ini_set('session.use_trans_sid', '0');
            // session_start() takes no name argument (PHP 7+ expects an options
            // array there); the session name has to be set through session_name().
            session_name('poche');
            session_start();
        }
    }

    /**
     * Initialize the session once per request.
     *
     * @return void
     */
    public static function init()
    {
        if (!isset(self::$_instance)) {
            self::$_instance = new self();
        }
    }

    /**
     * Fingerprint of the client: IP address(es), user agent and language.
     * (Used to prevent session cookie hijacking.)
     *
     * Headers the client did not send contribute an empty string.
     *
     * @return string sha1 hex digest
     */
    private static function _allInfos()
    {
        $infos = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
        if (isset($_SERVER['HTTP_X_FORWARDED_FOR'])) {
            $infos .= $_SERVER['HTTP_X_FORWARDED_FOR'];
        }
        if (isset($_SERVER['HTTP_CLIENT_IP'])) {
            $infos .= '_' . $_SERVER['HTTP_CLIENT_IP'];
        }
        $infos .= '_' . (isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '');
        $infos .= '_' . (isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) ? $_SERVER['HTTP_ACCEPT_LANGUAGE'] : '');
        return sha1($infos);
    }

    /**
     * Compare two values as strings, without PHP's loose numeric juggling
     * (e.g. "0e1234" == "0e5678"). Uses hash_equals() when the runtime has it.
     *
     * @param mixed $expected
     * @param mixed $actual
     * @return boolean
     */
    private static function _safeEquals($expected, $actual)
    {
        $expected = (string) $expected;
        $actual = (string) $actual;
        if (function_exists('hash_equals')) {
            return hash_equals($expected, $actual);
        }
        return $expected === $actual;
    }

    /**
     * Produce a 40 character hexadecimal secret. Prefers random_bytes() and
     * falls back to the uniqid()/mt_rand() scheme on runtimes without it.
     *
     * @return string
     */
    private static function _randomHash()
    {
        if (function_exists('random_bytes')) {
            return bin2hex(random_bytes(20));
        }
        return sha1(uniqid('', true) . '_' . mt_rand());
    }

    /**
     * Check that user/password is correct and init some SESSION variables.
     *
     * @param string $login          submitted login
     * @param string $password       submitted password
     * @param string $login_test     expected login
     * @param string $password_test  expected password
     * @param array  $pValues        extra key/value pairs stored in the session
     *                               once the credentials have been accepted
     * @return boolean true when the credentials match
     */
    public static function login($login,$password,$login_test,$password_test,
        $pValues = array())
    {
        // Evaluate both comparisons so a wrong login is not rejected faster
        // than a wrong password.
        $loginOk = self::_safeEquals($login_test, $login);
        $passwordOk = self::_safeEquals($password_test, $password);
        if (!$loginOk || !$passwordOk) {
            return false;
        }

        // Caller-supplied values are only stored for an authenticated user.
        foreach ($pValues as $key => $value) {
            $_SESSION[$key] = $value;
        }
        // generate unique random number to sign forms (HMAC)
        $_SESSION['uid'] = self::_randomHash();
        $_SESSION['info'] = self::_allInfos();
        $_SESSION['username'] = $login;
        // Set session expiration.
        $_SESSION['expires_on'] = time() + self::$inactivity_timeout;
        return true;
    }

    /**
     * Force logout: drop every session key this class manages.
     *
     * @return void
     */
    public static function logout()
    {
        unset(
            $_SESSION['uid'],
            $_SESSION['info'],
            $_SESSION['expires_on'],
            $_SESSION['tokens'],
            $_SESSION['login'],
            $_SESSION['pass'],
            $_SESSION['username']
        );
    }

    /**
     * Make sure user is logged in. A missing key, a changed client
     * fingerprint or an expired session logs the user out.
     *
     * @return boolean
     */
    public static function isLogged()
    {
        if (!isset($_SESSION['uid'], $_SESSION['info'], $_SESSION['expires_on'])
            || $_SESSION['info'] !== self::_allInfos()
            || time() >= $_SESSION['expires_on']) {
            self::logout();
            return false;
        }
        // User accessed a page : Update his/her session expiration date.
        $_SESSION['expires_on'] = time() + self::$inactivity_timeout;
        return true;
    }

    /**
     * Returns a new one-time token, remembered on the server side.
     *
     * @return string
     */
    public static function getToken()
    {
        if (!isset($_SESSION['tokens'])) {
            $_SESSION['tokens'] = array();
        }
        $rnd = self::_randomHash();
        $_SESSION['tokens'][$rnd] = 1;
        return $rnd;
    }

    /**
     * Tells if a token is ok. Using this function will destroy the token.
     *
     * @param mixed $token value received from the client
     * @return boolean true if token is ok; false if wrong, already used, or not a string
     */
    public static function isToken($token)
    {
        // Request parameters can be arrays; those are never valid tokens.
        if (!is_string($token)) {
            return false;
        }
        if (isset($_SESSION['tokens'][$token])) {
            unset($_SESSION['tokens'][$token]); // Token is used: destroy it.
            return true; // Token is ok.
        }
        return false; // Wrong token, or already used.
    }
}
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
new file mode 100644
index 00000000..43b94e57
--- /dev/null
+++ b/inc/3rdparty/simple_html_dom.php
@@ -0,0 +1,1722 @@
1<?php
2/**
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6 * Contributions by:
7 * Yousuke Kumakura (Attribute filters)
8 * Vadim Voituk (Negative indexes supports of "find" method)
9 * Antcs (Constructor with automatically load contents either text or file/url)
10 *
11 * all affected sections have comments starting with "PaperG"
12 *
13 * Paperg - Added case insensitive testing of the value of the selector.
14 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15 * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls so it will not reflect the REAL position of the tag in the source,
16 * it will almost always be smaller by some amount.
17 * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
18 * but for most purposes, it's a really good estimation.
19 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 * Allow the user to tell us how much they trust the html.
21 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 * This allows for us to find tags based on the text they contain.
23 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 * Paperg: added parse_charset so that we know about the character set of the source document.
25 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27 *
28 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30 *
31 * Licensed under The MIT License
32 * Redistributions of files must retain the above copyright notice.
33 *
34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick
36 * @author Rus Carroll
37 * @version 1.5 ($Rev: 202 $)
38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom
40 */
41
42/**
43 * All of the Defines for the classes below.
44 * @author S.C. Chen <me578022@gmail.com>
45 */
// Node type identifiers; simple_html_dom_node::$nodetype holds one of these.
46define('HDOM_TYPE_ELEMENT', 1);
47define('HDOM_TYPE_COMMENT', 2);
48define('HDOM_TYPE_TEXT', 3);
49define('HDOM_TYPE_ENDTAG', 4);
50define('HDOM_TYPE_ROOT', 5);
51define('HDOM_TYPE_UNKNOWN', 6);
// Quoting style recorded per attribute (read back by makeup() when rendering a tag).
52define('HDOM_QUOTE_DOUBLE', 0);
53define('HDOM_QUOTE_SINGLE', 1);
54define('HDOM_QUOTE_NO', 3);
// Keys of the per-node "info" array simple_html_dom_node::$_.
55define('HDOM_INFO_BEGIN', 0);
56define('HDOM_INFO_END', 1);
57define('HDOM_INFO_QUOTE', 2);
58define('HDOM_INFO_SPACE', 3);
59define('HDOM_INFO_TEXT', 4);
60define('HDOM_INFO_INNER', 5);
61define('HDOM_INFO_OUTER', 6);
62define('HDOM_INFO_ENDSPACE',7);
// Default values for the optional parameters of the helper functions and the dom constructor.
63define('DEFAULT_TARGET_CHARSET', 'UTF-8');
64define('DEFAULT_BR_TEXT', "\r\n");
65define('DEFAULT_SPAN_TEXT', " ");
// Upper bound, in bytes, on the content file_get_html() / str_get_html() will parse.
66define('MAX_FILE_SIZE', 600000);
67// helper functions
68// -----------------------------------------------------------------------------
69// get html dom from file
70// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Build a DOM from the contents of a file or URL.
 *
 * @param string   $url              file path or URL handed to file_get_contents()
 * @param bool     $use_include_path forwarded to file_get_contents()
 * @param resource $context          stream context forwarded to file_get_contents()
 * @param int      $offset           start offset; the legacy default -1 means "from the beginning"
 * @param int      $maxLen           accepted for signature compatibility; currently not used
 * @param bool     $lowercase        forwarded to the dom constructor and load()
 * @param bool     $forceTagsClosed  forwarded to the dom constructor
 * @param string   $target_charset   forwarded to the dom constructor
 * @param bool     $stripRN          forwarded to the dom constructor and load()
 * @param string   $defaultBRText    forwarded to the dom constructor and load()
 * @param string   $defaultSpanText  forwarded to the dom constructor and load()
 * @return simple_html_dom|false the loaded DOM, or false when the content is
 *                               empty, unreadable or larger than MAX_FILE_SIZE
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // -1 is the historical "no offset" sentinel. Since PHP 7.1 a negative
    // offset makes file_get_contents() seek from the end of the stream (and
    // fail on non-seekable remote streams), so translate the sentinel to 0.
    $startOffset = ($offset == -1) ? 0 : $offset;

    $contents = file_get_contents($url, $use_include_path, $context, $startOffset);
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        return false;
    }

    // We DO force the tags to be terminated (unless the caller says otherwise).
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    // The constructor only uses the BR/span replacement texts when it is given
    // a string to load, so they must be passed to load() explicitly or the
    // caller's values are silently replaced by the defaults.
    $dom->load($contents, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}
87
88// get html dom from string
/**
 * Build a DOM from an HTML string.
 *
 * @param string $str             HTML source
 * @param bool   $lowercase       forwarded to the dom constructor and load()
 * @param bool   $forceTagsClosed forwarded to the dom constructor
 * @param string $target_charset  forwarded to the dom constructor
 * @param bool   $stripRN         forwarded to the dom constructor and load()
 * @param string $defaultBRText   forwarded to the dom constructor and load()
 * @param string $defaultSpanText forwarded to the dom constructor and load()
 * @return simple_html_dom|false the loaded DOM, or false when $str is empty
 *                               or larger than MAX_FILE_SIZE
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Reject unusable input before allocating a parser.
    if (empty($str) || strlen($str) > MAX_FILE_SIZE)
    {
        return false;
    }

    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    // The constructor only uses the BR/span replacement texts when it is given
    // a string to load, so pass them to load() or the caller's values are lost.
    $dom->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
    return $dom;
}
100
101// dump html dom tree
/**
 * Dump the tree rooted at $node by delegating to the node's dump() method.
 *
 * @param object $node      node exposing dump($show_attr, $deep)
 * @param bool   $show_attr whether attributes are printed
 * @param int    $deep      starting indentation depth
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; previously the node itself was passed as
    // $show_attr and both options were ignored.
    $node->dump($show_attr, $deep);
}
106
107
108/**
109 * simple html dom node
110 * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 * PaperG - added $tag_start to track the start position of the tag in the total byte index
112 *
113 * @package PlaceLocalInclude
114 */
115class simple_html_dom_node
116{
117 public $nodetype = HDOM_TYPE_TEXT;
118 public $tag = 'text';
119 public $attr = array();
120 public $children = array();
121 public $nodes = array();
122 public $parent = null;
123 // The "info" array - see HDOM_INFO_... for what each element contains.
124 public $_ = array();
125 public $tag_start = 0;
126 private $dom = null;
127
/**
 * Create a node and register it in the owning dom's flat node list.
 *
 * @param object $dom owning dom; must expose a public $nodes array
 */
function __construct($dom)
{
    // Register first, then remember the owner; the two steps are independent.
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133
/**
 * Release the node's references when it is destroyed.
 */
function __destruct()
{
    // Delegates to clear() so the cleanup logic lives in one place.
    $this->clear();
}
138
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    $rendered = $this->outertext();
    return $rendered;
}
143
144 // clean up memory due to php5 circular references memory leak...
/**
 * Drop the references to the dom, parent, child and node lists
 * (clean up memory due to the php5 circular references memory leak).
 */
function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
152
153 // dump node's tree
/**
 * Echo this node's tag (optionally with its attributes), then recurse into
 * $this->nodes with the depth increased by one.
 *
 * @param bool $show_attr whether attributes are printed
 * @param int  $deep      current depth, used to build the leading padding
 */
function dump($show_attr=true, $deep=0)
{
    echo str_repeat(' ', $deep).$this->tag;

    $printAttributes = $show_attr && count($this->attr)>0;
    if ($printAttributes)
    {
        echo '(';
        foreach ($this->attr as $attrName=>$unused)
        {
            // Property access goes through __get(), as in the original.
            echo "[$attrName]=>\"".$this->$attrName.'", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes)
    {
        return;
    }
    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $deep+1);
    }
}
176
177
178 // Debugging function to dump a single dom node with a bunch of information about it.
/**
 * Debugging helper: describe this single node (tag, attributes, info array,
 * text, inner info, child/node counts and tag_start) on one line.
 *
 * @param bool $echo when true the description is echoed and nothing is
 *                   returned; when false it is returned as a string
 * @return string|null
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read the info array of THIS node; the previous code consulted an
    // undefined $node variable and therefore always reported NULL.
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }

    return $string;
}
242
243 // returns the parent of node
244 // If a node is passed in, it will reset the parent of the current node to that one.
/**
 * Get the parent node, optionally re-parenting this node first.
 *
 * When $parent is given, this node is appended to the new parent's nodes and
 * children lists. It is NOT removed from the lists of its previous parent.
 *
 * @param simple_html_dom_node|null $parent new parent, or null to only read
 * @return simple_html_dom_node|null the (possibly new) parent
 */
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $this->parent;
}
258
259 // verify that node has children
/**
 * Tell whether the node has at least one child.
 *
 * @return bool
 */
function has_child()
{
    // A non-empty children array is truthy; an empty array or null is not.
    return (bool) $this->children;
}
264
265 // returns children of node
/**
 * Get all children, or a single child by index.
 *
 * @param int $idx -1 (default) for the whole children list, otherwise an index
 * @return mixed the children list, the child at $idx, or null when absent
 */
function children($idx=-1)
{
    if ($idx === -1)
    {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
275
276 // returns the first child of node
/**
 * Get the first child of the node.
 *
 * @return simple_html_dom_node|null null when there are no children
 */
function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
285
286 // returns the last child of node
/**
 * Get the last child of the node.
 *
 * @return simple_html_dom_node|null null when there are no children
 */
function last_child()
{
    $total = count($this->children);
    if ($total > 0)
    {
        return $this->children[$total - 1];
    }
    return null;
}
295
296 // returns the next sibling of node
/**
 * Get the node that follows this one in the parent's children list.
 *
 * @return simple_html_dom_node|null null when there is no parent, no later
 *                                   sibling, or this node is not listed
 */
function next_sibling()
{
    if ($this->parent === null)
    {
        return null;
    }

    $siblings = $this->parent->children;
    $total = count($siblings);

    // Locate this node by identity; $pos ends at $total when it is not listed.
    for ($pos = 0; $pos < $total; ++$pos)
    {
        if ($siblings[$pos] === $this)
        {
            break;
        }
    }

    $next = $pos + 1;
    return ($next < $total) ? $siblings[$next] : null;
}
316
317 // returns the previous sibling of node
/**
 * Get the node that precedes this one in the parent's children list.
 *
 * @return simple_html_dom_node|null null when there is no parent, this node
 *                                   is the first child, or it is not listed
 */
function prev_sibling()
{
    if ($this->parent===null) return null;

    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;

    // Not listed among the parent's children: there is no defined previous
    // sibling. Without this check the parent's LAST child was returned,
    // unlike next_sibling(), which yields null in the same situation.
    if ($idx>=$count) return null;

    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}
328
329 // function to locate a specific ancestor tag in the path to the root.
/**
 * Walk from this node towards the root and return the first node (this node
 * included) whose tag equals $tag.
 *
 * @param string $tag tag name to look for
 * @return simple_html_dom_node|null the matching node, or null when the walk
 *                                   runs out of parents
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    // Start with ourselves, then climb one parent per iteration.
    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            break;
        }
    }

    return $candidate;
}
350
351 // get dom node's inner html
/**
 * Get the node's inner html.
 *
 * An explicitly stored inner value wins, then the stored raw text (with
 * noise restored by the dom); otherwise the outer text of every entry in
 * $this->nodes is concatenated.
 *
 * @return string
 */
function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER]))
    {
        return $info[HDOM_INFO_INNER];
    }
    if (isset($info[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child)
    {
        $html .= $child->outertext();
    }
    return $html;
}
362
363 // get dom node's outer text (with tag)
/**
 * Get the node's outer text (the node rendered with its tag).
 *
 * Order of precedence: the root node renders as its inner text; otherwise the
 * dom callback (if any) is invoked, then an explicitly stored outer value or
 * the stored raw text is returned; failing that the begin tag, the inner
 * content and the end tag are assembled.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text')
        {
            if (!empty($this->text))
            {
                $text = " with text: " . $this->text;
            }
        }
        $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    // Guard the lookups: a node without a recorded begin index (or whose
    // index is no longer present in the dom) renders an empty begin tag
    // instead of raising an undefined-index notice.
    $ret = "";
    if ($this->dom
        && isset($this->_[HDOM_INFO_BEGIN])
        && !empty($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]))
    {
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
        if ($this->tag != "br")
        {
            $ret .= $this->_[HDOM_INFO_INNER];
        }
    } else {
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->outertext());
            }
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
        $ret .= '</'.$this->tag.'>';
    return $ret;
}
422
423 // get dom node's plain text
/**
 * Get the node's plain text.
 *
 * A stored inner value is returned as is. Text nodes return their raw text
 * with noise restored; comment and unknown nodes, as well as script and style
 * tags, return an empty string. Any other node concatenates the (converted)
 * text of $this->nodes, and span tags append the dom's default span text.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // $this->nodes has been observed to be NULL for some element nodes;
    // such nodes contribute no text at all.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child)
    {
        $plain .= $this->convert_text($child->text());
    }

    // Keep consecutive spans from running into each other in plain text.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
457
/**
 * Get the inner text with CDATA section markers removed.
 *
 * @return string
 */
function xmltext()
{
    // The opening marker is matched case-insensitively, the closing one exactly.
    $withoutOpen = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $withoutOpen);
}
465
466 // build node's text with tag
/**
 * Build the node's begin tag text, including its attributes.
 *
 * Text-like nodes (those carrying stored raw text) return that text with
 * noise restored. For elements, each attribute is rendered with the spacing
 * and quoting recorded for its position in the info array; attributes whose
 * value is null or false are treated as removed and skipped.
 *
 * @return string
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<'.$this->tag;
    $position = 0;

    foreach ($this->attr as $name=>$value)
    {
        // Positions count every attribute, including removed ones.
        $i = $position++;

        // skip removed attribute
        if ($value===null || $value===false)
        {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$i][0];

        // no value attr: nowrap, checked selected...
        if ($value===true)
        {
            $markup .= $name;
            continue;
        }

        $quoteType = $this->_[HDOM_INFO_QUOTE][$i];
        if ($quoteType == HDOM_QUOTE_DOUBLE)
        {
            $quote = '"';
        }
        elseif ($quoteType == HDOM_QUOTE_SINGLE)
        {
            $quote = '\'';
        }
        else
        {
            $quote = '';
        }

        $markup .= $name.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$value.$quote;
    }

    $markup = $this->dom->restore_noise($markup);
    return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
500
501 // find elements by css selector
502 //PaperG - added ability for find to lowercase the value of the selector.
/**
 * Find elements below this node by css selector.
 *
 * @param string   $selector  selector string; comma-separated alternatives
 *                            are evaluated independently and merged
 * @param int|null $idx       null for all matches, otherwise the index of the
 *                            wanted match (negative counts from the end)
 * @param bool     $lowercase forwarded to seek() for case-insensitive value tests
 * @return mixed array of nodes when $idx is null, else a node or null
 */
function find($selector, $idx=null, $lowercase=false)
{
    $alternatives = $this->parse_selector($selector);
    if (count($alternatives)===0) return array();

    $found_keys = array();

    // find each selector
    foreach ($alternatives as $chain)
    {
        if (count($chain)===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        $head = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors, no recursive!
        foreach ($chain as $step)
        {
            $matched = array();
            foreach ($head as $nodeIndex=>$unused)
            {
                $start = ($nodeIndex===-1) ? $this->dom->root : $this->dom->nodes[$nodeIndex];
                // seek() fills $matched by reference.
                $start->seek($step, $matched, $lowercase);
            }
            $head = $matched;
        }

        // Union of keys; keys already present keep their entry.
        $found_keys += $head;
    }

    // sort keys
    ksort($found_keys);

    $found = array();
    foreach (array_keys($found_keys) as $nodeIndex)
    {
        $found[] = $this->dom->nodes[$nodeIndex];
    }

    // return nth-element or array
    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
551
552 // seek for given conditions
553 // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
/**
 * Collect the dom node indexes below this node that satisfy one selector step.
 *
 * @param array $selector  tuple (tag, key, val, exp, no_key) as produced by
 *                         parse_selector()
 * @param array $ret       filled by reference: matching node index => 1
 * @param bool  $lowercase when true, values are compared case-insensitively
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        // Test for null BEFORE looking at the parent's info array.
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk may run out of ancestors; only a real ancestor has an end
        // index to add (previously a null parent was dereferenced here).
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
            if ($key == "plaintext") {
                // $node->plaintext actually returns $node->text();
                $nodeKeyValue = $node->text();
            } else {
                // this is a normal search, we want the value of that attribute of the tag.
                // A "[!attr=val]" step reaches this point with the attribute
                // absent; compare against an empty string instead of reading
                // an undefined index.
                $nodeKeyValue = isset($node->attr[$key]) ? $node->attr[$key] : '';
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$nodeKeyValue) as $k) {
                    // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // It's passed by reference so this is actually what this function returns.
    if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
}
650
/**
 * Test a value against a selector pattern using the given operator.
 *
 * Supported operators: '=' (identical), '!=' (not identical), '^=' (starts
 * with), '$=' (ends with) and '*=' (regular expression; a pattern that
 * already starts with '/' is used verbatim, otherwise it is wrapped in
 * case-insensitive delimiters).
 *
 * @param string $exp     operator
 * @param string $pattern selector value
 * @param string $value   node value under test
 * @return mixed bool for the identity operators, the preg_match() result for
 *               the pattern operators, false for an unknown operator
 */
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    if ($exp == '=') {
        return ($value===$pattern);
    }
    if ($exp == '!=') {
        return ($value!==$pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern,'/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern,'/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }

    return false;
}
672
/**
 * Parse a css selector string into a list of selector chains.
 *
 * Each comma-separated alternative becomes one chain; each chain is a list of
 * steps of the form array(tag, key, val, exp, no_key), which is the shape
 * seek() consumes.
 *
 * @param string $selector_string selector to parse
 * @return array list of chains
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    // Pattern of CSS selectors, modified from mootools. The attribute name
    // class includes the colon so that <tag attr:ibute="something"> is found;
    // such attributes must be read with getAttribute() because $dom->x:y is
    // not valid php syntax. An optional leading @ inside the brackets is
    // accepted but not captured.
    $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $chain = array();

    foreach ($matches as $m) {
        $whole = trim($m[0]);
        if ($whole==='' || $whole==='/' || $whole==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        $tag = $m[1];
        $key = null;
        $val = null;
        $exp = '=';
        $no_key = false;

        // Later groups override earlier ones, in this order.
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            $key = strtolower($key);
        }

        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $chain[] = array($tag, $key, $val, $exp, $no_key);

        // A comma closes the current alternative.
        if (trim($m[7])===',') {
            $selectors[] = $chain;
            $chain = array();
        }
    }

    if (count($chain)>0) {
        $selectors[] = $chain;
    }
    return $selectors;
}
720
/**
 * Magic read access.
 *
 * A set attribute of that name wins and is returned charset-converted.
 * Otherwise the virtual properties outertext, innertext, plaintext and
 * xmltext are served; any other name yields whether the attribute key
 * exists at all.
 *
 * @param string $name property or attribute name
 * @return mixed
 */
function __get($name) {
    if (isset($this->attr[$name]))
    {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
734
/**
 * Magic write access.
 *
 * 'outertext' and 'innertext' are stored in the info array (innertext
 * replaces the raw text when the node carries one). Any other name is stored
 * as an attribute; an attribute that was not set before also gets default
 * spacing and double-quote info appended.
 *
 * @param string $name  property or attribute name
 * @param mixed  $value new value
 */
function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        if (isset($this->_[HDOM_INFO_TEXT])) {
            return $this->_[HDOM_INFO_TEXT] = $value;
        }
        return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNewAttribute = !isset($this->attr[$name]);
    if ($isNewAttribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
748
/**
 * Magic isset(): the virtual text properties always exist; anything else
 * exists when the attribute key is present (this covers no-value attributes
 * such as nowrap, checked, selected).
 *
 * @param string $name property or attribute name
 * @return bool
 */
function __isset($name) {
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
758
/**
 * Magic unset(): remove an attribute, but only when it is currently set
 * (an attribute holding null is left in place).
 *
 * @param string $name attribute name
 */
function __unset($name) {
    if (!isset($this->attr[$name])) {
        return;
    }
    unset($this->attr[$name]);
}
763
764 // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
/**
 * Convert text from the dom's source charset to its target charset when the
 * two differ, and strip a UTF-8 byte order mark from either end of the
 * result when the target is UTF-8.
 *
 * @param string $text text to convert
 * @return mixed the converted text (whatever iconv() returns when it is used)
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    $from = "";
    $to = "";
    if ($this->dom)
    {
        $from = strtoupper($this->dom->_charset);
        $to = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $from . " target charaset: " . $to);}

    $result = $text;

    $charsetsDiffer = !empty($from) && !empty($to) && (strcasecmp($from, $to) != 0);
    if ($charsetsDiffer)
    {
        // The reported encoding may be wrong: text that already validates as
        // UTF-8 is left untouched when UTF-8 is the target.
        $alreadyUtf8 = (strcasecmp($to, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8)
        {
            $result = iconv($from, $to, $text);
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($to == 'UTF-8')
    {
        $bom = "\xef\xbb\xbf";
        if (substr($result, 0, 3) == $bom)
        {
            $result = substr($result, 3);
        }
        if (substr($result, -3) == $bom)
        {
            $result = substr($result, 0, -3);
        }
    }

    return $result;
}
810
811 /**
812 * Returns true if $string is valid UTF-8 and false otherwise.
813 *
814 * @param mixed $str String to be tested
815 * @return boolean
816 */
static function is_utf8($str)
{
    // Byte-wise structural check: every lead byte must be followed by the
    // number of continuation bytes (0x80-0xBF) its value announces.
    $len=strlen($str);
    for($i=0; $i<$len; $i++)
    {
        $c=ord($str[$i]);
        // Only 0x00-0x7F are single-byte characters. The comparison used to
        // be "> 128", which let a lone 0x80 continuation byte pass as valid.
        if($c < 128) continue;

        if(($c >= 254)) return false;
        elseif($c >= 252) $bits=6;
        elseif($c >= 248) $bits=5;
        elseif($c >= 240) $bits=4;
        elseif($c >= 224) $bits=3;
        elseif($c >= 192) $bits=2;
        else return false; // continuation byte without a lead byte

        // The announced sequence must fit in the remaining input.
        if(($i+$bits) > $len) return false;
        while($bits > 1)
        {
            $i++;
            $b=ord($str[$i]);
            if($b < 128 || $b > 191) return false;
            $bits--;
        }
    }
    return true;
}
846 /*
847 function is_utf8($string)
848 {
849 //this is buggy
850 return (utf8_encode(utf8_decode($string)) == $string);
851 }
852 */
853
854 /**
855 * Function to try a few tricks to determine the displayed size of an img on the page.
856 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
857 *
858 * @author John Schlick
859 * @version April 19 2012
860 * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
861 */
function get_display_size()
{
    // Only IMG tags have a display size we know how to determine.
    if ($this->tag !== 'img')
    {
        return false;
    }

    // -1 means "could not be determined".
    $size = array('height' => -1,
                  'width' => -1);
    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $attributes = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // The value group can capture trailing blanks ("100px ;"), which
            // would defeat the "ends with px" test below.
            $attributes[$match[1]] = trim($match[2]);
        }

        // A dimension from the style is only used when the tag attribute did not provide one.
        foreach ($dimensions as $dimension)
        {
            if (!isset($attributes[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // check that the last two characters are px (pixels)
            if (strtolower(substr($attributes[$dimension], -2)) != 'px')
            {
                continue;
            }
            $proposed = substr($attributes[$dimension], 0, -2);
            // Make sure that it's an integer. Compare against false
            // explicitly so that a legitimate "0px" is not rejected.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $size;
}
941
942 // camel naming conventions
// --- camelCase aliases for the snake_case / magic-method API above ---

// Raw attribute array of the node.
function getAllAttributes()
{
    return $this->attr;
}

// Same as reading $node->$name.
function getAttribute($name)
{
    return $this->__get($name);
}

// Same as writing $node->$name = $value.
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

// Same as isset($node->$name).
function hasAttribute($name)
{
    return $this->__isset($name);
}

// Marks the attribute as removed by setting it to null.
function removeAttribute($name)
{
    $this->__set($name, null);
}

// First element matching the id selector.
function getElementById($id)
{
    return $this->find('#' . $id, 0);
}

// All (or the $idx-th) elements matching the id selector.
function getElementsById($id, $idx=null)
{
    return $this->find('#' . $id, $idx);
}

// First element matching the tag selector.
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

// All (or the $idx-th) elements matching the tag selector.
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

function parentNode()
{
    return $this->parent();
}

function childNodes($idx=-1)
{
    return $this->children($idx);
}

function firstChild()
{
    return $this->first_child();
}

function lastChild()
{
    return $this->last_child();
}

function nextSibling()
{
    return $this->next_sibling();
}

function previousSibling()
{
    return $this->prev_sibling();
}

function hasChildNodes()
{
    return $this->has_child();
}

function nodeName()
{
    return $this->tag;
}

// Re-parents $node under this node and hands it back.
function appendChild($node)
{
    $node->parent($this);
    return $node;
}
961
962}
963
964/**
965 * simple html dom parser
966 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
967 * Paperg - change $size from protected to public so we can easily access it
968 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
969 *
970 * @package PlaceLocalInclude
971 */
972class simple_html_dom
973{
// Root node of the tree; (re)created by prepare().
public $root = null;
// Node list; emptied by prepare() and walked by clear().
public $nodes = array();
// Value installed by set_callback() / reset by remove_callback().
public $callback = null;
// When true, read_tag() lowercases tag and attribute names.
public $lowercase = false;
// Used to keep track of how large the text was when we started.
public $original_size;
// Current length of $doc (public so callers can read it).
public $size;
// Offset of the scanner inside $doc.
protected $pos;
// The text being parsed, with noise replaced by placeholders.
protected $doc;
// Character at $pos, or null once the end of $doc has been reached.
protected $char;
// Counter advanced for every node created; stored as begin/end markers.
protected $cursor;
// Node that newly parsed nodes are attached to.
protected $parent;
// Placeholder key => original text removed by remove_noise().
protected $noise = array();

// Character sets handed to skip()/copy_skip()/copy_until() while scanning.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
public $_charset = '';
public $_target_charset = '';
// Text stored as the inner content of every <br> element (see read_tag()).
protected $default_br_text = "";
public $default_span_text = "";

// Lookup tables keyed by tag name: isset() is used instead of in_array() for speed (about 30%).
protected $self_closing_tags = array(
    'img'=>1,
    'br'=>1,
    'input'=>1,
    'meta'=>1,
    'link'=>1,
    'hr'=>1,
    'base'=>1,
    'embed'=>1,
    'spacer'=>1,
);
protected $block_tags = array(
    'root'=>1,
    'body'=>1,
    'form'=>1,
    'div'=>1,
    'span'=>1,
    'table'=>1,
);
// For each tag, the set of open parent tags that it implicitly closes.
// Known sourceforge issue #2977341:
// B tags that are not closed cause us to return everything to the end of the document.
protected $optional_closing_tags = array(
    'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
    'th'=>array('th'=>1),
    'td'=>array('td'=>1),
    'li'=>array('li'=>1),
    'dt'=>array('dt'=>1, 'dd'=>1),
    'dd'=>array('dd'=>1, 'dt'=>1),
    'dl'=>array('dd'=>1, 'dt'=>1),
    'p'=>array('p'=>1),
    'nobr'=>array('nobr'=>1),
    'b'=>array('b'=>1),
    'option'=>array('option'=>1),
);
1015
/**
 * Create the DOM and, when $str is given, parse it immediately.
 *
 * @param string|null $str             HTML text, or an http:// URL / existing file path to read.
 * @param bool        $lowercase       Forwarded to load() for string input.
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied
 *                                     (i.e. the markup is trusted).
 * @param string      $target_charset  Stored in $_target_charset.
 * @param bool        $stripRN         Forwarded to load() for string input.
 * @param string      $defaultBRText   Forwarded to load() for string input.
 * @param string      $defaultSpanText Forwarded to load() for string input.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This must be applied to the real lookup table ($optional_closing_tags) and
    // before any parsing starts, otherwise the flag never influences the parse.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
1035
/**
 * Release the node graph through clear() when the object is destroyed.
 */
function __destruct()
{
    $this->clear();
}
1040
/**
 * Load HTML from a string and parse it into the node tree.
 *
 * @param string $str             HTML text.
 * @param bool   $lowercase       Forwarded to prepare().
 * @param bool   $stripRN         Forwarded to prepare().
 * @param string $defaultBRText   Forwarded to prepare().
 * @param string $defaultSpanText Forwarded to prepare().
 * @return simple_html_dom $this, so calls can be chained.
 */
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Noise patterns, applied strictly in this order. The second element is the
    // $remove_tag flag of remove_noise().
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tag removal precedes style tag removal.
    $noise_patterns = array(
        // comments
        array("'<!--(.*?)-->'is", false),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_patterns as $entry) {
        $this->remove_noise($entry[0], $entry[1]);
    }

    // parsing: parse() returns false once the input is exhausted
    while ($this->parse());
    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1077
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * @return false|null false (after clearing the DOM) when the contents could not be
 *                    read; nothing on success.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Test the read itself. error_get_last() also reports unrelated, earlier
    // errors, which used to discard a perfectly good document.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1089
/**
 * Store a callback in $callback.
 *
 * @param mixed $function_name Value to store as the callback.
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1095
/**
 * Reset $callback to null.
 */
function remove_callback()
{
    $this->callback = null;
}
1101
/**
 * Serialise the DOM to a string and optionally write it to disk.
 *
 * @param string $filepath When not '', the string is also written there under LOCK_EX.
 * @return string The root node's innertext().
 */
function save($filepath='')
{
    $html = $this->root->innertext();

    if ($filepath === '') {
        return $html;
    }

    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
}
1109
/**
 * Find nodes by CSS selector; delegates to the root node's find().
 *
 * @param string   $selector  CSS selector.
 * @param int|null $idx       Forwarded to the root node.
 * @param bool     $lowercase Paperg - allows case insensitive testing of the value of the selector.
 * @return mixed Whatever the root node's find() returns.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
1116
/**
 * Clean up memory due to php5 circular references memory leak.
 *
 * Calls clear() on every known node, then on the current parent and the root,
 * and finally unsets the document text and the noise table.
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // This next block is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
1128
/**
 * Dump the tree; delegates to the root node's dump().
 *
 * @param bool $show_attr Forwarded to the root node.
 */
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1133
/**
 * Prepare HTML data and reset all parser state for a fresh parse.
 *
 * @param string $str             HTML text.
 * @param bool   $lowercase       Stored in $lowercase.
 * @param bool   $stripRN         When true, every "\r" and "\n" is replaced by a space.
 * @param string $defaultBRText   Stored in $default_br_text.
 * @param string $defaultSpanText Stored in $default_span_text.
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Save the original size of the html that we got in. It might be useful to someone.
    $this->original_size = strlen($str);

    // Before we save the string as the doc, strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace(array("\r", "\n"), " ", $str);
    }

    // Length of the content as it will actually be scanned.
    $this->size = strlen($str);

    // Scanner state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // Fresh root node; it is also the initial parent for parsed nodes.
    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $root;

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1168
/**
 * Parse one step of the HTML content: either a run of text or a tag.
 *
 * @return bool true after a text node was added; otherwise the result of read_tag().
 */
protected function parse()
{
    $text = $this->copy_until_char('<');

    // Nothing before the next '<' (or end of input): let read_tag() decide.
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}
1184
/**
 * PAPERG - dkchou - try to identify the character set of the page we have just
 * parsed so we know better how to spit it out later.
 *
 * Sources are tried in this order:
 *  1. the optional global function get_last_retrieve_url_contents_content_type()
 *     (expected to return the content-type header of the last transfer);
 *  2. the <meta http-equiv=Content-Type> element of the parsed document;
 *  3. mb_detect_encoding() over the plain text, falling back to UTF-8.
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by CP1252.
 *
 * @return string The charset, which is also stored in $_charset.
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    // 1. Content-type reported by the caller's transfer layer, if available.
    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        if (preg_match('/charset=(.+)/', $contentTypeHeader, $matches)) {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    // 2. The document's own meta content-type tag.
    if (empty($charset)) {
        $meta = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($meta)) {
            $fullvalue = $meta->content;
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue)) {
                if (preg_match('/charset=(.+)/', $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // 3. If we couldn't find a charset above, try to detect one based on the text we got.
    if (empty($charset)) {
        $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}

        // If this doesn't work either, just assume UTF-8 so that we can move on.
        if ($charset === false) {
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    $latin1_names = array('iso-8859-1', 'latin1', 'latin-1');
    if (in_array(strtolower($charset), $latin1_names, true)) {
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}

    $this->_charset = $charset;

    return $charset;
}
1257
/**
 * Read one tag starting at the current '<' and add the resulting node(s) to the tree.
 *
 * Handles, in order: end tags, doctype/cdata/comment declarations, stray text
 * that merely starts with '<', and start tags together with their attributes.
 *
 * @return bool false when the current character is not '<' (end of input, the
 *              root node is closed); true otherwise.
 */
protected function read_tag()
{
    if ($this->char!=='<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181:
        // blanks are skipped with $token_blank.
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag does not close the currently open element.
        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Walk up looking for the element this end tag belongs to.
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                // The end tag closes the grandparent: the parent is left unclosed.
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else {
                // No matching open element: keep the end tag as plain text.
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: the "tag name" itself contains another '<'
    // (the original assigned the boolean test result to an unused $pos)
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep it as text.
    // The hyphen is placed last in the class: "[\w-:]" is rejected as an invalid
    // range by PCRE2 (PHP 7.3+), which made preg_match() fail for every tag.
    if (!preg_match("/^[\w:-]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags: implicitly close every open parent that
    // this tag is listed as terminating
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char!==null && $space[0]==='') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos) {
            // No progress since the previous round: step over one character.
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                // no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else {
            break;
        }
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else {
        // reset parent: the new element becomes the open one unless its tag
        // is listed in $self_closing_tags
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // Since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1482
/**
 * Parse the value of the attribute $name and store it on $node.
 *
 * @param object $node  Node that receives the attribute and its quote type.
 * @param string $name  Attribute name.
 * @param array  $space Whitespace triple of the attribute; index 2 receives the
 *                      blanks found between '=' and the value.
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    // NOTE(review): in that case the duplicate's value is left unread in the input.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $delimiter = $this->char;
    if ($delimiter === '"' || $delimiter === '\'') {
        // Quoted value: read up to the matching, unescaped quote and step over it.
        $node->_[HDOM_INFO_QUOTE][] = ($delimiter === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->restore_noise($this->copy_until_char_escape($delimiter));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // Bare value: read up to the next character of $token_attr.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $value = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);

    // PaperG: If this is a "class" selector, get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
1519
/**
 * Link $node to the currently open parent.
 *
 * @param object $node     Node to attach.
 * @param bool   $is_child When true the node is also added to the parent's children list.
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if ($is_child) {
        $owner->children[] = $node;
    }
}
1530
/**
 * Record an end tag as a text node and advance the scanner by one character.
 *
 * @param string $tag Tag name; stored as the text "</$tag>".
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;

    return true;
}
1541
/**
 * Advance past every leading character that belongs to $chars.
 *
 * @param string $chars Characters to skip.
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    // next
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;
}
1547
/**
 * Advance past every leading character that belongs to $chars and return them.
 *
 * @param string $chars Characters to skip.
 * @return string The skipped characters, '' when there were none.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos = $start + $run;
    // next
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;

    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
1557
/**
 * Advance up to (not past) the first character that belongs to $chars.
 *
 * @param string $chars Stop characters.
 * @return string The text that was stepped over.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $run;
    // next
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null;

    return substr($this->doc, $start, $run);
}
1566
/**
 * Advance up to (not past) the next occurrence of $char.
 *
 * @param string $char Stop string searched with strpos().
 * @return string The text that was stepped over. When $char does not occur, the
 *                rest of the document is returned and the scanner moves to the end.
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    // Not found: consume everything that is left.
    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size-$from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    // Already sitting on $char: nothing to copy, state untouched.
    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;

    return substr($this->doc, $from, $hit-$from);
}
1584
/**
 * Like copy_until_char(), but occurrences of $char that directly follow a
 * backslash are ignored.
 *
 * @param string $char Stop string searched with strpos().
 * @return string The text that was stepped over. When no unescaped $char occurs,
 *                the rest of the document is returned and the scanner moves to the end.
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $search = $from;

    do {
        $hit = strpos($this->doc, $char, $search);

        // Not found: consume everything that is left.
        if ($hit === false) {
            $rest = substr($this->doc, $from, $this->size-$from);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        // Already sitting on $char: nothing to copy, state untouched.
        if ($hit === $from) {
            return '';
        }

        // Where to resume if this occurrence turns out to be escaped.
        $search = $hit+1;
    } while ($this->doc[$hit-1] === '\\');

    $this->char = $this->doc[$hit];
    $this->pos = $hit;

    return substr($this->doc, $from, $hit-$from);
}
1613
/**
 * Remove noise from the html content, saving it in the $this->noise array.
 *
 * Every match of $pattern is replaced in $doc by a "___noise___" placeholder key.
 *
 * @param string $pattern    PCRE pattern; its first capture group is the inner content.
 * @param bool   $remove_tag When true the whole match is replaced, otherwise only group 1.
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $found = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // Capture group to cut out: 0 = whole match, 1 = inner content only.
    $group = ($remove_tag) ? 0 : 1;

    // Walk the matches last-to-first so earlier offsets stay valid while replacing.
    for ($i = (int) $found; $i-- > 0; ) {
        list($fragment, $offset) = $matches[$i][$group];

        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }

        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1639
/**
 * Restore noise to html content: replace every "___noise___" placeholder in
 * $text by the original text saved in $this->noise.
 *
 * @param string $text Text that may contain placeholder keys.
 * @return string The text with placeholders substituted. Unknown or truncated keys
 *                are replaced by a diagnostic message.
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    // Where the next search starts. Always searching from 0 made the
    // "UNDEFINED NOISE" message, which embeds the marker itself, match again and
    // again: an endless loop on a growing string.
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false) {
        $head = substr($text, 0, $pos);

        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15) {
            // The key is the 11-character marker followed by 5 more characters.
            $key = '___noise___'.substr($text, $pos+11, 5);
            if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }

            $tail = substr($text, $pos+16);

            if (isset($this->noise[$key])) {
                $text = $head.$this->noise[$key].$tail;
                // Rescan from here: the restored text may itself contain placeholders.
                $offset = $pos;
            } else {
                $message = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = $head.$message.$tail;
                // Resume after the message so its embedded marker is not matched again.
                $offset = $pos + strlen($message);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = $head.'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
            $offset = $pos;
        }
    }

    return $text;
}
1672
/**
 * Sometimes we NEED one of the noise elements: look one up by content.
 *
 * @param string $text Substring to look for.
 * @return string|null The first saved noise text containing $text; null when none does.
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    foreach ($this->noise as $saved) {
        if (strpos($saved, $text) === false) {
            continue;
        }
        return $saved;
    }
}
/**
 * String conversion: the root node's innertext().
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
1691
/**
 * Virtual read-only properties: outertext, innertext, plaintext, charset and
 * target_charset.
 *
 * @param string $name Property name.
 * @return mixed The property value; null for any other name.
 */
function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
}
1708
// camel naming conventions: thin aliases that forward to the root node,
// to find(), or to the global str_get_html() helper.
function childNodes($idx=-1)
{
    return $this->root->childNodes($idx);
}

function firstChild()
{
    return $this->root->first_child();
}

function lastChild()
{
    return $this->root->last_child();
}

// Parses "<$name>$value</$name>" and returns its first child.
function createElement($name, $value=null)
{
    return @str_get_html("<$name>$value</$name>")->first_child();
}

// Parses $value and returns the last entry of the resulting node list.
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}

function getElementById($id)
{
    return $this->find("#$id", 0);
}

function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

function getElementByTagName($name)
{
    return $this->find($name, 0);
}

function getElementsByTagName($name, $idx=-1)
{
    return $this->find($name, $idx);
}
/**
 * Camel-case alias of load_file().
 *
 * The arguments are spread onto load_file(); passing the collected array as one
 * parameter handed an array to file_get_contents().
 *
 * @return false|null Whatever load_file() returns.
 */
function loadFile()
{
    $args = func_get_args();

    return call_user_func_array(array($this, 'load_file'), $args);
}
1720}
1721
1722?> \ No newline at end of file