aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authornicosomb <nicolas@loeuillet.org>2013-04-17 10:35:50 +0200
committernicosomb <nicolas@loeuillet.org>2013-04-17 10:35:50 +0200
commit0753bfefd7444712a20cee28fdb057e48602b2eb (patch)
treea76172042948b44f4f194ec0fb0672ed51380961
parenta672f9f50680453e162881ca8bfdf2e575d0891f (diff)
downloadwallabag-0753bfefd7444712a20cee28fdb057e48602b2eb.tar.gz
wallabag-0753bfefd7444712a20cee28fdb057e48602b2eb.tar.zst
wallabag-0753bfefd7444712a20cee28fdb057e48602b2eb.zip
import depuis Pocket, yeah cf #3
-rw-r--r--CREDITS1
-rwxr-xr-xREADME.md4
-rw-r--r--import.php55
-rw-r--r--inc/simple_html_dom.php1722
4 files changed, 1782 insertions, 0 deletions
diff --git a/CREDITS b/CREDITS
index 916eae5d..6df488ee 100644
--- a/CREDITS
+++ b/CREDITS
@@ -4,6 +4,7 @@ poche is based on :
4* Encoding https://github.com/neitanod/forceutf8 4* Encoding https://github.com/neitanod/forceutf8
5* logo by Brightmix http://www.iconfinder.com/icondetails/43256/128/jeans_monotone_pocket_icon 5* logo by Brightmix http://www.iconfinder.com/icondetails/43256/128/jeans_monotone_pocket_icon
6* icons http://icomoon.io 6* icons http://icomoon.io
7* PHP Simple HTML DOM Parser (for Pocket import) http://simplehtmldom.sourceforge.net/
7 8
8poche is developed by Nicolas Lœuillet under the Do What the Fuck You Want to Public License 9poche is developed by Nicolas Lœuillet under the Do What the Fuck You Want to Public License
9 10
diff --git a/README.md b/README.md
index d1e1f0ff..c678d11c 100755
--- a/README.md
+++ b/README.md
@@ -27,6 +27,10 @@ You **have** to protect your db/poche.sqlite file. Modify the virtual host of yo
27</Files> 27</Files>
28``` 28```
29 29
30## Import from Pocket
31
32If you want to import your Pocket data, [export it here](https://getpocket.com/export). Put the HTML file in your poche directory, then run the import.php script locally and follow its instructions. Be careful: the script can take a very long time.
33
30## License 34## License
31Copyright © 2010-2013 Nicolas Lœuillet <nicolas@loeuillet.org> 35Copyright © 2010-2013 Nicolas Lœuillet <nicolas@loeuillet.org>
32This work is free. You can redistribute it and/or modify it under the 36This work is free. You can redistribute it and/or modify it under the
diff --git a/import.php b/import.php
new file mode 100644
index 00000000..7a657c21
--- /dev/null
+++ b/import.php
@@ -0,0 +1,55 @@
<?php
/**
 * poche, a read it later open source system
 *
 * Pocket import: parses the Pocket HTML export (ril_export.html, located next
 * to this script) and inserts every link it contains into the entries table.
 *
 * @category   poche
 * @author     Nicolas Lœuillet <support@inthepoche.com>
 * @copyright  2013
 * @license    http://www.wtfpl.net/ see COPYING file
 */

// The import can take a very long time, so remove the execution time limit.
set_time_limit(0);

// NOTE(review): $db and prepare_url() are not defined in this script;
// presumably they are provided through inc/config.php - confirm.
include dirname(__FILE__).'/inc/config.php';
include dirname(__FILE__).'/inc/simple_html_dom.php';

if (!isset($_GET['start'])) {
    echo 'Please execute the import script locally, it can take a very long time. <br /><a href="import.php?start">Bye bye Pocket, let\'s go !</a>';
}
else {
    $export_file = dirname(__FILE__).'/ril_export.html';

    if (!is_readable($export_file)) {
        // Fail with an explicit message instead of parsing an empty document.
        echo 'The Pocket export file ril_export.html was not found in the poche directory.';
    }
    else {
        $html = new simple_html_dom();
        $html->load_file($export_file);

        $sql_action = 'INSERT INTO entries ( url, title, content, is_read ) VALUES (?, ?, ?, ?)';
        $read       = '0';
        $errors     = array();

        foreach ($html->find('ul') as $ul)
        {
            foreach ($ul->find('li') as $li)
            {
                // Skip list items that carry no usable link instead of
                // dereferencing a missing anchor.
                $a = $li->find('a');
                if (empty($a) || empty($a[0]->href)) {
                    $errors[] = 'List item without a link skipped';
                    continue;
                }

                $url            = $a[0]->href;
                $parametres_url = prepare_url($url);

                // presumably prepare_url() can return a non-array on failure;
                // the entry is still stored, with NULL title/content as before.
                $title   = isset($parametres_url['title'])   ? $parametres_url['title']   : null;
                $content = isset($parametres_url['content']) ? $parametres_url['content'] : null;
                if ($title === null && $content === null) {
                    $errors[] = 'No content could be fetched for '.$url;
                }

                try
                {
                    $query = $db->getHandle()->prepare($sql_action);
                    $query->execute(array($url, $title, $content, $read));
                }
                catch (Exception $e)
                {
                    $errors[] = $e->getMessage();
                }
            }
            // Pocket generates an HTML file with two <ul> elements:
            // the first one holds the unread items,
            // the second one holds the archived items.
            $read = '-1';
        }

        echo 'Import from Pocket completed. <a href="index.php">Welcome to #poche !</a>';

        // Report the problems that were collected instead of dropping them.
        if (count($errors) > 0) {
            echo '<br />'.count($errors).' problem(s) occurred during the import:<ul>';
            foreach ($errors as $error) {
                echo '<li>'.htmlspecialchars($error, ENT_QUOTES, 'UTF-8').'</li>';
            }
            echo '</ul>';
        }
    }
}
diff --git a/inc/simple_html_dom.php b/inc/simple_html_dom.php
new file mode 100644
index 00000000..43b94e57
--- /dev/null
+++ b/inc/simple_html_dom.php
@@ -0,0 +1,1722 @@
1<?php
2/**
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
6 * Contributions by:
7 * Yousuke Kumakura (Attribute filters)
8 * Vadim Voituk (Negative indexes supports of "find" method)
9 * Antcs (Constructor with automatically load contents either text or file/url)
10 *
11 * all affected sections have comments starting with "PaperG"
12 *
13 * Paperg - Added case insensitive testing of the value of the selector.
14 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15 * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,
16 * it will almost always be smaller by some amount.
17 * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as $dom->size is the "real" number of bytes the dom was created from,
18 * but for most purposes, it's a really good estimation.
19 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 * Allow the user to tell us how much they trust the html.
21 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 * This allows for us to find tags based on the text they contain.
23 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 * Paperg: added parse_charset so that we know about the character set of the source document.
25 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
27 *
28 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
30 *
31 * Licensed under The MIT License
32 * Redistributions of files must retain the above copyright notice.
33 *
34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick
36 * @author Rus Carroll
37 * @version 1.5 ($Rev: 202 $)
38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom
40 */
41
/**
 * Constants shared by the helper functions and classes below.
 *
 * @author S.C. Chen <me578022@gmail.com>
 */

// Node types, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quoting style recorded for each attribute (see HDOM_INFO_QUOTE).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);

// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN',    0);
define('HDOM_INFO_END',      1);
define('HDOM_INFO_QUOTE',    2);
define('HDOM_INFO_SPACE',    3);
define('HDOM_INFO_TEXT',     4);
define('HDOM_INFO_INNER',    5);
define('HDOM_INFO_OUTER',    6);
define('HDOM_INFO_ENDSPACE', 7);

// Default values for the optional arguments of the helper functions.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");

// Documents longer than this many bytes are rejected by the helper functions.
define('MAX_FILE_SIZE', 600000);
67// helper functions
68// -----------------------------------------------------------------------------
/**
 * Build a DOM object from the contents of a file or URL.
 *
 * @param string   $url              path or URL handed to file_get_contents()
 * @param bool     $use_include_path forwarded to file_get_contents()
 * @param resource $context          stream context forwarded to file_get_contents()
 * @param int      $offset           read offset; any negative value means "from the start"
 * @param int      $maxLen           maximum number of bytes to read; negative means "no limit"
 * @param bool     $lowercase        forwarded to the DOM constructor and to load()
 * @param bool     $forceTagsClosed  forwarded to the DOM constructor
 * @param string   $target_charset   forwarded to the DOM constructor
 * @param bool     $stripRN          forwarded to the DOM constructor and to load()
 * @param string   $defaultBRText    forwarded to the DOM constructor
 * @param string   $defaultSpanText  forwarded to the DOM constructor
 * @return simple_html_dom|false     the loaded DOM, or false when nothing could be read
 *                                   or the content is longer than MAX_FILE_SIZE
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // The -1 default was meant as "no offset". Since PHP 7.1 a negative offset
    // seeks from the end of the stream, which fails on non-seekable (remote)
    // streams, so normalise it to the start of the stream.
    if ($offset < 0)
    {
        $offset = 0;
    }

    // Only forward $maxLen when the caller asked for a limit: a negative
    // length is rejected by file_get_contents().
    if ($maxLen >= 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Release the unused DOM, as str_get_html() does.
        $dom->clear();
        return false;
    }
    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87
/**
 * Build a DOM object from an HTML string.
 *
 * @param string $str the markup to parse
 * @return simple_html_dom|false the loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if (!$usable)
    {
        // Nothing to parse: release the DOM before giving up.
        $dom->clear();
        return false;
    }

    $dom->load($str, $lowercase, $stripRN);
    return $dom;
}
100
/**
 * Print the tree below $node by delegating to its dump() method.
 *
 * @param object $node      node exposing dump($show_attr, $deep)
 * @param bool   $show_attr whether attributes are printed
 * @param int    $deep      starting indentation level
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; the node itself must not be passed as $show_attr.
    $node->dump($show_attr, $deep);
}
106
107
108/**
109 * simple html dom node
110 * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 * PaperG - added $tag_start to track the start position of the tag in the total byte index
112 *
113 * @package PlaceLocalInclude
114 */
115class simple_html_dom_node
116{
117 public $nodetype = HDOM_TYPE_TEXT;
118 public $tag = 'text';
119 public $attr = array();
120 public $children = array();
121 public $nodes = array();
122 public $parent = null;
123 // The "info" array - see HDOM_INFO_... for what each element contains.
124 public $_ = array();
125 public $tag_start = 0;
126 private $dom = null;
127
/**
 * Create a node owned by $dom and append it to that DOM's node list.
 *
 * @param object $dom owning DOM object exposing a $nodes array
 */
function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
133
/**
 * Drop the node's references when it is destroyed.
 */
function __destruct()
{
    $this->clear();
}
138
/**
 * String conversion yields the node's outer text.
 *
 * @return string
 */
function __toString()
{
    return $this->outertext();
}
143
/**
 * Null out every reference to other nodes and to the owning DOM
 * (works around the php5 circular references memory leak).
 */
function clear()
{
    $this->children = null;
    $this->nodes = null;
    $this->parent = null;
    $this->dom = null;
}
152
/**
 * Echo this node's tag (and optionally its attributes), then recurse into
 * its child nodes one indentation level deeper.
 *
 * @param bool $show_attr whether attributes are printed
 * @param int  $deep      indentation level of this node
 */
function dump($show_attr=true, $deep=0)
{
    echo str_repeat(' ', $deep).$this->tag;

    if ($show_attr && count($this->attr)>0)
    {
        echo '(';
        foreach ($this->attr as $name=>$unused)
        {
            echo "[$name]=>\"".$this->$name.'", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes)
    {
        return;
    }
    foreach ($this->nodes as $child)
    {
        $child->dump($show_attr, $deep+1);
    }
}
176
177
/**
 * Debugging helper: describe this single node (tag, attributes, info array,
 * text, inner info, child/node counts and tag_start) on one line.
 *
 * @param bool $echo when true the line is echoed and nothing is returned
 * @return string|null the description when $echo is false
 */
function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0)
    {
        $string .= '(';
        foreach ($this->attr as $k=>$v)
        {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0)
    {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v)
        {
            if (is_array($v))
            {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2)
                {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text))
    {
        $string .= " text: (" . $this->text . ")";
    }

    // Read the inner info from this node; the previous code looked at an
    // undefined $node variable and therefore always reported NULL.
    $string .= " HDOM_INNER_INFO: '";
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    }
    else
    {
        $string .= ' NULL ';
    }

    // clear() sets both lists to null, which count() does not accept.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo)
    {
        echo $string;
        return;
    }
    return $string;
}
242
/**
 * Return the parent node; when $parent is given, first make it the new
 * parent and append this node to its nodes and children lists.
 *
 * Known limitation (from the original author): the node is not removed from
 * the lists of its previous parent.
 *
 * @param simple_html_dom_node|null $parent new parent, or null to only read
 * @return simple_html_dom_node|null
 */
function parent($parent=null)
{
    if ($parent === null)
    {
        return $this->parent;
    }

    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
258
/**
 * Tell whether the node has at least one child.
 *
 * @return bool
 */
function has_child()
{
    $childless = empty($this->children);
    return !$childless;
}
264
/**
 * Return all children, or the child at position $idx.
 *
 * @param int $idx child position; -1 returns the whole list
 * @return array|simple_html_dom_node|null null when $idx does not exist
 */
function children($idx=-1)
{
    if ($idx===-1)
    {
        return $this->children;
    }
    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
275
/**
 * Return the first child, or null when there is none.
 *
 * @return simple_html_dom_node|null
 */
function first_child()
{
    return (count($this->children)>0) ? $this->children[0] : null;
}
285
/**
 * Return the last child, or null when there is none.
 *
 * @return simple_html_dom_node|null
 */
function last_child()
{
    $total = count($this->children);
    return ($total>0) ? $this->children[$total-1] : null;
}
295
/**
 * Return the child that follows this node in its parent's children list.
 *
 * @return simple_html_dom_node|null null without parent or following sibling
 */
function next_sibling()
{
    if ($this->parent===null)
    {
        return null;
    }

    $total = count($this->parent->children);

    // Locate this node among its parent's children.
    for ($pos=0; $pos<$total; ++$pos)
    {
        if ($this===$this->parent->children[$pos])
        {
            break;
        }
    }

    $following = $pos + 1;
    return ($following<$total) ? $this->parent->children[$following] : null;
}
316
/**
 * Return the child that precedes this node in its parent's children list.
 *
 * @return simple_html_dom_node|null null when there is no parent, when this
 *         node is the first child, or when it is not in the children list
 */
function prev_sibling()
{
    if ($this->parent===null) return null;

    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
    {
        ++$idx;
    }

    // Not found (e.g. a node that is only in the parent's nodes list):
    // without this guard the parent's last child would be returned.
    if ($idx===$count) return null;

    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}
328
/**
 * Walk from this node up the parent chain and return the first node whose
 * tag equals $tag. The node itself takes part in the comparison.
 *
 * @param string $tag tag name to look for
 * @return simple_html_dom_node|null null when the chain ends without a match
 */
function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
    {
        if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $candidate->tag); }

        if ($candidate->tag == $tag)
        {
            break;
        }
    }
    return $candidate;
}
350
/**
 * Return the node's inner html: the stored inner info if any, else the
 * stored text with noise restored, else the outer text of all sub nodes.
 *
 * @return string
 */
function innertext()
{
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $inner = '';
    foreach ($this->nodes as $child)
    {
        $inner .= $child->outertext();
    }
    return $inner;
}
362
/**
 * Return the node's outer text (begin tag, content and end tag).
 *
 * Stored outer info or stored text takes precedence over rendering; the
 * DOM callback, when set, is invoked with this node before anything is
 * returned for non-root nodes.
 *
 * @return string
 */
function outertext()
{
    global $debug_object;
    if (is_object($debug_object))
    {
        $text = '';
        if ($this->tag == 'text' && !empty($this->text))
        {
            $text = " with text: " . $this->text;
        }
        $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
    }

    // The root has no tag of its own.
    if ($this->tag==='root') return $this->innertext();

    // trigger callback
    if ($this->dom && $this->dom->callback!==null)
    {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
    if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

    // render begin tag
    $hasBegin = $this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]];
    $rendered = $hasBegin ? $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup() : "";

    // render inner text
    if (isset($this->_[HDOM_INFO_INNER]))
    {
        // A br tag never outputs the inner info that may have been added to it.
        if ($this->tag != "br")
        {
            $rendered .= $this->_[HDOM_INFO_INNER];
        }
    }
    elseif ($this->nodes)
    {
        foreach ($this->nodes as $child)
        {
            $rendered .= $this->convert_text($child->outertext());
        }
    }

    // render end tag
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
    {
        $rendered .= '</'.$this->tag.'>';
    }
    return $rendered;
}
422
/**
 * Return the node's plain text.
 *
 * Stored inner info wins; text nodes return their text with noise restored;
 * comment, unknown, script and style nodes yield an empty string; any other
 * node concatenates the plain text of its sub nodes.
 *
 * @return string
 */
function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT)
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
    {
        return '';
    }
    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
    {
        return '';
    }

    // The original author observed $this->nodes being NULL for some element
    // nodes without a clear() having happened; the cause is unknown.
    if (is_null($this->nodes))
    {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child)
    {
        $plain .= $this->convert_text($child->text());
    }

    // Append the span separator so consecutive spans do not run together.
    if ($this->tag == "span")
    {
        $plain .= $this->dom->default_span_text;
    }
    return $plain;
}
457
/**
 * Return the inner text with the CDATA opening (case-insensitive) and
 * closing markers removed.
 *
 * @return string
 */
function xmltext()
{
    $withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $withoutOpening);
}
465
/**
 * Build the node's begin tag from its tag name and attributes, using the
 * spacing and quoting recorded in the info array.
 *
 * @return string the stored text (noise restored) for text-like nodes,
 *                otherwise the begin tag
 */
function makeup()
{
    // text, comment, unknown
    if (isset($this->_[HDOM_INFO_TEXT]))
    {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<'.$this->tag;
    $next = 0;

    foreach ($this->attr as $name=>$value)
    {
        // Position of the attribute, counted even when it is skipped below.
        $i = $next++;

        // skip removed attribute
        if ($value===null || $value===false)
        {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$i][0];

        // no value attr: nowrap, checked selected...
        if ($value===true)
        {
            $markup .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$i];
        if ($style == HDOM_QUOTE_DOUBLE)
        {
            $quote = '"';
        }
        elseif ($style == HDOM_QUOTE_SINGLE)
        {
            $quote = '\'';
        }
        else
        {
            $quote = '';
        }
        $markup .= $name.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$value.$quote;
    }

    $markup = $this->dom->restore_noise($markup);
    return $markup . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
500
/**
 * Find elements below this node by css selector.
 *
 * @param string   $selector  selector string; comma separated alternatives allowed
 * @param int|null $idx       null returns every match, otherwise the match at
 *                            that position (negative values count from the end)
 * @param bool     $lowercase passed on to seek() for case insensitive value tests
 * @return array|simple_html_dom_node|null
 */
function find($selector, $idx=null, $lowercase=false)
{
    $alternatives = $this->parse_selector($selector);
    $total = count($alternatives);
    if ($total===0) return array();

    $matched_keys = array();

    // find each selector
    for ($c=0; $c<$total; ++$c)
    {
        // Per sourceforge tracker id 2788009 the count is taken from the
        // current alternative, not from the first one.
        $depth = count($alternatives[$c]);
        if ($depth===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

        $frontier = array($this->_[HDOM_INFO_BEGIN]=>1);

        // handle descendant selectors, no recursive!
        for ($level=0; $level<$depth; ++$level)
        {
            $hits = array();
            foreach ($frontier as $key=>$unused)
            {
                $start = ($key===-1) ? $this->dom->root : $this->dom->nodes[$key];
                $start->seek($alternatives[$c][$level], $hits, $lowercase);
            }
            $frontier = $hits;
        }

        foreach ($frontier as $key=>$unused)
        {
            if (!isset($matched_keys[$key]))
            {
                $matched_keys[$key] = 1;
            }
        }
    }

    // sort keys
    ksort($matched_keys);

    $found = array();
    foreach ($matched_keys as $key=>$unused)
    {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (is_null($idx))
    {
        return $found;
    }
    if ($idx<0)
    {
        $idx = count($found) + $idx;
    }
    return isset($found[$idx]) ? $found[$idx] : null;
}
551
/**
 * Collect the nodes below this one that satisfy a single parsed selector.
 *
 * @param array $selector  array($tag, $key, $val, $exp, $no_key) as produced
 *                         by parse_selector()
 * @param array $ret       result set, passed by reference; the index of every
 *                         matching node in the DOM node list is set to 1
 * @param bool  $lowercase when true, values are compared case insensitively
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index
    if ($tag && $key && is_numeric($key))
    {
        $count = 0;
        foreach ($this->children as $c)
        {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Determine where the scan ends. When this node has no end index, walk
    // up to the nearest ancestor that has one.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent===null) {
            // No ancestor carries an end index, so there is nothing to scan.
            // The previous code read the index from the null parent here.
            if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
            return;
        }
        $end += $parent->_[HDOM_INFO_END];
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // Bare "*" selects direct children only.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
            }
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            if ($key == "plaintext") {
                // A "plaintext" search tests the plain text of the node.
                $nodeKeyValue = $node->text();
            } else {
                // A normal search tests the value of that attribute.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

            // If lowercase is set, do a case insensitive test of the value.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}

            // handle multiple class
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ',$node->attr[$key]) as $k) {
                    // Leading, trailing or double spaces produce empty entries; skip them.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) break;
                    }
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
    // $ret is passed by reference, so it is what this function "returns".
    if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
}
650
/**
 * Test $value against $pattern with the selector operator $exp.
 *
 * '=' and '!=' compare strictly; '^=' and '$=' test a literal prefix/suffix;
 * '*=' treats $pattern as a regular expression (used verbatim when it starts
 * with '/', otherwise wrapped in /.../i).
 *
 * @param string $exp     one of '=', '!=', '^=', '$=', '*='
 * @param string $pattern expected value
 * @param string $value   actual value
 * @return bool|int the comparison result, or preg_match()'s result;
 *                  false for an unknown operator
 */
protected function match($exp, $pattern, $value) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    switch ($exp) {
        case '=':
            $outcome = ($value===$pattern);
            break;
        case '!=':
            $outcome = ($value!==$pattern);
            break;
        case '^=':
            $outcome = preg_match("/^".preg_quote($pattern,'/')."/", $value);
            break;
        case '$=':
            $outcome = preg_match("/".preg_quote($pattern,'/')."$/", $value);
            break;
        case '*=':
            $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
            $outcome = preg_match($regex, $value);
            break;
        default:
            $outcome = false;
    }
    return $outcome;
}
672
/**
 * Parse a css selector string into a list of alternatives (separated by
 * commas), each being a list of array($tag, $key, $val, $exp, $no_key).
 *
 * @param string $selector_string
 * @return array
 */
protected function parse_selector($selector_string) {
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    // Pattern of CSS selectors, modified from mootools.
    // The colon is accepted in tag and attribute names so that
    // <tag attr:ibute="something"> is found; such attributes must be read
    // with getAttribute() since $node->x:y is not valid PHP.
    // An attribute specifier may start with an @ sign that is not captured;
    // the original author left open whether that should be documented or removed.
    // Every hyphen inside a character class is escaped: "[\w-:]" is rejected
    // as an invalid range by the PCRE2 library used since PHP 7.3.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}

    $selectors = array();
    $result = array();

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
        // for browser generated xpath
        if ($m[1]==='tbody') continue;

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {$key='id'; $val=$m[2];}
        if (!empty($m[3])) {$key='class'; $val=$m[3];}
        if (!empty($m[4])) {$key=$m[4];}
        if (!empty($m[5])) {$exp=$m[5];}
        if (!empty($m[6])) {$val=$m[6];}

        // convert to lowercase; the cast keeps the previous '' result for a
        // missing key without passing null to strtolower()
        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current alternative.
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = array();
        }
    }
    if (count($result)>0)
        $selectors[] = $result;
    return $selectors;
}
720
/**
 * Magic read access: an attribute that is set is returned charset-converted;
 * outertext, innertext, plaintext and xmltext map to the matching methods;
 * any other name reports whether such an attribute key exists.
 *
 * @param string $name
 * @return mixed
 */
function __get($name) {
    if (isset($this->attr[$name]))
    {
        $raw = $this->attr[$name];
        return $this->convert_text($raw);
    }

    switch ($name) {
        case 'outertext':
            return $this->outertext();
        case 'innertext':
            return $this->innertext();
        case 'plaintext':
            return $this->text();
        case 'xmltext':
            return $this->xmltext();
    }
    return array_key_exists($name, $this->attr);
}
734
/**
 * Magic write access: outertext and innertext overwrite the stored
 * outer/inner (or text) info; any other name sets an attribute. A new
 * attribute also gets default spacing and double quotes recorded.
 *
 * @param string $name
 * @param mixed  $value
 */
function __set($name, $value) {
    switch ($name) {
        case 'outertext':
            return $this->_[HDOM_INFO_OUTER] = $value;
        case 'innertext':
            if (isset($this->_[HDOM_INFO_TEXT]))
            {
                return $this->_[HDOM_INFO_TEXT] = $value;
            }
            return $this->_[HDOM_INFO_INNER] = $value;
    }

    $isNew = !isset($this->attr[$name]);
    if ($isNew) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
748
/**
 * Magic isset: outertext, innertext and plaintext always exist; any other
 * name exists when it is a key of the attribute list (this includes
 * no-value attributes such as nowrap, checked, selected).
 *
 * @param string $name
 * @return bool
 */
function __isset($name) {
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            return true;
    }
    if (array_key_exists($name, $this->attr))
    {
        return true;
    }
    return isset($this->attr[$name]);
}
758
/**
 * Magic unset: remove the attribute when it is set.
 *
 * @param string $name
 */
function __unset($name) {
    if (!isset($this->attr[$name]))
    {
        return;
    }
    unset($this->attr[$name]);
}
763
/**
 * Convert $text from the DOM's source charset to its target charset when
 * the two differ, and strip a UTF-8 BOM from either end of UTF-8 output.
 *
 * @param string $text
 * @return string|false the converted text (iconv()'s result when converted)
 */
function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

    $sourceCharset = "";
    $targetCharset = "";
    if ($this->dom)
    {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

    $converted_text = $text;

    $charsetsDiffer = !empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0);
    if ($charsetsDiffer)
    {
        // The reported encoding may be wrong and the text already UTF-8;
        // in that case it is left untouched.
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8)
        {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Make sure no BOM survives at either end of utf-8 output.
    if ($targetCharset == 'UTF-8')
    {
        $bom = "\xef\xbb\xbf";
        if (substr($converted_text, 0, 3) == $bom)
        {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == $bom)
        {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
810
/**
 * Returns true if $str looks like UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte followed by the number of
 * continuation bytes (0x80-0xBF) it announces. The check is structural
 * only: it follows the original 1 to 6 byte scheme and does not reject
 * overlong encodings.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
static function is_utf8($str)
{
    // Non-string input is tested in its string form.
    $str = (string) $str;
    $len = strlen($str);

    for ($i=0; $i<$len; $i++)
    {
        $c = ord($str[$i]);

        // 0x80 itself is a continuation byte, not ASCII, so the comparison
        // must include 128 (it used to be "> 128").
        if ($c >= 128)
        {
            if ($c >= 254) return false;
            elseif ($c >= 252) $bits = 6;
            elseif ($c >= 248) $bits = 5;
            elseif ($c >= 240) $bits = 4;
            elseif ($c >= 224) $bits = 3;
            elseif ($c >= 192) $bits = 2;
            else return false; // continuation byte without a lead byte

            // The announced sequence must fit in the remaining bytes.
            if (($i+$bits) > $len) return false;

            while ($bits > 1)
            {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) return false;
                $bits--;
            }
        }
    }
    return true;
}
846 /*
847 function is_utf8($string)
848 {
849 //this is buggy
850 return (utf8_encode(utf8_decode($string)) == $string);
851 }
852 */
853
/**
 * Try to determine the displayed size of an img on the page.
 * NOTE: This ONLY works on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes of the tag win; otherwise an inline style
 * declaration expressed in whole pixels ("width: 120px") is used.
 *
 * Possible future enhancements noted by the original author: look at
 * classes/ids on the tag, at img rules of parent classes/ids, and at
 * separate css files.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false array with 'height' and 'width' of the image on the page,
 *                     each -1 when it cannot be determined; false for non-img tags
 */
function get_display_size()
{
    if ($this->tag !== 'img')
    {
        return false;
    }

    $size = array('height' => -1, 'width' => -1);
    $dimensions = array('width', 'height');

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension)
    {
        if (isset($this->attr[$dimension]))
        {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style']))
    {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = array();
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // Property names are case-insensitive, and the captured value can
            // carry trailing whitespace ("width: 10px ;").
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension)
        {
            // Only fill in what the tag attributes did not provide.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1)
            {
                continue;
            }
            // check that the last two characters are px (pixels)
            if (strtolower(substr($declarations[$dimension], -2)) != 'px')
            {
                continue;
            }
            $proposed = trim(substr($declarations[$dimension], 0, -2));
            // FILTER_VALIDATE_INT returns the integer itself, so "0" must be
            // told apart from a failed validation with a strict comparison.
            if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
            {
                $size[$dimension] = $proposed;
            }
        }
    }

    return array('height' => $size['height'],
                 'width'  => $size['width']);
}
941
// camel naming conventions
// DOM-style camelCase aliases; each one forwards to a member of this node.

// Returns the node's attribute array.
public function getAllAttributes()
{
    return $this->attr;
}

// Reads one attribute through __get().
public function getAttribute($name)
{
    return $this->__get($name);
}

// Writes one attribute through __set().
public function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

// Tests for an attribute through __isset().
public function hasAttribute($name)
{
    return $this->__isset($name);
}

// Passes null to __set() for the named attribute.
public function removeAttribute($name)
{
    $this->__set($name, null);
}

// First match of the "#id" selector.
public function getElementById($id)
{
    return $this->find('#' . $id, 0);
}

// Matches of the "#id" selector; $idx is forwarded to find().
public function getElementsById($id, $idx=null)
{
    return $this->find('#' . $id, $idx);
}

// First match of a tag-name selector.
public function getElementByTagName($name)
{
    return $this->find($name, 0);
}

// Matches of a tag-name selector; $idx is forwarded to find().
public function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

// Alias of parent().
public function parentNode()
{
    return $this->parent();
}

// Alias of children().
public function childNodes($idx=-1)
{
    return $this->children($idx);
}

// Alias of first_child().
public function firstChild()
{
    return $this->first_child();
}

// Alias of last_child().
public function lastChild()
{
    return $this->last_child();
}

// Alias of next_sibling().
public function nextSibling()
{
    return $this->next_sibling();
}

// Alias of prev_sibling().
public function previousSibling()
{
    return $this->prev_sibling();
}

// Alias of has_child().
public function hasChildNodes()
{
    return $this->has_child();
}

// Returns the tag name stored on this node.
public function nodeName()
{
    return $this->tag;
}

// Calls parent() on $node with this node as argument and returns $node.
// NOTE(review): whether parent() actually re-parents $node depends on its
// definition, which is not visible here - confirm.
public function appendChild($node)
{
    $node->parent($this);
    return $node;
}
961
962}
963
964/**
965 * simple html dom parser
966 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
967 * Paperg - change $size from protected to public so we can easily access it
968 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
969 *
970 * @package PlaceLocalInclude
971 */
972class simple_html_dom
973{
// Root node of the parsed document; created in prepare().
public $root = null;
// Nodes belonging to this document; walked by clear().
public $nodes = array();
// Value stored by set_callback() / reset by remove_callback().
public $callback = null;
// When true, read_tag() lowercases tag and attribute names.
public $lowercase = false;
// Used to keep track of how large the text was when we started.
public $original_size;
// Length of the document text currently being parsed.
public $size;

// --- parser state ---
// Index of the current character in $doc.
protected $pos;
// The document text being parsed.
protected $doc;
// Character at $pos, or null once the end of $doc is reached.
protected $char;
// Running counter that is bumped as nodes are created.
protected $cursor;
// Node that newly parsed nodes get attached to (see link_nodes()).
protected $parent;
// Text removed by remove_noise(), keyed by its placeholder.
protected $noise = array();

// --- character sets handed to strspn()/strcspn() by the scanning helpers ---
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

// Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
public $_charset = '';
public $_target_charset = '';
protected $default_br_text = "";
public $default_span_text = "";

// use isset instead of in_array, performance boost about 30%...
// Tags that never become the parent of following nodes.
protected $self_closing_tags = array(
    'img'=>1,
    'br'=>1,
    'input'=>1,
    'meta'=>1,
    'link'=>1,
    'hr'=>1,
    'base'=>1,
    'embed'=>1,
    'spacer'=>1
);
// End tags that read_tag() will walk up the open ancestors to match.
protected $block_tags = array(
    'root'=>1,
    'body'=>1,
    'form'=>1,
    'div'=>1,
    'span'=>1,
    'table'=>1
);
// Known sourceforge issue #2977341
// B tags that are not closed cause us to return everything to the end of the document.
// Opening a key tag implicitly closes an open parent whose tag is listed in its value.
protected $optional_closing_tags = array(
    'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
    'th'=>array('th'=>1),
    'td'=>array('td'=>1),
    'li'=>array('li'=>1),
    'dt'=>array('dt'=>1, 'dd'=>1),
    'dd'=>array('dd'=>1, 'dt'=>1),
    'dl'=>array('dd'=>1, 'dt'=>1),
    'p'=>array('p'=>1),
    'nobr'=>array('nobr'=>1),
    'b'=>array('b'=>1),
    'option'=>array('option'=>1),
);
1015
/**
 * Create a parser and, when $str is given, load a document immediately.
 *
 * @param string|null $str             HTML markup, or an http(s) URL / path of an existing file (handed to load_file()). Falsy values load nothing.
 * @param bool        $lowercase       Passed to load() when $str is markup.
 * @param bool        $forceTagsClosed When false the optional-closing-tag table is emptied, i.e. the markup is trusted.
 * @param string      $target_charset  Stored in $_target_charset.
 * @param bool        $stripRN         Passed to load() when $str is markup.
 * @param string      $defaultBRText   Passed to load() when $str is markup.
 * @param string      $defaultSpanText Passed to load() when $str is markup.
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen before anything is parsed, and it has to reset the
    // table read_tag() actually consults ($optional_closing_tags).
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str)
    {
        if (preg_match("/^https?:\/\//i", $str) || is_file($str))
        {
            $this->load_file($str);
        }
        else
        {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1035
// Release the node tree when the parser object goes away (see clear()).
public function __destruct()
{
    $this->clear();
}
1040
// load html from string
// Resets the parser, swaps "noise" regions for placeholders, parses the text
// into nodes, detects the charset and returns $this so calls can be chained.
public function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Each entry is array(pattern, remove_tag). They are applied in this order.
    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // Script tags removal now preceeds style tag removal.
    $noise_patterns = array(
        // comments
        array("'<!--(.*?)-->'is", false),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_patterns as $entry)
    {
        $this->remove_noise($entry[0], $entry[1]);
    }

    // parsing: keep going until parse() reports there is nothing left
    do
    {
        $more = $this->parse();
    } while ($more);

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1077
// load html from file
// All arguments are forwarded to file_get_contents(). Returns false (after
// clearing the dom) when the contents cannot be read; returns nothing on success.
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Test the read itself. error_get_last() would also report a stale error
    // raised earlier in the request and wipe a perfectly good dom.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1089
// set callback function
// Stores $function_name in $this->callback.
public function set_callback($function_name)
{
    $this->callback = $function_name;
}
1095
// remove callback function
// Resets $this->callback to null.
public function remove_callback()
{
    $this->callback = null;
}
1101
// save dom as string
// Returns the root's inner text; when $filepath is not '' the same text is
// also written to that file under an exclusive lock.
public function save($filepath='')
{
    $html = $this->root->innertext();

    if ($filepath !== '')
    {
        file_put_contents($filepath, $html, LOCK_EX);
    }

    return $html;
}
1109
// find dom node by css selector
// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
// Forwards all three arguments to the root node's find().
public function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1116
// clean up memory due to php5 circular references memory leak...
// Clears every known node, then drops the parent, root, doc and noise members.
public function clear()
{
    foreach ($this->nodes as $node)
    {
        $node->clear();
    }

    // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children))
    {
        foreach ($this->children as $child)
        {
            $child->clear();
        }
    }

    if (isset($this->parent))
    {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root))
    {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc, $this->noise);
}
1128
// Forwards to the root node's dump(), passing $show_attr through.
public function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1133
// prepare HTML data and init everything
// Clears any previous document, stores the new text and resets the scanner
// and the root node so parse() can start at the first character.
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Save the original size of the html that we got in. It might be useful to someone.
    $this->original_size = strlen($str);

    //before we save the string as the doc... strip out the \r \n's if we are told to.
    if ($stripRN)
    {
        $str = str_replace(array("\r", "\n"), " ", $str);
    }

    // the length of the content we are actually going to parse
    $this->size = strlen($str);

    // scanner state
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // options
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // fresh root node; it is also the first parent new nodes are linked to
    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->root = $root;
    $this->parent = $root;

    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1168
// parse html content
// One parsing step: either the text up to the next '<' becomes a text node
// (returns true), or - when there is no such text - read_tag() takes over and
// its result is returned.
protected function parse()
{
    $text = $this->copy_until_char('<');

    if ($text !== '')
    {
        // text
        $text_node = new simple_html_dom_node($this);
        ++$this->cursor;
        $text_node->_[HDOM_INFO_TEXT] = $text;
        $this->link_nodes($text_node, false);
        return true;
    }

    return $this->read_tag();
}
1184
// PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
// NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
//
// Sources are tried in this order: the content-type header routine above, a
// <meta http-equiv=Content-Type> tag, mb_detect_encoding() (when the mbstring
// extension is available), and finally a plain UTF-8 assumption.
// The result is stored in $this->_charset and returned.
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // "charset" is matched case-insensitively so that "Charset=..." is honoured too.
        $success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
        if ($success)
        {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset))
    {
        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($el))
        {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue))
            {
                $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
                if ($success)
                {
                    $charset = $matches[1];
                }
                else
                {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset))
    {
        $charset = false;

        // mbstring is an optional extension; calling it unguarded would be a fatal error where it is missing.
        if (function_exists('mb_detect_encoding'))
        {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}
        }

        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true))
    {
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1257
// read tag info
// Parses whatever starts at the current '<': an end tag, a doctype / comment,
// stray text, or a begin tag with its attributes, and links the resulting node
// into the tree. Returns false (after recording the root's end cursor) when the
// current character is not '<', which stops the parse loop in load();
// returns true otherwise.
protected function read_tag()
{
    if ($this->char!=='<')
    {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/')
    {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        // The end tag does not close the current parent: try to recover.
        if ($parent_lower!==$tag_lower)
        {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // walk up the open ancestors looking for the tag being closed
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore origonal parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower)
                {
                    $this->parent = $org_parent; // restore origonal parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        // close the matched element and step back up to its parent
        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text
    // (the original assigned the boolean result to an unused $pos here)
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep it as text.
    // The hyphen is escaped because "\w-:" is rejected as an invalid range by
    // PCRE2 (PHP 7.3+), which made preg_match() fail for every tag.
    if (!preg_match("/^[\w\-:]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower]) )
    {
        // implicitly close every open parent this tag is allowed to terminate
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
        {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do
    {
        if ($this->char!==null && $space[0]==='')
        {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos)
        {
            // no progress since the last pass: force the scanner forward
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/')
    {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else
    {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br")
    {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1482
// parse attributes
// Reads the value that follows "name=" at the current position and stores it
// in $node->attr[$name], recording the quote style in $node->_[HDOM_INFO_QUOTE]
// and the whitespace before the value in $space[2].
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay atetntion to the first one as opposed to the last one.
    if (isset($node->attr[$name]))
    {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $quote = $this->char;
    if ($quote === '"' || $quote === '\'')
    {
        // quoted value: everything up to the matching, unescaped quote
        $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->restore_noise($this->copy_until_char_escape($quote));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }
    else
    {
        // bare value: runs until the next blank or '>'
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $value = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);

    // PaperG: If this is a "class" selector, lets get rid of the preceeding and trailing space since some people leave it in the multi class case.
    if ($name == "class")
    {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
1519
// link node's parent
// Attaches $node to the current parent: always to its nodes list, and to its
// children list as well when $is_child is true.
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if ($is_child)
    {
        $owner->children[] = $node;
    }
}
1530
// as a text node
// Keeps an end tag that could not be matched as literal "</tag>" text,
// advances past the current character and always returns true.
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

    return true;
}
1541
// Advance past the run of characters from $chars that starts at the current
// position and refresh the current character (null at end of document).
protected function skip($chars)
{
    $run_length = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run_length;

    if ($this->pos < $this->size)
    {
        $this->char = $this->doc[$this->pos];
    }
    else
    {
        $this->char = null;
    }
}
1547
// Like skip(), but also returns the characters that were skipped
// ('' when there were none).
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run_length = strspn($this->doc, $chars, $start);

    $this->pos = $start + $run_length;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if ($run_length === 0)
    {
        return '';
    }

    return substr($this->doc, $start, $run_length);
}
1557
// Copy everything from the current position up to (not including) the first
// character contained in $chars, and leave the scanner on that character.
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $length;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $length);
}
1566
// Copy everything from the current position up to (not including) the next
// occurrence of $char and leave the scanner on it. When $char does not occur
// again, the rest of the document is returned and the scanner moves to the end.
// Returns '' at end of document or when already positioned on $char.
protected function copy_until_char($char)
{
    if ($this->char === null)
    {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if ($found === false)
    {
        // no further match: hand back the remainder and finish the document
        $remainder = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;
        return $remainder;
    }

    if ($found === $start)
    {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;
    return substr($this->doc, $start, $found - $start);
}
1584
// Same as copy_until_char(), except that an occurrence of $char directly
// preceded by a backslash does not end the copy.
protected function copy_until_char_escape($char)
{
    if ($this->char === null)
    {
        return '';
    }

    $start = $this->pos;       // where the copied text begins
    $search_from = $start;     // where the next strpos() begins

    for (;;)
    {
        $found = strpos($this->doc, $char, $search_from);

        if ($found === false)
        {
            // no further match: hand back the remainder and finish the document
            $remainder = substr($this->doc, $start, $this->size - $start);
            $this->char = null;
            $this->pos = $this->size;
            return $remainder;
        }

        if ($found === $start)
        {
            return '';
        }

        if ($this->doc[$found - 1] === '\\')
        {
            // escaped occurrence: keep looking behind it
            $search_from = $found + 1;
            continue;
        }

        $this->char = $this->doc[$found];
        $this->pos = $found;
        return substr($this->doc, $start, $found - $start);
    }
}
1613
// remove noise from html content
// save the noise in the $this->noise array.
// Every match of $pattern is replaced in the document by a placeholder key.
// The text that is swapped out is capture group 1, or the whole match when
// $remove_tag is true.
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // which capture is replaced: 0 = whole match, 1 = first group
    $group = ($remove_tag) ? 0 : 1;

    // Work from the last match backwards so earlier offsets stay valid.
    for ($i = $count - 1; $i >= 0; $i--)
    {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }

        list($noise_text, $noise_offset) = $matches[$i][$group];
        $this->noise[$key] = $noise_text;
        $this->doc = substr_replace($this->doc, $key, $noise_offset, strlen($noise_text));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1639
// restore noise to html content
// Replaces every "___noise___" placeholder in $text with the text stored for
// it in $this->noise and returns the result. Placeholders with an unknown key
// are replaced by an "UNDEFINED NOISE FOR KEY: <key>" message, and a marker
// too close to the end of $text to carry a key by "NO NUMERIC NOISE KEY".
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    // Where the next search starts. Without it the "UNDEFINED NOISE" message,
    // which repeats the marker, is found again on every pass and the loop
    // never ends.
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
            if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                // Restored text may itself contain placeholders, so rescan it.
                $offset = $pos;
            }
            else
            {
                // Skip past the message to prevent an infinite loop.
                $message = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$message.substr($text, $pos+16);
                $offset = $pos + strlen($message);
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $message = 'NO NUMERIC NOISE KEY';
            $text = substr($text, 0, $pos).$message.substr($text, $pos+11);
            $offset = $pos + strlen($message);
        }
    }
    return $text;
}
1672
// Sometimes we NEED one of the noise elements.
// Returns the first stored noise text that contains $text, or null when
// none does.
public function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    foreach ($this->noise as $candidate)
    {
        if (strpos($candidate, $text) === false)
        {
            continue;
        }
        return $candidate;
    }

    return null;
}
// Casting the parser to a string yields the root node's inner text.
public function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
1691
// Virtual read-only properties: outertext and innertext (both the root's
// inner text), plaintext, charset and target_charset. Any other name gives null.
public function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext')
    {
        return $this->root->innertext();
    }

    if ($name === 'plaintext')
    {
        return $this->root->text();
    }

    if ($name === 'charset')
    {
        return $this->_charset;
    }

    if ($name === 'target_charset')
    {
        return $this->_target_charset;
    }

    return null;
}
1708
// camel naming conventions
// DOM-style camelCase aliases on the document object.

// Forwards to the root node's childNodes().
function childNodes($idx=-1) {return $this->root->childNodes($idx);}

// Forwards to the root node's first_child().
function firstChild() {return $this->root->first_child();}

// Forwards to the root node's last_child().
function lastChild() {return $this->root->last_child();}

// Parses "<name>value</name>" with str_get_html() and returns that document's first child.
function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}

// Parses $value with str_get_html() and returns the last entry of that document's node list.
function createTextNode($value) {return @end(str_get_html($value)->nodes);}

// First match of the "#id" selector.
function getElementById($id) {return $this->find("#$id", 0);}

// Matches of the "#id" selector; $idx is forwarded to find().
function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}

// First match of a tag-name selector.
function getElementByTagName($name) {return $this->find($name, 0);}

// Matches of a tag-name selector; $idx is forwarded to find().
// NOTE(review): the default here is -1 while the node class's method of the
// same name defaults to null - confirm which one is intended.
function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}

// Alias of load_file(). The arguments are spread back out: handing load_file()
// the argument array as a single parameter made file_get_contents() receive
// an array instead of a filename.
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
1720}
1721
1722?> \ No newline at end of file