]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/simple_html_dom.php
close #69: in the config page, you are notified of the release of a new version
[github/wallabag/wallabag.git] / inc / 3rdparty / simple_html_dom.php
CommitLineData
0753bfef 1<?php\r
2/**\r
3 * Website: http://sourceforge.net/projects/simplehtmldom/\r
4 * Additional projects that may be used: http://sourceforge.net/projects/debugobject/\r
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)\r
6 * Contributions by:\r
7 * Yousuke Kumakura (Attribute filters)\r
8 * Vadim Voituk (Negative indexes supports of "find" method)\r
9 * Antcs (Constructor with automatically load contents either text or file/url)\r
10 *\r
11 * all affected sections have comments starting with "PaperG"\r
12 *\r
13 * Paperg - Added case insensitive testing of the value of the selector.\r
14 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.\r
15 * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,\r
16 * it will almost always be smaller by some amount.\r
17 * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from,\r
18 * but for most purposes, it's a really good estimation.\r
19 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.\r
20 * Allow the user to tell us how much they trust the html.\r
21 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.\r
22 * This allows for us to find tags based on the text they contain.\r
23 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.\r
24 * Paperg: added parse_charset so that we know about the character set of the source document.\r
25 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the\r
26 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.\r
27 *\r
28 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.\r
29 * PaperG (John Schlick) Added get_display_size for "IMG" tags.\r
30 *\r
31 * Licensed under The MIT License\r
32 * Redistributions of files must retain the above copyright notice.\r
33 *\r
34 * @author S.C. Chen <me578022@gmail.com>\r
35 * @author John Schlick\r
36 * @author Rus Carroll\r
37 * @version 1.5 ($Rev: 202 $)\r
38 * @package PlaceLocalInclude\r
39 * @subpackage simple_html_dom\r
40 */\r
41\r
42/**\r
43 * All of the Defines for the classes below.\r
44 * @author S.C. Chen <me578022@gmail.com>\r
45 */\r
// Node types stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quoting style recorded per attribute (see HDOM_INFO_QUOTE).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);

// Tunable defaults. These are only defined when the including application has
// not already defined them, so they can be overridden before this file is
// loaded and do not trigger "constant already defined" diagnostics.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', " ");
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);
67// helper functions\r
68// -----------------------------------------------------------------------------\r
69// get html dom from file\r
70// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.\r
/**
 * Build a DOM from the contents of a file or URL.
 *
 * @param string   $url              Path or URL handed to file_get_contents().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Optional stream context, forwarded to file_get_contents().
 * @param int      $offset           Read offset. The legacy default of -1 means "from the start".
 * @param int      $maxLen           Maximum number of bytes to read; a negative value means "no limit".
 * @param bool     $lowercase        Forwarded to the parser constructor and to load().
 * @param bool     $forceTagsClosed  Forwarded to the parser constructor.
 * @param string   $target_charset   Forwarded to the parser constructor.
 * @param bool     $stripRN          Forwarded to the parser constructor and to load().
 * @param string   $defaultBRText    Forwarded to the parser constructor.
 * @param string   $defaultSpanText  Forwarded to the parser constructor.
 * @return simple_html_dom|false The loaded DOM, or false when nothing could be read
 *                               or the contents exceed MAX_FILE_SIZE bytes.
 */
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // -1 is the historical "no offset" sentinel. Since PHP 7.1 a negative offset
    // is counted from the END of the stream, which would return only the last
    // byte of a local file and fail outright on non-seekable streams.
    if ((int) $offset === -1)
    {
        $offset = 0;
    }

    // Only forward a length when the caller asked for one; a negative length is
    // rejected by file_get_contents().
    if ($maxLen !== null && (int) $maxLen >= 0)
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, (int) $maxLen);
    }
    else
    {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }
    // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
    //$contents = retrieve_url_contents($url);

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        // Release the unused parser, as str_get_html() does on failure.
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
87\r
88// get html dom from string\r
/**
 * Build a DOM from an HTML string.
 *
 * A parser is always constructed first; when the input is empty or longer than
 * MAX_FILE_SIZE bytes the parser is cleared again and false is returned.
 *
 * @return simple_html_dom|false
 */
function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $parser = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $usable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($usable)
    {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }

    $parser->clear();
    return false;
}
100\r
101// dump html dom tree\r
/**
 * Print the tree rooted at $node by delegating to the node's own dump().
 *
 * @param object $node      Node exposing dump($show_attr, $deep).
 * @param bool   $show_attr Whether attributes are printed next to each tag.
 * @param int    $deep      Starting indentation depth.
 */
function dump_html_tree($node, $show_attr=true, $deep=0)
{
    // Forward the caller's options; the node itself must not be passed as an argument.
    $node->dump($show_attr, $deep);
}
106\r
107\r
/**
 * simple html dom node
 *
 * One node of the parsed tree. A node registers itself in the owning dom's
 * flat $nodes list on construction and keeps references to its parent, its
 * element children ($children) and all of its child nodes ($nodes).
 *
 * PaperG - added ability for "find" routine to lowercase the value of the selector.
 * PaperG - added $tag_start to track the start position of the tag in the total byte index
 *
 * @package PlaceLocalInclude
 */
class simple_html_dom_node
{
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();
    public $children = array();
    public $nodes = array();
    public $parent = null;
    // The "info" array - see HDOM_INFO_... for what each element contains.
    public $_ = array();
    public $tag_start = 0;
    private $dom = null;

    /**
     * Attach the node to its owning dom and append it to the dom's node list.
     */
    public function __construct($dom)
    {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    public function __destruct()
    {
        $this->clear();
    }

    public function __toString()
    {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    public function clear()
    {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    /**
     * Echo this node's tag (optionally with attributes) followed by its
     * subtree, one node per line, indented by depth.
     */
    public function dump($show_attr=true, $deep=0)
    {
        $lead = str_repeat(' ', $deep);

        echo $lead.$this->tag;
        if ($show_attr && count($this->attr)>0)
        {
            echo '(';
            foreach ($this->attr as $k=>$v)
                echo "[$k]=>\"".$this->$k.'", ';
            echo ')';
        }
        echo "\n";

        if ($this->nodes)
        {
            foreach ($this->nodes as $c)
            {
                $c->dump($show_attr, $deep+1);
            }
        }
    }

    /**
     * Debugging function to dump a single dom node with a bunch of information about it.
     *
     * @param bool $echo When true the description is echoed and nothing is
     *                   returned; when false it is returned as a string.
     * @return string|null
     */
    public function dump_node($echo=true)
    {
        $string = $this->tag;
        if (count($this->attr)>0)
        {
            $string .= '(';
            foreach ($this->attr as $k=>$v)
            {
                $string .= "[$k]=>\"".$this->$k.'", ';
            }
            $string .= ')';
        }
        if (count($this->_)>0)
        {
            $string .= ' $_ (';
            foreach ($this->_ as $k=>$v)
            {
                if (is_array($v))
                {
                    $string .= "[$k]=>(";
                    foreach ($v as $k2=>$v2)
                    {
                        $string .= "[$k2]=>\"".$v2.'", ';
                    }
                    $string .= ")";
                } else {
                    $string .= "[$k]=>\"".$v.'", ';
                }
            }
            $string .= ")";
        }

        if (isset($this->text))
        {
            $string .= " text: (" . $this->text . ")";
        }

        $string .= " HDOM_INNER_INFO: '";
        // Report this node's own inner info (the original read an undefined $node
        // variable here and therefore always printed NULL).
        if (isset($this->_[HDOM_INFO_INNER]))
        {
            $string .= $this->_[HDOM_INFO_INNER] . "'";
        }
        else
        {
            $string .= ' NULL ';
        }

        $string .= " children: " . count($this->children);
        $string .= " nodes: " . count($this->nodes);
        $string .= " tag_start: " . $this->tag_start;
        $string .= "\n";

        if ($echo)
        {
            echo $string;
            return;
        }
        else
        {
            return $string;
        }
    }

    /**
     * Returns the parent of the node.
     * If a node is passed in, it becomes the new parent and this node is
     * appended to that node's $nodes and $children lists.
     */
    public function parent($parent=null)
    {
        // I am SURE that this doesn't work properly.
        // It fails to unset the current node from it's current parents nodes or children list first.
        if ($parent !== null)
        {
            $this->parent = $parent;
            $this->parent->nodes[] = $this;
            $this->parent->children[] = $this;
        }

        return $this->parent;
    }

    // verify that node has children
    public function has_child()
    {
        return !empty($this->children);
    }

    /**
     * Returns all children, or the child at $idx (null when there is none).
     */
    public function children($idx=-1)
    {
        if ($idx===-1)
        {
            return $this->children;
        }
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node
    public function first_child()
    {
        if (count($this->children)>0)
        {
            return $this->children[0];
        }
        return null;
    }

    // returns the last child of node
    public function last_child()
    {
        if (($count=count($this->children))>0)
        {
            return $this->children[$count-1];
        }
        return null;
    }

    /**
     * Returns the next sibling among the parent's children, or null when
     * there is no parent, no following sibling, or this node is not one of
     * the parent's children.
     */
    public function next_sibling()
    {
        if ($this->parent===null)
        {
            return null;
        }

        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
        {
            ++$idx;
        }
        if (++$idx>=$count)
        {
            return null;
        }
        return $this->parent->children[$idx];
    }

    /**
     * Returns the previous sibling among the parent's children, or null when
     * there is no parent, no preceding sibling, or this node is not one of
     * the parent's children.
     */
    public function prev_sibling()
    {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        // Not found in the children list: without this guard the decrement
        // below would land on the parent's last child.
        if ($idx>=$count) return null;
        if (--$idx<0) return null;
        return $this->parent->children[$idx];
    }

    /**
     * Locate a specific ancestor tag in the path to the root.
     * The node itself is included in the comparison; null is returned when
     * no node on the path has the requested tag.
     */
    public function find_ancestor_tag($tag)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

        // Start by including ourselves in the comparison.
        $returnDom = $this;

        while (!is_null($returnDom))
        {
            if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); }

            if ($returnDom->tag == $tag)
            {
                break;
            }
            $returnDom = $returnDom->parent;
        }
        return $returnDom;
    }

    // get dom node's inner html
    public function innertext()
    {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        // $nodes is null after clear(); guard like outertext() and text() do.
        if ($this->nodes)
        {
            foreach ($this->nodes as $n)
                $ret .= $n->outertext();
        }
        return $ret;
    }

    // get dom node's outer text (with tag)
    public function outertext()
    {
        global $debug_object;
        if (is_object($debug_object))
        {
            $text = '';
            if ($this->tag == 'text')
            {
                if (!empty($this->text))
                {
                    $text = " with text: " . $this->text;
                }
            }
            $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
        }

        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom && $this->dom->callback!==null)
        {
            call_user_func_array($this->dom->callback, array($this));
        }

        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
        {
            $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
        } else {
            $ret = "";
        }

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
        {
            // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
            if ($this->tag != "br")
            {
                $ret .= $this->_[HDOM_INFO_INNER];
            }
        } else {
            if ($this->nodes)
            {
                foreach ($this->nodes as $n)
                {
                    $ret .= $this->convert_text($n->outertext());
                }
            }
        }

        // render end tag
        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    // get dom node's plain text
    public function text()
    {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype)
        {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
        // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
        // WHY is this happening?
        if (!is_null($this->nodes))
        {
            foreach ($this->nodes as $n)
            {
                $ret .= $this->convert_text($n->text());
            }

            // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
            if ($this->tag == "span")
            {
                $ret .= $this->dom->default_span_text;
            }
        }
        return $ret;
    }

    // inner text with the CDATA wrapper markers removed
    public function xmltext()
    {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    // build node's text with tag
    public function makeup()
    {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach ($this->attr as $key=>$val)
        {
            // $i indexes the per-attribute spacing/quote info, so it advances
            // even for attributes that are skipped below.
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
            //no value attr: nowrap, checked selected...
            if ($val===true)
                $ret .= $key;
            else {
                switch ($this->_[HDOM_INFO_QUOTE][$i])
                {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }
        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find elements by css selector.
     * PaperG - added ability for find to lowercase the value of the selector.
     *
     * @param string   $selector  Selector string; comma separated groups are unioned.
     * @param int|null $idx       null returns every match as an array; otherwise the
     *                            match at that index (negative counts from the end).
     * @param bool     $lowercase Compare selector values case-insensitively.
     * @return array|simple_html_dom_node|null
     */
    public function find($selector, $idx=null, $lowercase=false)
    {
        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c)
        {
            // The change on the below line was documented on the sourceforge code tracker id 2788009
            // used to be: if (($levle=count($selectors[0]))===0) return array();
            if (($levels=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$levels; ++$l)
            {
                $ret = array();
                foreach ($head as $k=>$v)
                {
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    //PaperG - Pass this optional parameter on to the seek function.
                    $n->seek($selectors[$c][$l], $ret, $lowercase);
                }
                $head = $ret;
            }

            foreach ($head as $k=>$v)
            {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach ($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if (is_null($idx)) return $found;
        else if ($idx<0) $idx = count($found) + $idx;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Seek for given conditions.
     * PaperG - added parameter to allow for case insensitive testing of the value of a selector.
     *
     * @param array $selector  One parsed selector step: array(tag, key, val, exp, no_key).
     * @param array $ret       Filled (by reference) with matching node indexes as keys.
     * @param bool  $lowercase Compare values case-insensitively.
     */
    protected function seek($selector, &$ret, $lowercase=false)
    {
        global $debug_object;
        if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

        list($tag, $key, $val, $exp, $no_key) = $selector;

        // xpath index
        if ($tag && $key && is_numeric($key))
        {
            $count = 0;
            foreach ($this->children as $c)
            {
                if ($tag==='*' || $tag===$c->tag) {
                    if (++$count==$key) {
                        $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                        return;
                    }
                }
            }
            return;
        }

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // No end index of our own: borrow the nearest ancestor's one.
            $parent = $this->parent;
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            // $parent is null when no ancestor carries an end index; there is
            // nothing to add in that case.
            if ($parent!==null) {
                $end += $parent->_[HDOM_INFO_END];
            }
        }

        for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];

            $pass = true;

            if ($tag==='*' && !$key) {
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
            // compare key
            if ($pass && $key) {
                if ($no_key) {
                    if (isset($node->attr[$key])) $pass=false;
                } else {
                    if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
                }
            }
            // compare value
            if ($pass && $key && $val && $val!=='*') {
                // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
                if ($key == "plaintext") {
                    // $node->plaintext actually returns $node->text();
                    $nodeKeyValue = $node->text();
                } else {
                    // this is a normal search, we want the value of that attribute of the tag.
                    $nodeKeyValue = $node->attr[$key];
                }
                if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}

                //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
                if ($lowercase) {
                    $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
                } else {
                    $check = $this->match($exp, $val, $nodeKeyValue);
                }
                if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}

                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach (explode(' ',$node->attr[$key]) as $k) {
                        // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
                        if (!empty($k)) {
                            if ($lowercase) {
                                $check = $this->match($exp, strtolower($val), strtolower($k));
                            } else {
                                $check = $this->match($exp, $val, $k);
                            }
                            if ($check) break;
                        }
                    }
                }
                if (!$check) $pass = false;
            }
            if ($pass) $ret[$i] = 1;
            unset($node);
        }
        // It's passed by reference so this is actually what this function returns.
        if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
    }

    /**
     * Test $value against $pattern using the selector operator $exp
     * ('=', '!=', '^=', '$=' or '*='). Unknown operators never match.
     * For '*=' a pattern starting with '/' is used as a complete regular
     * expression; otherwise it is wrapped as a case-insensitive one.
     */
    protected function match($exp, $pattern, $value) {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                if ($pattern[0]=='/') {
                    return preg_match($pattern, $value);
                }
                return preg_match("/".$pattern."/i", $value);
        }
        return false;
    }

    /**
     * Parse a selector string into a list of selector groups (one per comma
     * separated part), each a list of steps array(tag, key, val, exp, no_key).
     */
    protected function parse_selector($selector_string) {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

        // pattern of CSS selectors, modified from mootools
        // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
        // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
        // Notice the \[ starting the attribute? and the @? following? This implies that an html attribute specifier
        // may start with an @ sign that is NOT captured by the expression.
        // Further study is required to determine if this should be documented or removed.
        //
        // The hyphen is kept LAST inside each character class: written as "\w-:" it is
        // rejected as an invalid range by the PCRE2 versions bundled with PHP 7.3 and later.
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}

        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);
            if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
            // for browser generated xpath
            if ($m[1]==='tbody') continue;

            list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
            if (!empty($m[2])) {$key='id'; $val=$m[2];}
            if (!empty($m[3])) {$key='class'; $val=$m[3];}
            if (!empty($m[4])) {$key=$m[4];}
            if (!empty($m[5])) {$exp=$m[5];}
            if (!empty($m[6])) {$val=$m[6];}

            // convert to lowercase (the cast keeps a missing key from being passed as null)
            if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower((string) $key);}
            //elements that do NOT have the specified attribute
            if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

            $result[] = array($tag, $key, $val, $exp, $no_key);
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;
        return $selectors;
    }

    /**
     * Attribute access: returns the (charset converted) attribute value when
     * set, the computed outertext/innertext/plaintext/xmltext for those names,
     * and otherwise whether the attribute key exists at all.
     */
    public function __get($name) {
        if (isset($this->attr[$name]))
        {
            return $this->convert_text($this->attr[$name]);
        }
        switch ($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Attribute assignment. 'outertext' and 'innertext' overwrite the cached
     * rendering; any other name sets an attribute, registering default spacing
     * and double quoting for attributes that did not exist yet.
     */
    public function __set($name, $value) {
        switch ($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    public function __isset($name) {
        switch ($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    public function __unset($name) {
        if (isset($this->attr[$name]))
            unset($this->attr[$name]);
    }

    /**
     * PaperG - Function to convert the text from one character set to another if the two sets are not the same.
     * A UTF-8 byte order mark at either end of the result is stripped when the
     * target charset is UTF-8.
     */
    public function convert_text($text)
    {
        global $debug_object;
        if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}

        $converted_text = $text;

        $sourceCharset = "";
        $targetCharset = "";

        if ($this->dom)
        {
            $sourceCharset = strtoupper($this->dom->_charset);
            $targetCharset = strtoupper($this->dom->_target_charset);
        }
        if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}

        if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
        {
            // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
            if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text)))
            {
                $converted_text = $text;
            }
            else
            {
                $converted_text = iconv($sourceCharset, $targetCharset, $text);
            }
        }

        // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
        if ($targetCharset == 'UTF-8')
        {
            if (substr($converted_text, 0, 3) == "\xef\xbb\xbf")
            {
                $converted_text = substr($converted_text, 3);
            }
            if (substr($converted_text, -3) == "\xef\xbb\xbf")
            {
                $converted_text = substr($converted_text, 0, -3);
            }
        }

        return $converted_text;
    }

    /**
     * Returns true if $str looks like well-formed UTF-8 and false otherwise.
     *
     * Only the lead-byte / continuation-byte structure is checked (lead bytes
     * up to the historical 6-byte form are accepted).
     *
     * @param mixed $str String to be tested
     * @return boolean
     */
    public static function is_utf8($str)
    {
        $c=0; $b=0;
        $bits=0;
        $len=strlen($str);
        for($i=0; $i<$len; $i++)
        {
            $c=ord($str[$i]);
            // Every byte with the high bit set must start a multi-byte sequence;
            // this includes 0x80 itself, which is a bare continuation byte.
            if($c >= 128)
            {
                if(($c >= 254)) return false;
                elseif($c >= 252) $bits=6;
                elseif($c >= 248) $bits=5;
                elseif($c >= 240) $bits=4;
                elseif($c >= 224) $bits=3;
                elseif($c >= 192) $bits=2;
                else return false;
                if(($i+$bits) > $len) return false;
                while($bits > 1)
                {
                    $i++;
                    $b=ord($str[$i]);
                    if($b < 128 || $b > 191) return false;
                    $bits--;
                }
            }
        }
        return true;
    }

    /**
     * Function to try a few tricks to determine the displayed size of an img on the page.
     * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
     *
     * @author John Schlick
     * @version April 19 2012
     * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
     */
    public function get_display_size()
    {
        global $debug_object;

        $width = -1;
        $height = -1;

        if ($this->tag !== 'img')
        {
            return false;
        }

        // See if there is a height or width attribute in the tag itself.
        if (isset($this->attr['width']))
        {
            $width = $this->attr['width'];
        }

        if (isset($this->attr['height']))
        {
            $height = $this->attr['height'];
        }

        // Now look for an inline style.
        if (isset($this->attr['style']))
        {
            // Thanks to user gnarf from stackoverflow for this regular expression.
            $attributes = array();
            preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
            foreach ($matches as $match) {
                $attributes[$match[1]] = $match[2];
            }

            // If there is a width in the style attributes:
            if (isset($attributes['width']) && $width == -1)
            {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['width'], -2)) == 'px')
                {
                    $proposed_width = substr($attributes['width'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_width, FILTER_VALIDATE_INT))
                    {
                        $width = $proposed_width;
                    }
                }
            }

            // If there is a height in the style attributes:
            if (isset($attributes['height']) && $height == -1)
            {
                // check that the last two characters are px (pixels)
                if (strtolower(substr($attributes['height'], -2)) == 'px')
                {
                    $proposed_height = substr($attributes['height'], 0, -2);
                    // Now make sure that it's an integer and not something stupid.
                    if (filter_var($proposed_height, FILTER_VALIDATE_INT))
                    {
                        $height = $proposed_height;
                    }
                }
            }
        }

        // Future enhancement:
        // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

        // Far future enhancement
        // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
        // Note that in this case, the class or id will have the img subselector for it to apply to the image.

        // ridiculously far future development
        // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

        $result = array('height' => $height,
                        'width' => $width);
        return $result;
    }

    // camel naming conventions
    public function getAllAttributes() {return $this->attr;}
    public function getAttribute($name) {return $this->__get($name);}
    public function setAttribute($name, $value) {$this->__set($name, $value);}
    public function hasAttribute($name) {return $this->__isset($name);}
    public function removeAttribute($name) {$this->__set($name, null);}
    public function getElementById($id) {return $this->find("#$id", 0);}
    public function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    public function getElementByTagName($name) {return $this->find($name, 0);}
    public function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    public function parentNode() {return $this->parent();}
    public function childNodes($idx=-1) {return $this->children($idx);}
    public function firstChild() {return $this->first_child();}
    public function lastChild() {return $this->last_child();}
    public function nextSibling() {return $this->next_sibling();}
    public function previousSibling() {return $this->prev_sibling();}
    public function hasChildNodes() {return $this->has_child();}
    public function nodeName() {return $this->tag;}
    public function appendChild($node) {$node->parent($this); return $node;}

}
963\r
964/**\r
965 * simple html dom parser\r
966 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.\r
967 * Paperg - change $size from protected to public so we can easily access it\r
968 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.\r
969 *\r
970 * @package PlaceLocalInclude\r
971 */\r
972class simple_html_dom\r
973{\r
974 public $root = null;\r
975 public $nodes = array();\r
976 public $callback = null;\r
977 public $lowercase = false;\r
978 // Used to keep track of how large the text was when we started.\r
979 public $original_size;\r
980 public $size;\r
981 protected $pos;\r
982 protected $doc;\r
983 protected $char;\r
984 protected $cursor;\r
985 protected $parent;\r
986 protected $noise = array();\r
987 protected $token_blank = " \t\r\n";\r
988 protected $token_equal = ' =/>';\r
989 protected $token_slash = " />\r\n\t";\r
990 protected $token_attr = ' >';\r
991 // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.\r
992 public $_charset = '';\r
993 public $_target_charset = '';\r
994 protected $default_br_text = "";\r
995 public $default_span_text = "";\r
996\r
997 // use isset instead of in_array, performance boost about 30%...\r
998 protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);\r
999 protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);\r
1000 // Known sourceforge issue #2977341\r
1001 // B tags that are not closed cause us to return everything to the end of the document.\r
1002 protected $optional_closing_tags = array(\r
1003 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),\r
1004 'th'=>array('th'=>1),\r
1005 'td'=>array('td'=>1),\r
1006 'li'=>array('li'=>1),\r
1007 'dt'=>array('dt'=>1, 'dd'=>1),\r
1008 'dd'=>array('dd'=>1, 'dt'=>1),\r
1009 'dl'=>array('dd'=>1, 'dt'=>1),\r
1010 'p'=>array('p'=>1),\r
1011 'nobr'=>array('nobr'=>1),\r
1012 'b'=>array('b'=>1),\r
1013 'option'=>array('option'=>1),\r
1014 );\r
1015\r
/**
 * Create a parser and, when $str is non-empty, load it straight away.
 *
 * $str is treated as a location (passed to load_file()) when it starts with
 * http:// or https:// or names an existing file; otherwise it is parsed as
 * an HTML string via load().
 *
 * @param string|null $str             HTML text, URL or file path; nothing is loaded when empty.
 * @param bool        $lowercase       Passed to load() (string input only).
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied so
 *                                     the parser no longer auto-closes those tags.
 * @param string      $target_charset  Stored in $this->_target_charset.
 * @param bool        $stripRN         Passed to load() (string input only).
 * @param string      $defaultBRText   Passed to load() (string input only).
 * @param string      $defaultSpanText Passed to load() (string input only).
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Apply the options BEFORE loading: parsing happens inside load(), so
    // settings applied afterwards could never influence the resulting tree.
    // Forcing tags to be closed implies that we don't trust the html, but it
    // can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        // The property read by the parser is optional_closing_tags; the old
        // code assigned a misspelled, unused property instead.
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^https?:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
1035\r
/** Release the node graph (via clear()) when the parser object is destroyed. */
function __destruct()
{
    $this->clear();
}
1040\r
/**
 * Load HTML from a string and parse it.
 *
 * Steps: prepare() resets the parser state, a fixed series of remove_noise()
 * passes swaps hard-to-parse regions for placeholders, parse() is run until
 * it returns false, and finally parse_charset() is called.
 *
 * @return simple_html_dom $this, so the call can be chained.
 */
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Each entry: array(pattern, remove_tag flag). The order matters.
    // Per sourceforge tracker item 2949097, script removal precedes style removal.
    $noise_passes = array(
        array("'<!--(.*?)-->'is", false),                                  // comments
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),                          // cdata
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is", false),   // script tags with attributes
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is", false),         // bare script tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),     // style tags with attributes
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),           // bare style tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),   // preformatted (code) tags
        array("'(<\?)(.*?)(\?>)'s", true),                                 // server side scripts
        array("'(\{\w)(.*?)(\})'s", true),                                 // smarty scripts
    );
    foreach ($noise_passes as $pass) {
        $this->remove_noise($pass[0], $pass[1]);
    }

    // Consume the document one text run / tag at a time.
    do {
        $more = $this->parse();
    } while ($more);

    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
1077\r
/**
 * Load HTML from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * @return false|null false when the contents could not be read (the parser
 *                    is cleared in that case); null after a successful load.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Test the read itself. error_get_last() also reports stale errors raised
    // long before this call, which made successful loads look like failures,
    // and a failed read (false) must not be handed to load().
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1089\r
/**
 * Store a callback in $this->callback.
 *
 * @param mixed $function_name Value stored as-is; it is not validated here.
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1095\r
/** Drop any stored callback by resetting $this->callback to null. */
function remove_callback()
{
    $this->callback = null;
}
1101\r
/**
 * Serialize the DOM to a string, optionally writing it to a file as well.
 *
 * @param string $filepath When not the empty string, the markup is also
 *                         written there with an exclusive lock.
 * @return string The root node's innertext().
 */
function save($filepath='')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
1109\r
/**
 * Find nodes by CSS selector; delegates to the root node's find().
 *
 * Paperg: $lowercase lets the caller request case insensitive testing of
 * the selector value.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1116\r
/**
 * Clean up memory (works around the php5 circular reference memory leak).
 *
 * Clears every node in $this->nodes, then children/parent/root when they
 * are set, and finally unsets the document text and the noise table.
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // Documented in the sourceforge repository (2977248) as a fix for memory
    // leaks that occurred even when clear() was used.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
1128\r
/** Delegate to the root node's dump(), passing $show_attr through. */
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1133\r
/**
 * Prepare the HTML data and initialise all parser state.
 *
 * Clears any previous document, records the original and working sizes,
 * optionally replaces CR and LF with spaces, resets the position/cursor,
 * and creates a fresh root node that becomes the current parent.
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Length of the content before anything is done to it; kept in
    // original_size because it might be useful to someone.
    $this->size = strlen($str);
    $this->original_size = $this->size;

    if ($stripRN) {
        // Replace every \r and \n with a space before storing the document,
        // then refresh the size since the content has been rewritten.
        $str = str_replace(array("\r", "\n"), " ", $str);
        $this->size = strlen($str);
    }

    // scanner state
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // options
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // root node
    $root = new simple_html_dom_node($this);
    $this->root = $root;
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1168\r
/**
 * Parse the next piece of the document.
 *
 * When no text precedes the next '<' the work is handed to read_tag() and
 * its result is returned; otherwise the text run becomes a new node linked
 * under the current parent.
 *
 * @return bool true after adding a text node, else whatever read_tag() returns.
 */
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);

    return true;
}
1184\r
/**
 * PAPERG - dkchou: try to identify the character set of the page that was
 * just parsed, so it can be converted properly on output later.
 *
 * Sources are tried in this order:
 *  1. If a function named get_last_retrieve_url_contents_content_type() exists,
 *     its return value is searched for "charset=...".
 *  2. The content attribute of a meta[http-equiv=Content-Type] element;
 *     if it has no charset, ISO-8859-1 is assumed.
 *  3. mb_detect_encoding() on the plain text, when mbstring is available.
 *  4. UTF-8 as the last resort.
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by CP1252.
 *
 * @return string The detected charset, also stored in $this->_charset.
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
        if ($success)
        {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset))
    {
        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($el))
        {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue))
            {
                $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
                if ($success)
                {
                    $charset = $matches[1];
                }
                else
                {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset))
    {
        $charset = false;

        // mbstring is an optional extension: calling mb_detect_encoding()
        // unguarded is a fatal error on installations that lack it.
        if (function_exists('mb_detect_encoding'))
        {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}
        }

        // and if this doesn't work... then we just assume UTF-8 so that we can move on - this will usually give us most of what we need...
        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true))
    {
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1257\r
/**
 * Read one tag starting at the current '<' and add it to the tree.
 *
 * Handles, in order: end tags (including recovery when the end tag does not
 * match the current parent), "<!..." constructs (comments / unknown),
 * stray '<' that turn out to be text, and finally start tags with their
 * attributes and optional self-closing marker.
 *
 * @return bool false when the current character is not '<' (the root's end
 *              is then set to the cursor); true after a node has been
 *              handled and parsing may continue.
 */
protected function read_tag()
{
    if ($this->char!=='<')
    {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/')
    {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower)
        {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // climb until an ancestor with the closing tag's name is found (or the top is reached)
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower)
                {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: the "tag name" itself contains another '<'
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep it as text.
    // The hyphen is placed last in the character class on purpose: "[\w-:]"
    // is rejected as an invalid range by PCRE2 (PHP 7.3+), which made
    // preg_match() fail and turned every start tag into text.
    if (!preg_match("/^[\w:-]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags: implicitly close open ancestors that this tag terminates
    if (isset($this->optional_closing_tags[$tag_lower]) )
    {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
        {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinite loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do
    {
        if ($this->char!==null && $space[0]==='')
        {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos)
        {
            // no progress since the last round: step over one character
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/')
    {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else
    {
        // reset parent: tags listed in self_closing_tags never become the current parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set its text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br")
    {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1482\r
/**
 * Parse the value of attribute $name (the scanner sits just after '=').
 *
 * Per sourceforge tracker item 3061408, when the attribute is already
 * defined on the tag only the first definition counts: this method then
 * returns immediately without touching the scanner.
 *
 * Records the quoting style in HDOM_INFO_QUOTE, stores the whitespace that
 * precedes the value in $space[2], strips \r and \n from the value, and
 * trims the value of a "class" attribute.
 *
 * @param simple_html_dom_node $node  Node receiving the attribute.
 * @param string               $name  Attribute name.
 * @param array                $space Whitespace triple, updated by reference.
 */
protected function parse_attr($node, $name, &$space)
{
    if (isset($node->attr[$name]))
    {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $delimiter = $this->char;
    if ($delimiter === '"' || $delimiter === '\'')
    {
        // quoted value: step over the opening quote, read up to the matching
        // unescaped quote, then step over the closing quote
        $node->_[HDOM_INFO_QUOTE][] = ($delimiter === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $value = $this->restore_noise($this->copy_until_char_escape($delimiter));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }
    else
    {
        // unquoted value: read up to the next character in token_attr
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $value = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), "", $value);

    // PaperG: for "class", drop leading and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $value = trim($value);
    }

    $node->attr[$name] = $value;
}
1519\r
/**
 * Attach $node to the current parent.
 *
 * The node is always appended to the parent's nodes list; it is also
 * appended to the parent's children list when $is_child is truthy.
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child)
    {
        return;
    }
    $owner->children[] = $node;
}
1530\r
/**
 * Record an end tag as a text node instead of closing anything.
 *
 * The node's text is the end-tag markup rebuilt from $tag; the scanner is
 * then advanced by one character.
 *
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // next
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
    return true;
}
1541\r
/**
 * Advance the scanner past any run of characters contained in $chars and
 * refresh $this->char (null once the end of the document is reached).
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
1547\r
/**
 * Like skip(), but also return the characters that were skipped.
 *
 * @return string The skipped run, or '' when nothing matched.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    $this->pos += $run;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // explicit empty-string result when nothing was skipped
    return ($run === 0) ? '' : substr($this->doc, $start, $run);
}
1557\r
/**
 * Copy characters up to (not including) the first character contained in
 * $chars, advancing the scanner to that character.
 *
 * @return mixed The result of substr() over the copied span.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $span = strcspn($this->doc, $chars, $start);

    $this->pos += $span;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $span);
}
1566\r
/**
 * Copy everything up to (not including) the next occurrence of $char.
 *
 * Returns '' when the scanner is already exhausted or already sits on
 * $char. When $char does not occur again, the rest of the document is
 * returned and the scanner is moved to the end (char becomes null).
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        // not found: consume the remainder of the document
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
1584\r
/**
 * Same as copy_until_char(), except that an occurrence of $char directly
 * preceded by a backslash is skipped and the search continues after it.
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $origin = $this->pos;
    $search_from = $origin;

    for (;;)
    {
        $hit = strpos($this->doc, $char, $search_from);

        if ($hit === false)
        {
            // not found: consume the remainder of the document
            $rest = substr($this->doc, $origin, $this->size - $origin);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $origin) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\')
        {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $origin, $hit - $origin);
        }

        // escaped occurrence: keep looking behind it
        $search_from = $hit + 1;
    }
}
1613\r
/**
 * Remove noise from the html content, saving it in $this->noise.
 *
 * Every match of $pattern is replaced in the document by a generated
 * placeholder key, and the replaced text is stored under that key.
 *
 * @param string $pattern    Regular expression handed to preg_match_all().
 * @param bool   $remove_tag true replaces the whole match (group 0);
 *                           false replaces only capture group 1.
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $found = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = ($remove_tag) ? 0 : 1;

    // Walk the matches last-to-first so that the recorded offsets of the
    // earlier matches are still valid after each replacement.
    for ($i = $found - 1; $i >= 0; $i--)
    {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }

        $captured = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $captured;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1639\r
/**
 * Restore noise to html content: swap every placeholder in $text back for
 * the text stored under it in $this->noise.
 *
 * A placeholder is the marker "___noise___" followed by five characters.
 * Restored text is scanned again, so placeholders nested inside it are
 * restored too. A placeholder whose key is unknown is replaced by
 * "UNDEFINED NOISE FOR KEY: <key>"; a marker with fewer than five characters
 * after it is replaced by "NO NUMERIC NOISE KEY".
 *
 * @param string $text Text that may contain placeholders.
 * @return string The text with placeholders resolved.
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $undefined_prefix = 'UNDEFINED NOISE FOR KEY: ';
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            $key = substr($text, $pos, 16);
            if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                // rescan from here: the restored text may itself contain placeholders
                $offset = $pos;
            }
            else
            {
                $text = substr($text, 0, $pos).$undefined_prefix.$key . substr($text, $pos+16);
                // The message repeats the key, which contains the marker.
                // Resume AFTER it; searching from the start again found the
                // same key forever and grew the string until memory ran out.
                $offset = $pos + strlen($undefined_prefix) + 16;
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
            $offset = $pos;
        }
    }
    return $text;
}
1672\r
/**
 * Sometimes we NEED one of the noise elements: return the first stored
 * noise entry that contains $text.
 *
 * @return string|null The matching noise text, or null when none contains $text.
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    foreach ($this->noise as $candidate)
    {
        if (strpos($candidate, $text) === false)
        {
            continue;
        }
        return $candidate;
    }

    return null;
}
/** String conversion yields the root node's innertext(). */
function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
1691\r
/**
 * Magic read access for a handful of virtual properties.
 *
 * outertext and innertext both give the root's innertext(), plaintext gives
 * the root's text(), charset and target_charset expose the matching
 * underscore-prefixed properties. Any other name yields null.
 */
function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
    return null;
}
1708\r
// camelCase aliases: thin forwards to the root node, to find(), or to str_get_html().

/** Forward to the root node's childNodes($idx); $idx defaults to -1. */
function childNodes($idx=-1)
{
    return $this->root->childNodes($idx);
}

/** Forward to the root node's first_child(). */
function firstChild()
{
    return $this->root->first_child();
}

/** Forward to the root node's last_child(). */
function lastChild()
{
    return $this->root->last_child();
}

/** Parse "<$name>$value</$name>" with str_get_html() and return its first_child(); errors are suppressed. */
function createElement($name, $value=null)
{
    return @str_get_html("<$name>$value</$name>")->first_child();
}

/** Parse $value with str_get_html() and return the last entry of its nodes list; errors are suppressed. */
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}

/** Forward to find() with an id selector, asking for index 0. */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** Forward to find() with an id selector and the caller's index (null by default). */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}

/** Forward to find($name, 0). */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}
/**
 * Forward to find($name, $idx).
 *
 * $idx now defaults to null, the same default as find() and as the other
 * plural lookup here, getElementsById(). The previous default of -1 was
 * passed to find() as an explicit index.
 * NOTE(review): find() presumably treats a negative index as "count from
 * the end", which would have made the old default select a single element -
 * confirm against simple_html_dom_node::find().
 */
function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}
/**
 * camelCase alias of load_file().
 *
 * The arguments are spread back out when forwarding: passing the collected
 * array as one argument made load_file() hand an array to
 * file_get_contents() as the filename.
 *
 * @return mixed Whatever load_file() returns.
 */
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}
1720}\r
1721\r
1722?>