aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/simple_html_dom.php
diff options
context:
space:
mode:
authortcitworld <tcit@tcit.fr>2014-06-25 18:54:39 +0200
committertcitworld <tcit@tcit.fr>2014-06-25 18:54:39 +0200
commit69213014d1fb8f05cffe7bf83467d938a282f29b (patch)
treeaed7dc63fed6c360fdcdfccf0fa8d74bb77d1048 /inc/3rdparty/simple_html_dom.php
parentc9563378eaa061a339ca5c0aa9e4a45f98e50c9f (diff)
parentaa126ba458a02e8b1e43b15fc28f550ee72a9619 (diff)
downloadwallabag-69213014d1fb8f05cffe7bf83467d938a282f29b.tar.gz
wallabag-69213014d1fb8f05cffe7bf83467d938a282f29b.tar.zst
wallabag-69213014d1fb8f05cffe7bf83467d938a282f29b.zip
Merge pull request #736 from mariroz/dev
fix of issue #718: Error parsing file imported from Pocket #718
Diffstat (limited to 'inc/3rdparty/simple_html_dom.php')
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/simple_html_dom.php105
1 files changed, 65 insertions, 40 deletions
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
index 43b94e57..9b73b105 100644..100755
--- a/inc/3rdparty/simple_html_dom.php
+++ b/inc/3rdparty/simple_html_dom.php
@@ -34,7 +34,7 @@
34 * @author S.C. Chen <me578022@gmail.com> 34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick 35 * @author John Schlick
36 * @author Rus Carroll 36 * @author Rus Carroll
37 * @version 1.5 ($Rev: 202 $) 37 * @version 1.5 ($Rev: 210 $)
38 * @package PlaceLocalInclude 38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom 39 * @subpackage simple_html_dom
40 */ 40 */
@@ -269,7 +269,10 @@ class simple_html_dom_node
269 { 269 {
270 return $this->children; 270 return $this->children;
271 } 271 }
272 if (isset($this->children[$idx])) return $this->children[$idx]; 272 if (isset($this->children[$idx]))
273 {
274 return $this->children[$idx];
275 }
273 return null; 276 return null;
274 } 277 }
275 278
@@ -330,14 +333,14 @@ class simple_html_dom_node
330 function find_ancestor_tag($tag) 333 function find_ancestor_tag($tag)
331 { 334 {
332 global $debug_object; 335 global $debug_object;
333 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 336 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
334 337
335 // Start by including ourselves in the comparison. 338 // Start by including ourselves in the comparison.
336 $returnDom = $this; 339 $returnDom = $this;
337 340
338 while (!is_null($returnDom)) 341 while (!is_null($returnDom))
339 { 342 {
340 if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); } 343 if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }
341 344
342 if ($returnDom->tag == $tag) 345 if ($returnDom->tag == $tag)
343 { 346 {
@@ -374,7 +377,7 @@ class simple_html_dom_node
374 $text = " with text: " . $this->text; 377 $text = " with text: " . $this->text;
375 } 378 }
376 } 379 }
377 $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text); 380 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
378 } 381 }
379 382
380 if ($this->tag==='root') return $this->innertext(); 383 if ($this->tag==='root') return $this->innertext();
@@ -532,7 +535,9 @@ class simple_html_dom_node
532 foreach ($head as $k=>$v) 535 foreach ($head as $k=>$v)
533 { 536 {
534 if (!isset($found_keys[$k])) 537 if (!isset($found_keys[$k]))
538 {
535 $found_keys[$k] = 1; 539 $found_keys[$k] = 1;
540 }
536 } 541 }
537 } 542 }
538 543
@@ -554,7 +559,7 @@ class simple_html_dom_node
554 protected function seek($selector, &$ret, $lowercase=false) 559 protected function seek($selector, &$ret, $lowercase=false)
555 { 560 {
556 global $debug_object; 561 global $debug_object;
557 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 562 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
558 563
559 list($tag, $key, $val, $exp, $no_key) = $selector; 564 list($tag, $key, $val, $exp, $no_key) = $selector;
560 565
@@ -615,7 +620,7 @@ class simple_html_dom_node
615 // this is a normal search, we want the value of that attribute of the tag. 620 // this is a normal search, we want the value of that attribute of the tag.
616 $nodeKeyValue = $node->attr[$key]; 621 $nodeKeyValue = $node->attr[$key];
617 } 622 }
618 if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 623 if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
619 624
620 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 625 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
621 if ($lowercase) { 626 if ($lowercase) {
@@ -623,7 +628,7 @@ class simple_html_dom_node
623 } else { 628 } else {
624 $check = $this->match($exp, $val, $nodeKeyValue); 629 $check = $this->match($exp, $val, $nodeKeyValue);
625 } 630 }
626 if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));} 631 if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}
627 632
628 // handle multiple class 633 // handle multiple class
629 if (!$check && strcasecmp($key, 'class')===0) { 634 if (!$check && strcasecmp($key, 'class')===0) {
@@ -645,12 +650,12 @@ class simple_html_dom_node
645 unset($node); 650 unset($node);
646 } 651 }
647 // It's passed by reference so this is actually what this function returns. 652 // It's passed by reference so this is actually what this function returns.
648 if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);} 653 if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
649 } 654 }
650 655
651 protected function match($exp, $pattern, $value) { 656 protected function match($exp, $pattern, $value) {
652 global $debug_object; 657 global $debug_object;
653 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 658 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
654 659
655 switch ($exp) { 660 switch ($exp) {
656 case '=': 661 case '=':
@@ -672,7 +677,7 @@ class simple_html_dom_node
672 677
673 protected function parse_selector($selector_string) { 678 protected function parse_selector($selector_string) {
674 global $debug_object; 679 global $debug_object;
675 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 680 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
676 681
677 // pattern of CSS selectors, modified from mootools 682 // pattern of CSS selectors, modified from mootools
678 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. 683 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
@@ -683,7 +688,7 @@ class simple_html_dom_node
683// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 688// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
684 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 689 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
685 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 690 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
686 if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);} 691 if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}
687 692
688 $selectors = array(); 693 $selectors = array();
689 $result = array(); 694 $result = array();
@@ -718,12 +723,14 @@ class simple_html_dom_node
718 return $selectors; 723 return $selectors;
719 } 724 }
720 725
721 function __get($name) { 726 function __get($name)
727 {
722 if (isset($this->attr[$name])) 728 if (isset($this->attr[$name]))
723 { 729 {
724 return $this->convert_text($this->attr[$name]); 730 return $this->convert_text($this->attr[$name]);
725 } 731 }
726 switch ($name) { 732 switch ($name)
733 {
727 case 'outertext': return $this->outertext(); 734 case 'outertext': return $this->outertext();
728 case 'innertext': return $this->innertext(); 735 case 'innertext': return $this->innertext();
729 case 'plaintext': return $this->text(); 736 case 'plaintext': return $this->text();
@@ -732,22 +739,30 @@ class simple_html_dom_node
732 } 739 }
733 } 740 }
734 741
735 function __set($name, $value) { 742 function __set($name, $value)
736 switch ($name) { 743 {
744 global $debug_object;
745 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
746
747 switch ($name)
748 {
737 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 749 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
738 case 'innertext': 750 case 'innertext':
739 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 751 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
740 return $this->_[HDOM_INFO_INNER] = $value; 752 return $this->_[HDOM_INFO_INNER] = $value;
741 } 753 }
742 if (!isset($this->attr[$name])) { 754 if (!isset($this->attr[$name]))
755 {
743 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 756 $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
744 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 757 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
745 } 758 }
746 $this->attr[$name] = $value; 759 $this->attr[$name] = $value;
747 } 760 }
748 761
749 function __isset($name) { 762 function __isset($name)
750 switch ($name) { 763 {
764 switch ($name)
765 {
751 case 'outertext': return true; 766 case 'outertext': return true;
752 case 'innertext': return true; 767 case 'innertext': return true;
753 case 'plaintext': return true; 768 case 'plaintext': return true;
@@ -765,7 +780,7 @@ class simple_html_dom_node
765 function convert_text($text) 780 function convert_text($text)
766 { 781 {
767 global $debug_object; 782 global $debug_object;
768 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 783 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
769 784
770 $converted_text = $text; 785 $converted_text = $text;
771 786
@@ -777,7 +792,7 @@ class simple_html_dom_node
777 $sourceCharset = strtoupper($this->dom->_charset); 792 $sourceCharset = strtoupper($this->dom->_charset);
778 $targetCharset = strtoupper($this->dom->_target_charset); 793 $targetCharset = strtoupper($this->dom->_target_charset);
779 } 794 }
780 if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 795 if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
781 796
782 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 797 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
783 { 798 {
@@ -1045,10 +1060,10 @@ class simple_html_dom
1045 1060
1046 // prepare 1061 // prepare
1047 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1062 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1048 // strip out comments
1049 $this->remove_noise("'<!--(.*?)-->'is");
1050 // strip out cdata 1063 // strip out cdata
1051 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1064 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1065 // strip out comments
1066 $this->remove_noise("'<!--(.*?)-->'is");
1052 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1067 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1053 // Script tags removal now preceeds style tag removal. 1068 // Script tags removal now preceeds style tag removal.
1054 // strip out <script> tags 1069 // strip out <script> tags
@@ -1078,10 +1093,15 @@ class simple_html_dom
1078 // load html from file 1093 // load html from file
1079 function load_file() 1094 function load_file()
1080 { 1095 {
1096 //external error: NOT related to dom loading
1097 $extError=error_get_last();
1098
1081 $args = func_get_args(); 1099 $args = func_get_args();
1082 $this->load(call_user_func_array('file_get_contents', $args), true); 1100 $this->load(call_user_func_array('file_get_contents', $args), true);
1101
1083 // Throw an error if we can't properly load the dom. 1102 // Throw an error if we can't properly load the dom.
1084 if (($error=error_get_last())!==null) { 1103 $error=error_get_last();
1104 if ($error!==$extError) {
1085 $this->clear(); 1105 $this->clear();
1086 return false; 1106 return false;
1087 } 1107 }
@@ -1198,22 +1218,22 @@ class simple_html_dom
1198 if ($success) 1218 if ($success)
1199 { 1219 {
1200 $charset = $matches[1]; 1220 $charset = $matches[1];
1201 if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);} 1221 if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
1202 } 1222 }
1203 1223
1204 } 1224 }
1205 1225
1206 if (empty($charset)) 1226 if (empty($charset))
1207 { 1227 {
1208 $el = $this->root->find('meta[http-equiv=Content-Type]',0); 1228 $el = $this->root->find('meta[http-equiv=Content-Type]',0, true);
1209 if (!empty($el)) 1229 if (!empty($el))
1210 { 1230 {
1211 $fullvalue = $el->content; 1231 $fullvalue = $el->content;
1212 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);} 1232 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}
1213 1233
1214 if (!empty($fullvalue)) 1234 if (!empty($fullvalue))
1215 { 1235 {
1216 $success = preg_match('/charset=(.+)/', $fullvalue, $matches); 1236 $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
1217 if ($success) 1237 if ($success)
1218 { 1238 {
1219 $charset = $matches[1]; 1239 $charset = $matches[1];
@@ -1221,7 +1241,7 @@ class simple_html_dom
1221 else 1241 else
1222 { 1242 {
1223 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1243 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1224 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1244 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
1225 $charset = 'ISO-8859-1'; 1245 $charset = 'ISO-8859-1';
1226 } 1246 }
1227 } 1247 }
@@ -1231,14 +1251,19 @@ class simple_html_dom
1231 // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1251 // If we couldn't find a charset above, then lets try to detect one based on the text we got...
1232 if (empty($charset)) 1252 if (empty($charset))
1233 { 1253 {
1234 // Have php try to detect the encoding from the text given to us. 1254 // Use this in case mb_detect_charset isn't installed/loaded on this machine.
1235 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1255 $charset = false;
1236 if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);} 1256 if (function_exists('mb_detect_encoding'))
1257 {
1258 // Have php try to detect the encoding from the text given to us.
1259 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1260 if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
1261 }
1237 1262
1238 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1263 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1239 if ($charset === false) 1264 if ($charset === false)
1240 { 1265 {
1241 if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');} 1266 if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
1242 $charset = 'UTF-8'; 1267 $charset = 'UTF-8';
1243 } 1268 }
1244 } 1269 }
@@ -1246,11 +1271,11 @@ class simple_html_dom
1246 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1271 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1247 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1272 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1248 { 1273 {
1249 if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} 1274 if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
1250 $charset = 'CP1252'; 1275 $charset = 'CP1252';
1251 } 1276 }
1252 1277
1253 if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);} 1278 if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}
1254 1279
1255 return $this->_charset = $charset; 1280 return $this->_charset = $charset;
1256 } 1281 }
@@ -1616,14 +1641,14 @@ class simple_html_dom
1616 protected function remove_noise($pattern, $remove_tag=false) 1641 protected function remove_noise($pattern, $remove_tag=false)
1617 { 1642 {
1618 global $debug_object; 1643 global $debug_object;
1619 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1644 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1620 1645
1621 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1646 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
1622 1647
1623 for ($i=$count-1; $i>-1; --$i) 1648 for ($i=$count-1; $i>-1; --$i)
1624 { 1649 {
1625 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1650 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
1626 if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); } 1651 if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }
1627 $idx = ($remove_tag) ? 0 : 1; 1652 $idx = ($remove_tag) ? 0 : 1;
1628 $this->noise[$key] = $matches[$i][$idx][0]; 1653 $this->noise[$key] = $matches[$i][$idx][0];
1629 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1654 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
@@ -1641,7 +1666,7 @@ class simple_html_dom
1641 function restore_noise($text) 1666 function restore_noise($text)
1642 { 1667 {
1643 global $debug_object; 1668 global $debug_object;
1644 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1669 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1645 1670
1646 while (($pos=strpos($text, '___noise___'))!==false) 1671 while (($pos=strpos($text, '___noise___'))!==false)
1647 { 1672 {
@@ -1649,7 +1674,7 @@ class simple_html_dom
1649 if (strlen($text) > $pos+15) 1674 if (strlen($text) > $pos+15)
1650 { 1675 {
1651 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1676 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
1652 if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); } 1677 if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }
1653 1678
1654 if (isset($this->noise[$key])) 1679 if (isset($this->noise[$key]))
1655 { 1680 {
@@ -1674,7 +1699,7 @@ class simple_html_dom
1674 function search_noise($text) 1699 function search_noise($text)
1675 { 1700 {
1676 global $debug_object; 1701 global $debug_object;
1677 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1702 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1678 1703
1679 foreach($this->noise as $noiseElement) 1704 foreach($this->noise as $noiseElement)
1680 { 1705 {