diff options
author | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-07-15 11:49:24 +0200 |
---|---|---|
committer | Nicolas Lœuillet <nicolas@loeuillet.org> | 2014-07-15 11:49:24 +0200 |
commit | 0f6273cdb8c77436593782d42f271fddc7a7875d (patch) | |
tree | cd6e0959768f5fac7eac054572a97b3a30674af2 /inc/3rdparty | |
parent | a9f5e572dde4f986a498d2fbe92a38a1b22f9595 (diff) | |
parent | 26452f891f3ba75f2636733dbfe943535636df06 (diff) | |
download | wallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.tar.gz wallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.tar.zst wallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.zip |
Merge pull request #761 from wallabag/dev1.7.1
1.7.1
Diffstat (limited to 'inc/3rdparty')
-rw-r--r-- | inc/3rdparty/libraries/PHPePub/EPub.php | 8 | ||||
-rwxr-xr-x | inc/3rdparty/libraries/feedwriter/FeedWriter.php | 1 | ||||
-rwxr-xr-x[-rw-r--r--] | inc/3rdparty/libraries/readability/Readability.php | 17 | ||||
-rwxr-xr-x | inc/3rdparty/makefulltextfeed.php | 8 | ||||
-rwxr-xr-x | inc/3rdparty/makefulltextfeedHelpers.php | 10 | ||||
-rwxr-xr-x[-rw-r--r--] | inc/3rdparty/simple_html_dom.php | 105 | ||||
-rw-r--r-- | inc/3rdparty/site_config/standard/.about.com.txt | 14 | ||||
-rw-r--r-- | inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt | 9 | ||||
-rwxr-xr-x[-rw-r--r--] | inc/3rdparty/site_config/standard/politico.com.txt | 4 |
9 files changed, 131 insertions, 45 deletions
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php index f1f41bd5..d9b990b7 100644 --- a/inc/3rdparty/libraries/PHPePub/EPub.php +++ b/inc/3rdparty/libraries/PHPePub/EPub.php | |||
@@ -41,6 +41,8 @@ class EPub { | |||
41 | 41 | ||
42 | private $bookVersion = EPub::BOOK_VERSION_EPUB2; | 42 | private $bookVersion = EPub::BOOK_VERSION_EPUB2; |
43 | 43 | ||
44 | private $debugInside = FALSE; | ||
45 | |||
44 | public $maxImageWidth = 768; | 46 | public $maxImageWidth = 768; |
45 | public $maxImageHeight = 1024; | 47 | public $maxImageHeight = 1024; |
46 | 48 | ||
@@ -132,10 +134,14 @@ class EPub { | |||
132 | * | 134 | * |
133 | * @return void | 135 | * @return void |
134 | */ | 136 | */ |
135 | function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) { | 137 | function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $debugInside = FALSE, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) { |
136 | include_once("Zip.php"); | 138 | include_once("Zip.php"); |
137 | include_once("Logger.php"); | 139 | include_once("Logger.php"); |
138 | 140 | ||
141 | if (!$debugInside) { | ||
142 | error_reporting(E_ERROR | E_PARSE); | ||
143 | } | ||
144 | |||
139 | $this->bookVersion = $bookVersion; | 145 | $this->bookVersion = $bookVersion; |
140 | $this->writingDirection = $writingDirection; | 146 | $this->writingDirection = $writingDirection; |
141 | $this->languageCode = $languageCode; | 147 | $this->languageCode = $languageCode; |
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php index aa064afb..9446cddf 100755 --- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php +++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php | |||
@@ -2,6 +2,7 @@ | |||
2 | define('RSS2', 1, true); | 2 | define('RSS2', 1, true); |
3 | define('JSON', 2, true); | 3 | define('JSON', 2, true); |
4 | define('JSONP', 3, true); | 4 | define('JSONP', 3, true); |
5 | define('ATOM', 4, true); | ||
5 | 6 | ||
6 | /** | 7 | /** |
7 | * Univarsel Feed Writer class | 8 | * Univarsel Feed Writer class |
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php index d0f09d74..4fa3ba63 100644..100755 --- a/inc/3rdparty/libraries/readability/Readability.php +++ b/inc/3rdparty/libraries/readability/Readability.php | |||
@@ -679,6 +679,7 @@ class Readability | |||
679 | } else { | 679 | } else { |
680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; | 680 | $topCandidate->innerHTML = $page->documentElement->innerHTML; |
681 | $page->documentElement->innerHTML = ''; | 681 | $page->documentElement->innerHTML = ''; |
682 | $this->reinitBody(); | ||
682 | $page->documentElement->appendChild($topCandidate); | 683 | $page->documentElement->appendChild($topCandidate); |
683 | } | 684 | } |
684 | } else { | 685 | } else { |
@@ -794,8 +795,7 @@ class Readability | |||
794 | { | 795 | { |
795 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 | 796 | // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 |
796 | // in the meantime, we check and create an empty element if it's not there. | 797 | // in the meantime, we check and create an empty element if it's not there. |
797 | if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); | 798 | $this->reinitBody(); |
798 | $this->body->innerHTML = $this->bodyCache; | ||
799 | 799 | ||
800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { | 800 | if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { |
801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); | 801 | $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); |
@@ -1134,5 +1134,18 @@ class Readability | |||
1134 | public function removeFlag($flag) { | 1134 | public function removeFlag($flag) { |
1135 | $this->flags = $this->flags & ~$flag; | 1135 | $this->flags = $this->flags & ~$flag; |
1136 | } | 1136 | } |
1137 | |||
1138 | /** | ||
1139 | * Will recreate previously deleted body property | ||
1140 | * | ||
1141 | * @return void | ||
1142 | */ | ||
1143 | protected function reinitBody() { | ||
1144 | if (!isset($this->body->childNodes)) { | ||
1145 | $this->body = $this->dom->createElement('body'); | ||
1146 | $this->body->innerHTML = $this->bodyCache; | ||
1147 | } | ||
1148 | } | ||
1149 | |||
1137 | } | 1150 | } |
1138 | ?> \ No newline at end of file | 1151 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php index 7a56be8c..a081f88b 100755 --- a/inc/3rdparty/makefulltextfeed.php +++ b/inc/3rdparty/makefulltextfeed.php | |||
@@ -28,7 +28,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. | |||
28 | // Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article | 28 | // Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article |
29 | // For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage | 29 | // For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage |
30 | 30 | ||
31 | error_reporting(E_ALL ^ E_NOTICE); | 31 | //error_reporting(E_ALL ^ E_NOTICE); |
32 | ini_set("display_errors", 1); | 32 | ini_set("display_errors", 1); |
33 | @set_time_limit(120); | 33 | @set_time_limit(120); |
34 | 34 | ||
@@ -671,7 +671,11 @@ foreach ($items as $key => $item) { | |||
671 | $html .= $item->get_description(); | 671 | $html .= $item->get_description(); |
672 | } else { | 672 | } else { |
673 | $readability->clean($content_block, 'select'); | 673 | $readability->clean($content_block, 'select'); |
674 | if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); | 674 | // get base URL |
675 | $base_url = get_base_url($readability->dom); | ||
676 | if (!$base_url) $base_url = $effective_url; | ||
677 | // rewrite URLs | ||
678 | if ($options->rewrite_relative_urls) makeAbsolute($base_url, $content_block); | ||
675 | // footnotes | 679 | // footnotes |
676 | if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { | 680 | if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { |
677 | $readability->addFootnotes($content_block); | 681 | $readability->addFootnotes($content_block); |
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php index 4e985372..ac872ab8 100755 --- a/inc/3rdparty/makefulltextfeedHelpers.php +++ b/inc/3rdparty/makefulltextfeedHelpers.php | |||
@@ -377,3 +377,13 @@ function debug($msg) { | |||
377 | flush(); | 377 | flush(); |
378 | } | 378 | } |
379 | } | 379 | } |
380 | |||
381 | function get_base_url($dom) { | ||
382 | $xpath = new DOMXPath($dom); | ||
383 | $base_url = @$xpath->evaluate('string(//head/base/@href)', $dom); | ||
384 | if ($base_url !== '') { | ||
385 | return $base_url; | ||
386 | } else { | ||
387 | return false; | ||
388 | } | ||
389 | } | ||
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php index 43b94e57..9b73b105 100644..100755 --- a/inc/3rdparty/simple_html_dom.php +++ b/inc/3rdparty/simple_html_dom.php | |||
@@ -34,7 +34,7 @@ | |||
34 | * @author S.C. Chen <me578022@gmail.com> | 34 | * @author S.C. Chen <me578022@gmail.com> |
35 | * @author John Schlick | 35 | * @author John Schlick |
36 | * @author Rus Carroll | 36 | * @author Rus Carroll |
37 | * @version 1.5 ($Rev: 202 $) | 37 | * @version 1.5 ($Rev: 210 $) |
38 | * @package PlaceLocalInclude | 38 | * @package PlaceLocalInclude |
39 | * @subpackage simple_html_dom | 39 | * @subpackage simple_html_dom |
40 | */ | 40 | */ |
@@ -269,7 +269,10 @@ class simple_html_dom_node | |||
269 | { | 269 | { |
270 | return $this->children; | 270 | return $this->children; |
271 | } | 271 | } |
272 | if (isset($this->children[$idx])) return $this->children[$idx]; | 272 | if (isset($this->children[$idx])) |
273 | { | ||
274 | return $this->children[$idx]; | ||
275 | } | ||
273 | return null; | 276 | return null; |
274 | } | 277 | } |
275 | 278 | ||
@@ -330,14 +333,14 @@ class simple_html_dom_node | |||
330 | function find_ancestor_tag($tag) | 333 | function find_ancestor_tag($tag) |
331 | { | 334 | { |
332 | global $debug_object; | 335 | global $debug_object; |
333 | if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } | 336 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } |
334 | 337 | ||
335 | // Start by including ourselves in the comparison. | 338 | // Start by including ourselves in the comparison. |
336 | $returnDom = $this; | 339 | $returnDom = $this; |
337 | 340 | ||
338 | while (!is_null($returnDom)) | 341 | while (!is_null($returnDom)) |
339 | { | 342 | { |
340 | if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); } | 343 | if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); } |
341 | 344 | ||
342 | if ($returnDom->tag == $tag) | 345 | if ($returnDom->tag == $tag) |
343 | { | 346 | { |
@@ -374,7 +377,7 @@ class simple_html_dom_node | |||
374 | $text = " with text: " . $this->text; | 377 | $text = " with text: " . $this->text; |
375 | } | 378 | } |
376 | } | 379 | } |
377 | $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text); | 380 | $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text); |
378 | } | 381 | } |
379 | 382 | ||
380 | if ($this->tag==='root') return $this->innertext(); | 383 | if ($this->tag==='root') return $this->innertext(); |
@@ -532,7 +535,9 @@ class simple_html_dom_node | |||
532 | foreach ($head as $k=>$v) | 535 | foreach ($head as $k=>$v) |
533 | { | 536 | { |
534 | if (!isset($found_keys[$k])) | 537 | if (!isset($found_keys[$k])) |
538 | { | ||
535 | $found_keys[$k] = 1; | 539 | $found_keys[$k] = 1; |
540 | } | ||
536 | } | 541 | } |
537 | } | 542 | } |
538 | 543 | ||
@@ -554,7 +559,7 @@ class simple_html_dom_node | |||
554 | protected function seek($selector, &$ret, $lowercase=false) | 559 | protected function seek($selector, &$ret, $lowercase=false) |
555 | { | 560 | { |
556 | global $debug_object; | 561 | global $debug_object; |
557 | if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } | 562 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } |
558 | 563 | ||
559 | list($tag, $key, $val, $exp, $no_key) = $selector; | 564 | list($tag, $key, $val, $exp, $no_key) = $selector; |
560 | 565 | ||
@@ -615,7 +620,7 @@ class simple_html_dom_node | |||
615 | // this is a normal search, we want the value of that attribute of the tag. | 620 | // this is a normal search, we want the value of that attribute of the tag. |
616 | $nodeKeyValue = $node->attr[$key]; | 621 | $nodeKeyValue = $node->attr[$key]; |
617 | } | 622 | } |
618 | if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} | 623 | if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} |
619 | 624 | ||
620 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. | 625 | //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. |
621 | if ($lowercase) { | 626 | if ($lowercase) { |
@@ -623,7 +628,7 @@ class simple_html_dom_node | |||
623 | } else { | 628 | } else { |
624 | $check = $this->match($exp, $val, $nodeKeyValue); | 629 | $check = $this->match($exp, $val, $nodeKeyValue); |
625 | } | 630 | } |
626 | if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));} | 631 | if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));} |
627 | 632 | ||
628 | // handle multiple class | 633 | // handle multiple class |
629 | if (!$check && strcasecmp($key, 'class')===0) { | 634 | if (!$check && strcasecmp($key, 'class')===0) { |
@@ -645,12 +650,12 @@ class simple_html_dom_node | |||
645 | unset($node); | 650 | unset($node); |
646 | } | 651 | } |
647 | // It's passed by reference so this is actually what this function returns. | 652 | // It's passed by reference so this is actually what this function returns. |
648 | if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);} | 653 | if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);} |
649 | } | 654 | } |
650 | 655 | ||
651 | protected function match($exp, $pattern, $value) { | 656 | protected function match($exp, $pattern, $value) { |
652 | global $debug_object; | 657 | global $debug_object; |
653 | if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} | 658 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} |
654 | 659 | ||
655 | switch ($exp) { | 660 | switch ($exp) { |
656 | case '=': | 661 | case '=': |
@@ -672,7 +677,7 @@ class simple_html_dom_node | |||
672 | 677 | ||
673 | protected function parse_selector($selector_string) { | 678 | protected function parse_selector($selector_string) { |
674 | global $debug_object; | 679 | global $debug_object; |
675 | if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} | 680 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} |
676 | 681 | ||
677 | // pattern of CSS selectors, modified from mootools | 682 | // pattern of CSS selectors, modified from mootools |
678 | // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. | 683 | // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. |
@@ -683,7 +688,7 @@ class simple_html_dom_node | |||
683 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; | 688 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; |
684 | $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; | 689 | $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; |
685 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); | 690 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); |
686 | if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);} | 691 | if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);} |
687 | 692 | ||
688 | $selectors = array(); | 693 | $selectors = array(); |
689 | $result = array(); | 694 | $result = array(); |
@@ -718,12 +723,14 @@ class simple_html_dom_node | |||
718 | return $selectors; | 723 | return $selectors; |
719 | } | 724 | } |
720 | 725 | ||
721 | function __get($name) { | 726 | function __get($name) |
727 | { | ||
722 | if (isset($this->attr[$name])) | 728 | if (isset($this->attr[$name])) |
723 | { | 729 | { |
724 | return $this->convert_text($this->attr[$name]); | 730 | return $this->convert_text($this->attr[$name]); |
725 | } | 731 | } |
726 | switch ($name) { | 732 | switch ($name) |
733 | { | ||
727 | case 'outertext': return $this->outertext(); | 734 | case 'outertext': return $this->outertext(); |
728 | case 'innertext': return $this->innertext(); | 735 | case 'innertext': return $this->innertext(); |
729 | case 'plaintext': return $this->text(); | 736 | case 'plaintext': return $this->text(); |
@@ -732,22 +739,30 @@ class simple_html_dom_node | |||
732 | } | 739 | } |
733 | } | 740 | } |
734 | 741 | ||
735 | function __set($name, $value) { | 742 | function __set($name, $value) |
736 | switch ($name) { | 743 | { |
744 | global $debug_object; | ||
745 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} | ||
746 | |||
747 | switch ($name) | ||
748 | { | ||
737 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; | 749 | case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; |
738 | case 'innertext': | 750 | case 'innertext': |
739 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; | 751 | if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; |
740 | return $this->_[HDOM_INFO_INNER] = $value; | 752 | return $this->_[HDOM_INFO_INNER] = $value; |
741 | } | 753 | } |
742 | if (!isset($this->attr[$name])) { | 754 | if (!isset($this->attr[$name])) |
755 | { | ||
743 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); | 756 | $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); |
744 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; | 757 | $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; |
745 | } | 758 | } |
746 | $this->attr[$name] = $value; | 759 | $this->attr[$name] = $value; |
747 | } | 760 | } |
748 | 761 | ||
749 | function __isset($name) { | 762 | function __isset($name) |
750 | switch ($name) { | 763 | { |
764 | switch ($name) | ||
765 | { | ||
751 | case 'outertext': return true; | 766 | case 'outertext': return true; |
752 | case 'innertext': return true; | 767 | case 'innertext': return true; |
753 | case 'plaintext': return true; | 768 | case 'plaintext': return true; |
@@ -765,7 +780,7 @@ class simple_html_dom_node | |||
765 | function convert_text($text) | 780 | function convert_text($text) |
766 | { | 781 | { |
767 | global $debug_object; | 782 | global $debug_object; |
768 | if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} | 783 | if (is_object($debug_object)) {$debug_object->debug_log_entry(1);} |
769 | 784 | ||
770 | $converted_text = $text; | 785 | $converted_text = $text; |
771 | 786 | ||
@@ -777,7 +792,7 @@ class simple_html_dom_node | |||
777 | $sourceCharset = strtoupper($this->dom->_charset); | 792 | $sourceCharset = strtoupper($this->dom->_charset); |
778 | $targetCharset = strtoupper($this->dom->_target_charset); | 793 | $targetCharset = strtoupper($this->dom->_target_charset); |
779 | } | 794 | } |
780 | if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} | 795 | if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} |
781 | 796 | ||
782 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) | 797 | if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) |
783 | { | 798 | { |
@@ -1045,10 +1060,10 @@ class simple_html_dom | |||
1045 | 1060 | ||
1046 | // prepare | 1061 | // prepare |
1047 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); | 1062 | $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); |
1048 | // strip out comments | ||
1049 | $this->remove_noise("'<!--(.*?)-->'is"); | ||
1050 | // strip out cdata | 1063 | // strip out cdata |
1051 | $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); | 1064 | $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); |
1065 | // strip out comments | ||
1066 | $this->remove_noise("'<!--(.*?)-->'is"); | ||
1052 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 | 1067 | // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 |
1053 | // Script tags removal now preceeds style tag removal. | 1068 | // Script tags removal now preceeds style tag removal. |
1054 | // strip out <script> tags | 1069 | // strip out <script> tags |
@@ -1078,10 +1093,15 @@ class simple_html_dom | |||
1078 | // load html from file | 1093 | // load html from file |
1079 | function load_file() | 1094 | function load_file() |
1080 | { | 1095 | { |
1096 | //external error: NOT related to dom loading | ||
1097 | $extError=error_get_last(); | ||
1098 | |||
1081 | $args = func_get_args(); | 1099 | $args = func_get_args(); |
1082 | $this->load(call_user_func_array('file_get_contents', $args), true); | 1100 | $this->load(call_user_func_array('file_get_contents', $args), true); |
1101 | |||
1083 | // Throw an error if we can't properly load the dom. | 1102 | // Throw an error if we can't properly load the dom. |
1084 | if (($error=error_get_last())!==null) { | 1103 | $error=error_get_last(); |
1104 | if ($error!==$extError) { | ||
1085 | $this->clear(); | 1105 | $this->clear(); |
1086 | return false; | 1106 | return false; |
1087 | } | 1107 | } |
@@ -1198,22 +1218,22 @@ class simple_html_dom | |||
1198 | if ($success) | 1218 | if ($success) |
1199 | { | 1219 | { |
1200 | $charset = $matches[1]; | 1220 | $charset = $matches[1]; |
1201 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);} | 1221 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);} |
1202 | } | 1222 | } |
1203 | 1223 | ||
1204 | } | 1224 | } |
1205 | 1225 | ||
1206 | if (empty($charset)) | 1226 | if (empty($charset)) |
1207 | { | 1227 | { |
1208 | $el = $this->root->find('meta[http-equiv=Content-Type]',0); | 1228 | $el = $this->root->find('meta[http-equiv=Content-Type]',0, true); |
1209 | if (!empty($el)) | 1229 | if (!empty($el)) |
1210 | { | 1230 | { |
1211 | $fullvalue = $el->content; | 1231 | $fullvalue = $el->content; |
1212 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);} | 1232 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);} |
1213 | 1233 | ||
1214 | if (!empty($fullvalue)) | 1234 | if (!empty($fullvalue)) |
1215 | { | 1235 | { |
1216 | $success = preg_match('/charset=(.+)/', $fullvalue, $matches); | 1236 | $success = preg_match('/charset=(.+)/i', $fullvalue, $matches); |
1217 | if ($success) | 1237 | if ($success) |
1218 | { | 1238 | { |
1219 | $charset = $matches[1]; | 1239 | $charset = $matches[1]; |
@@ -1221,7 +1241,7 @@ class simple_html_dom | |||
1221 | else | 1241 | else |
1222 | { | 1242 | { |
1223 | // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 | 1243 | // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 |
1224 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} | 1244 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} |
1225 | $charset = 'ISO-8859-1'; | 1245 | $charset = 'ISO-8859-1'; |
1226 | } | 1246 | } |
1227 | } | 1247 | } |
@@ -1231,14 +1251,19 @@ class simple_html_dom | |||
1231 | // If we couldn't find a charset above, then lets try to detect one based on the text we got... | 1251 | // If we couldn't find a charset above, then lets try to detect one based on the text we got... |
1232 | if (empty($charset)) | 1252 | if (empty($charset)) |
1233 | { | 1253 | { |
1234 | // Have php try to detect the encoding from the text given to us. | 1254 | // Use this in case mb_detect_charset isn't installed/loaded on this machine. |
1235 | $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); | 1255 | $charset = false; |
1236 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);} | 1256 | if (function_exists('mb_detect_encoding')) |
1257 | { | ||
1258 | // Have php try to detect the encoding from the text given to us. | ||
1259 | $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); | ||
1260 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);} | ||
1261 | } | ||
1237 | 1262 | ||
1238 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... | 1263 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... |
1239 | if ($charset === false) | 1264 | if ($charset === false) |
1240 | { | 1265 | { |
1241 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');} | 1266 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');} |
1242 | $charset = 'UTF-8'; | 1267 | $charset = 'UTF-8'; |
1243 | } | 1268 | } |
1244 | } | 1269 | } |
@@ -1246,11 +1271,11 @@ class simple_html_dom | |||
1246 | // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. | 1271 | // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. |
1247 | if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) | 1272 | if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) |
1248 | { | 1273 | { |
1249 | if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} | 1274 | if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} |
1250 | $charset = 'CP1252'; | 1275 | $charset = 'CP1252'; |
1251 | } | 1276 | } |
1252 | 1277 | ||
1253 | if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);} | 1278 | if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);} |
1254 | 1279 | ||
1255 | return $this->_charset = $charset; | 1280 | return $this->_charset = $charset; |
1256 | } | 1281 | } |
@@ -1616,14 +1641,14 @@ class simple_html_dom | |||
1616 | protected function remove_noise($pattern, $remove_tag=false) | 1641 | protected function remove_noise($pattern, $remove_tag=false) |
1617 | { | 1642 | { |
1618 | global $debug_object; | 1643 | global $debug_object; |
1619 | if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } | 1644 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } |
1620 | 1645 | ||
1621 | $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); | 1646 | $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); |
1622 | 1647 | ||
1623 | for ($i=$count-1; $i>-1; --$i) | 1648 | for ($i=$count-1; $i>-1; --$i) |
1624 | { | 1649 | { |
1625 | $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); | 1650 | $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); |
1626 | if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); } | 1651 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); } |
1627 | $idx = ($remove_tag) ? 0 : 1; | 1652 | $idx = ($remove_tag) ? 0 : 1; |
1628 | $this->noise[$key] = $matches[$i][$idx][0]; | 1653 | $this->noise[$key] = $matches[$i][$idx][0]; |
1629 | $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); | 1654 | $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); |
@@ -1641,7 +1666,7 @@ class simple_html_dom | |||
1641 | function restore_noise($text) | 1666 | function restore_noise($text) |
1642 | { | 1667 | { |
1643 | global $debug_object; | 1668 | global $debug_object; |
1644 | if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } | 1669 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } |
1645 | 1670 | ||
1646 | while (($pos=strpos($text, '___noise___'))!==false) | 1671 | while (($pos=strpos($text, '___noise___'))!==false) |
1647 | { | 1672 | { |
@@ -1649,7 +1674,7 @@ class simple_html_dom | |||
1649 | if (strlen($text) > $pos+15) | 1674 | if (strlen($text) > $pos+15) |
1650 | { | 1675 | { |
1651 | $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; | 1676 | $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; |
1652 | if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); } | 1677 | if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); } |
1653 | 1678 | ||
1654 | if (isset($this->noise[$key])) | 1679 | if (isset($this->noise[$key])) |
1655 | { | 1680 | { |
@@ -1674,7 +1699,7 @@ class simple_html_dom | |||
1674 | function search_noise($text) | 1699 | function search_noise($text) |
1675 | { | 1700 | { |
1676 | global $debug_object; | 1701 | global $debug_object; |
1677 | if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } | 1702 | if (is_object($debug_object)) { $debug_object->debug_log_entry(1); } |
1678 | 1703 | ||
1679 | foreach($this->noise as $noiseElement) | 1704 | foreach($this->noise as $noiseElement) |
1680 | { | 1705 | { |
diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt new file mode 100644 index 00000000..e1ebaee3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/.about.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | body: //div[@id='articlebody'] | ||
2 | title: //h1 | ||
3 | author: //p[@id='by']//a | ||
4 | |||
5 | next_page_link: //span[@class='next']/a | ||
6 | # Not the same as below! | ||
7 | |||
8 | prune: yes | ||
9 | tidy: no | ||
10 | |||
11 | # Annoying 'next' links plainly inside the article body | ||
12 | strip: //*[text()[contains(.,'Next: ')]] | ||
13 | |||
14 | test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm | ||
diff --git a/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt new file mode 100644 index 00000000..24c949e9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='header']//h1[1] | ||
2 | |||
3 | body: //div[@id='content'] | ||
4 | |||
5 | strip_id_or_class: toc | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://moo.nac.uci.edu/~hjm/HOWTO_move_data.html | ||
diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt index 121fd5b9..c5302d1b 100644..100755 --- a/inc/3rdparty/site_config/standard/politico.com.txt +++ b/inc/3rdparty/site_config/standard/politico.com.txt | |||
@@ -4,10 +4,14 @@ body://div[contains(@class,"story-text")] | |||
4 | # Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] | 4 | # Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] |
5 | 5 | ||
6 | next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a | 6 | next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a |
7 | next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a | ||
7 | date://meta[@name="publish_date"]/@content | 8 | date://meta[@name="publish_date"]/@content |
8 | 9 | ||
9 | strip://div[contains(@class, "breadcrumbs")] | 10 | strip://div[contains(@class, "breadcrumbs")] |
10 | strip://a[contains(@class, "hidden")] | 11 | strip://a[contains(@class, "hidden")] |
11 | strip://div[contains(@class, "story-embed")] | 12 | strip://div[contains(@class, "story-embed")] |
12 | strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. | 13 | strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. |
14 | strip://div[contains(@class, "story-interrupt")] | ||
15 | strip://footer[contains(@class, "author-bio")] | ||
16 | |||
13 | test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file | 17 | test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file |