aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty
diff options
context:
space:
mode:
authorNicolas Lœuillet <nicolas@loeuillet.org>2014-07-15 11:49:24 +0200
committerNicolas Lœuillet <nicolas@loeuillet.org>2014-07-15 11:49:24 +0200
commit0f6273cdb8c77436593782d42f271fddc7a7875d (patch)
treecd6e0959768f5fac7eac054572a97b3a30674af2 /inc/3rdparty
parenta9f5e572dde4f986a498d2fbe92a38a1b22f9595 (diff)
parent26452f891f3ba75f2636733dbfe943535636df06 (diff)
downloadwallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.tar.gz
wallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.tar.zst
wallabag-0f6273cdb8c77436593782d42f271fddc7a7875d.zip
Merge pull request #761 from wallabag/dev1.7.1
1.7.1
Diffstat (limited to 'inc/3rdparty')
-rw-r--r--inc/3rdparty/libraries/PHPePub/EPub.php8
-rwxr-xr-xinc/3rdparty/libraries/feedwriter/FeedWriter.php1
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/readability/Readability.php17
-rwxr-xr-xinc/3rdparty/makefulltextfeed.php8
-rwxr-xr-xinc/3rdparty/makefulltextfeedHelpers.php10
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/simple_html_dom.php105
-rw-r--r--inc/3rdparty/site_config/standard/.about.com.txt14
-rw-r--r--inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt9
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/site_config/standard/politico.com.txt4
9 files changed, 131 insertions, 45 deletions
diff --git a/inc/3rdparty/libraries/PHPePub/EPub.php b/inc/3rdparty/libraries/PHPePub/EPub.php
index f1f41bd5..d9b990b7 100644
--- a/inc/3rdparty/libraries/PHPePub/EPub.php
+++ b/inc/3rdparty/libraries/PHPePub/EPub.php
@@ -41,6 +41,8 @@ class EPub {
41 41
42 private $bookVersion = EPub::BOOK_VERSION_EPUB2; 42 private $bookVersion = EPub::BOOK_VERSION_EPUB2;
43 43
44 private $debugInside = FALSE;
45
44 public $maxImageWidth = 768; 46 public $maxImageWidth = 768;
45 public $maxImageHeight = 1024; 47 public $maxImageHeight = 1024;
46 48
@@ -132,10 +134,14 @@ class EPub {
132 * 134 *
133 * @return void 135 * @return void
134 */ 136 */
135 function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) { 137 function __construct($bookVersion = EPub::BOOK_VERSION_EPUB2, $debugInside = FALSE, $languageCode = "en", $writingDirection = EPub::DIRECTION_LEFT_TO_RIGHT) {
136 include_once("Zip.php"); 138 include_once("Zip.php");
137 include_once("Logger.php"); 139 include_once("Logger.php");
138 140
141 if (!$debugInside) {
142 error_reporting(E_ERROR | E_PARSE);
143 }
144
139 $this->bookVersion = $bookVersion; 145 $this->bookVersion = $bookVersion;
140 $this->writingDirection = $writingDirection; 146 $this->writingDirection = $writingDirection;
141 $this->languageCode = $languageCode; 147 $this->languageCode = $languageCode;
diff --git a/inc/3rdparty/libraries/feedwriter/FeedWriter.php b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
index aa064afb..9446cddf 100755
--- a/inc/3rdparty/libraries/feedwriter/FeedWriter.php
+++ b/inc/3rdparty/libraries/feedwriter/FeedWriter.php
@@ -2,6 +2,7 @@
2define('RSS2', 1, true); 2define('RSS2', 1, true);
3define('JSON', 2, true); 3define('JSON', 2, true);
4define('JSONP', 3, true); 4define('JSONP', 3, true);
5define('ATOM', 4, true);
5 6
6 /** 7 /**
7 * Univarsel Feed Writer class 8 * Univarsel Feed Writer class
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index d0f09d74..4fa3ba63 100644..100755
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -679,6 +679,7 @@ class Readability
679 } else { 679 } else {
680 $topCandidate->innerHTML = $page->documentElement->innerHTML; 680 $topCandidate->innerHTML = $page->documentElement->innerHTML;
681 $page->documentElement->innerHTML = ''; 681 $page->documentElement->innerHTML = '';
682 $this->reinitBody();
682 $page->documentElement->appendChild($topCandidate); 683 $page->documentElement->appendChild($topCandidate);
683 } 684 }
684 } else { 685 } else {
@@ -794,8 +795,7 @@ class Readability
794 { 795 {
795 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 796 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
796 // in the meantime, we check and create an empty element if it's not there. 797 // in the meantime, we check and create an empty element if it's not there.
797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); 798 $this->reinitBody();
798 $this->body->innerHTML = $this->bodyCache;
799 799
800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { 800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); 801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
@@ -1134,5 +1134,18 @@ class Readability
1134 public function removeFlag($flag) { 1134 public function removeFlag($flag) {
1135 $this->flags = $this->flags & ~$flag; 1135 $this->flags = $this->flags & ~$flag;
1136 } 1136 }
1137
1138 /**
1139 * Will recreate previously deleted body property
1140 *
1141 * @return void
1142 */
1143 protected function reinitBody() {
1144 if (!isset($this->body->childNodes)) {
1145 $this->body = $this->dom->createElement('body');
1146 $this->body->innerHTML = $this->bodyCache;
1147 }
1148 }
1149
1137} 1150}
1138?> \ No newline at end of file 1151?> \ No newline at end of file
diff --git a/inc/3rdparty/makefulltextfeed.php b/inc/3rdparty/makefulltextfeed.php
index 7a56be8c..a081f88b 100755
--- a/inc/3rdparty/makefulltextfeed.php
+++ b/inc/3rdparty/makefulltextfeed.php
@@ -28,7 +28,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
28// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article 28// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
29// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage 29// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
30 30
31error_reporting(E_ALL ^ E_NOTICE); 31//error_reporting(E_ALL ^ E_NOTICE);
32ini_set("display_errors", 1); 32ini_set("display_errors", 1);
33@set_time_limit(120); 33@set_time_limit(120);
34 34
@@ -671,7 +671,11 @@ foreach ($items as $key => $item) {
671 $html .= $item->get_description(); 671 $html .= $item->get_description();
672 } else { 672 } else {
673 $readability->clean($content_block, 'select'); 673 $readability->clean($content_block, 'select');
674 if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); 674 // get base URL
675 $base_url = get_base_url($readability->dom);
676 if (!$base_url) $base_url = $effective_url;
677 // rewrite URLs
678 if ($options->rewrite_relative_urls) makeAbsolute($base_url, $content_block);
675 // footnotes 679 // footnotes
676 if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { 680 if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
677 $readability->addFootnotes($content_block); 681 $readability->addFootnotes($content_block);
diff --git a/inc/3rdparty/makefulltextfeedHelpers.php b/inc/3rdparty/makefulltextfeedHelpers.php
index 4e985372..ac872ab8 100755
--- a/inc/3rdparty/makefulltextfeedHelpers.php
+++ b/inc/3rdparty/makefulltextfeedHelpers.php
@@ -377,3 +377,13 @@ function debug($msg) {
377 flush(); 377 flush();
378 } 378 }
379} 379}
380
381function get_base_url($dom) {
382 $xpath = new DOMXPath($dom);
383 $base_url = @$xpath->evaluate('string(//head/base/@href)', $dom);
384 if ($base_url !== '') {
385 return $base_url;
386 } else {
387 return false;
388 }
389}
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
index 43b94e57..9b73b105 100644..100755
--- a/inc/3rdparty/simple_html_dom.php
+++ b/inc/3rdparty/simple_html_dom.php
@@ -34,7 +34,7 @@
34 * @author S.C. Chen <me578022@gmail.com> 34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick 35 * @author John Schlick
36 * @author Rus Carroll 36 * @author Rus Carroll
37 * @version 1.5 ($Rev: 202 $) 37 * @version 1.5 ($Rev: 210 $)
38 * @package PlaceLocalInclude 38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom 39 * @subpackage simple_html_dom
40 */ 40 */
@@ -269,7 +269,10 @@ class simple_html_dom_node
269 { 269 {
270 return $this->children; 270 return $this->children;
271 } 271 }
272 if (isset($this->children[$idx])) return $this->children[$idx]; 272 if (isset($this->children[$idx]))
273 {
274 return $this->children[$idx];
275 }
273 return null; 276 return null;
274 } 277 }
275 278
@@ -330,14 +333,14 @@ class simple_html_dom_node
330 function find_ancestor_tag($tag) 333 function find_ancestor_tag($tag)
331 { 334 {
332 global $debug_object; 335 global $debug_object;
333 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 336 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
334 337
335 // Start by including ourselves in the comparison. 338 // Start by including ourselves in the comparison.
336 $returnDom = $this; 339 $returnDom = $this;
337 340
338 while (!is_null($returnDom)) 341 while (!is_null($returnDom))
339 { 342 {
340 if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); } 343 if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }
341 344
342 if ($returnDom->tag == $tag) 345 if ($returnDom->tag == $tag)
343 { 346 {
@@ -374,7 +377,7 @@ class simple_html_dom_node
374 $text = " with text: " . $this->text; 377 $text = " with text: " . $this->text;
375 } 378 }
376 } 379 }
377 $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text); 380 $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
378 } 381 }
379 382
380 if ($this->tag==='root') return $this->innertext(); 383 if ($this->tag==='root') return $this->innertext();
@@ -532,7 +535,9 @@ class simple_html_dom_node
532 foreach ($head as $k=>$v) 535 foreach ($head as $k=>$v)
533 { 536 {
534 if (!isset($found_keys[$k])) 537 if (!isset($found_keys[$k]))
538 {
535 $found_keys[$k] = 1; 539 $found_keys[$k] = 1;
540 }
536 } 541 }
537 } 542 }
538 543
@@ -554,7 +559,7 @@ class simple_html_dom_node
554 protected function seek($selector, &$ret, $lowercase=false) 559 protected function seek($selector, &$ret, $lowercase=false)
555 { 560 {
556 global $debug_object; 561 global $debug_object;
557 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 562 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
558 563
559 list($tag, $key, $val, $exp, $no_key) = $selector; 564 list($tag, $key, $val, $exp, $no_key) = $selector;
560 565
@@ -615,7 +620,7 @@ class simple_html_dom_node
615 // this is a normal search, we want the value of that attribute of the tag. 620 // this is a normal search, we want the value of that attribute of the tag.
616 $nodeKeyValue = $node->attr[$key]; 621 $nodeKeyValue = $node->attr[$key];
617 } 622 }
618 if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);} 623 if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
619 624
620 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector. 625 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
621 if ($lowercase) { 626 if ($lowercase) {
@@ -623,7 +628,7 @@ class simple_html_dom_node
623 } else { 628 } else {
624 $check = $this->match($exp, $val, $nodeKeyValue); 629 $check = $this->match($exp, $val, $nodeKeyValue);
625 } 630 }
626 if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));} 631 if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}
627 632
628 // handle multiple class 633 // handle multiple class
629 if (!$check && strcasecmp($key, 'class')===0) { 634 if (!$check && strcasecmp($key, 'class')===0) {
@@ -645,12 +650,12 @@ class simple_html_dom_node
645 unset($node); 650 unset($node);
646 } 651 }
647 // It's passed by reference so this is actually what this function returns. 652 // It's passed by reference so this is actually what this function returns.
648 if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);} 653 if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}
649 } 654 }
650 655
651 protected function match($exp, $pattern, $value) { 656 protected function match($exp, $pattern, $value) {
652 global $debug_object; 657 global $debug_object;
653 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 658 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
654 659
655 switch ($exp) { 660 switch ($exp) {
656 case '=': 661 case '=':
@@ -672,7 +677,7 @@ class simple_html_dom_node
672 677
673 protected function parse_selector($selector_string) { 678 protected function parse_selector($selector_string) {
674 global $debug_object; 679 global $debug_object;
675 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 680 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
676 681
677 // pattern of CSS selectors, modified from mootools 682 // pattern of CSS selectors, modified from mootools
678 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does. 683 // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.
@@ -683,7 +688,7 @@ class simple_html_dom_node
683// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 688// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
684 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; 689 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
685 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER); 690 preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
686 if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);} 691 if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}
687 692
688 $selectors = array(); 693 $selectors = array();
689 $result = array(); 694 $result = array();
@@ -718,12 +723,14 @@ class simple_html_dom_node
718 return $selectors; 723 return $selectors;
719 } 724 }
720 725
721 function __get($name) { 726 function __get($name)
727 {
722 if (isset($this->attr[$name])) 728 if (isset($this->attr[$name]))
723 { 729 {
724 return $this->convert_text($this->attr[$name]); 730 return $this->convert_text($this->attr[$name]);
725 } 731 }
726 switch ($name) { 732 switch ($name)
733 {
727 case 'outertext': return $this->outertext(); 734 case 'outertext': return $this->outertext();
728 case 'innertext': return $this->innertext(); 735 case 'innertext': return $this->innertext();
729 case 'plaintext': return $this->text(); 736 case 'plaintext': return $this->text();
@@ -732,22 +739,30 @@ class simple_html_dom_node
732 } 739 }
733 } 740 }
734 741
735 function __set($name, $value) { 742 function __set($name, $value)
736 switch ($name) { 743 {
744 global $debug_object;
745 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
746
747 switch ($name)
748 {
737 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value; 749 case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
738 case 'innertext': 750 case 'innertext':
739 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value; 751 if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
740 return $this->_[HDOM_INFO_INNER] = $value; 752 return $this->_[HDOM_INFO_INNER] = $value;
741 } 753 }
742 if (!isset($this->attr[$name])) { 754 if (!isset($this->attr[$name]))
755 {
743 $this->_[HDOM_INFO_SPACE][] = array(' ', '', ''); 756 $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
744 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE; 757 $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
745 } 758 }
746 $this->attr[$name] = $value; 759 $this->attr[$name] = $value;
747 } 760 }
748 761
749 function __isset($name) { 762 function __isset($name)
750 switch ($name) { 763 {
764 switch ($name)
765 {
751 case 'outertext': return true; 766 case 'outertext': return true;
752 case 'innertext': return true; 767 case 'innertext': return true;
753 case 'plaintext': return true; 768 case 'plaintext': return true;
@@ -765,7 +780,7 @@ class simple_html_dom_node
765 function convert_text($text) 780 function convert_text($text)
766 { 781 {
767 global $debug_object; 782 global $debug_object;
768 if (is_object($debug_object)) {$debug_object->debugLogEntry(1);} 783 if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}
769 784
770 $converted_text = $text; 785 $converted_text = $text;
771 786
@@ -777,7 +792,7 @@ class simple_html_dom_node
777 $sourceCharset = strtoupper($this->dom->_charset); 792 $sourceCharset = strtoupper($this->dom->_charset);
778 $targetCharset = strtoupper($this->dom->_target_charset); 793 $targetCharset = strtoupper($this->dom->_target_charset);
779 } 794 }
780 if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);} 795 if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}
781 796
782 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) 797 if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))
783 { 798 {
@@ -1045,10 +1060,10 @@ class simple_html_dom
1045 1060
1046 // prepare 1061 // prepare
1047 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText); 1062 $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
1048 // strip out comments
1049 $this->remove_noise("'<!--(.*?)-->'is");
1050 // strip out cdata 1063 // strip out cdata
1051 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true); 1064 $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
1065 // strip out comments
1066 $this->remove_noise("'<!--(.*?)-->'is");
1052 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 1067 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1053 // Script tags removal now preceeds style tag removal. 1068 // Script tags removal now preceeds style tag removal.
1054 // strip out <script> tags 1069 // strip out <script> tags
@@ -1078,10 +1093,15 @@ class simple_html_dom
1078 // load html from file 1093 // load html from file
1079 function load_file() 1094 function load_file()
1080 { 1095 {
1096 //external error: NOT related to dom loading
1097 $extError=error_get_last();
1098
1081 $args = func_get_args(); 1099 $args = func_get_args();
1082 $this->load(call_user_func_array('file_get_contents', $args), true); 1100 $this->load(call_user_func_array('file_get_contents', $args), true);
1101
1083 // Throw an error if we can't properly load the dom. 1102 // Throw an error if we can't properly load the dom.
1084 if (($error=error_get_last())!==null) { 1103 $error=error_get_last();
1104 if ($error!==$extError) {
1085 $this->clear(); 1105 $this->clear();
1086 return false; 1106 return false;
1087 } 1107 }
@@ -1198,22 +1218,22 @@ class simple_html_dom
1198 if ($success) 1218 if ($success)
1199 { 1219 {
1200 $charset = $matches[1]; 1220 $charset = $matches[1];
1201 if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);} 1221 if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
1202 } 1222 }
1203 1223
1204 } 1224 }
1205 1225
1206 if (empty($charset)) 1226 if (empty($charset))
1207 { 1227 {
1208 $el = $this->root->find('meta[http-equiv=Content-Type]',0); 1228 $el = $this->root->find('meta[http-equiv=Content-Type]',0, true);
1209 if (!empty($el)) 1229 if (!empty($el))
1210 { 1230 {
1211 $fullvalue = $el->content; 1231 $fullvalue = $el->content;
1212 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);} 1232 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}
1213 1233
1214 if (!empty($fullvalue)) 1234 if (!empty($fullvalue))
1215 { 1235 {
1216 $success = preg_match('/charset=(.+)/', $fullvalue, $matches); 1236 $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);
1217 if ($success) 1237 if ($success)
1218 { 1238 {
1219 $charset = $matches[1]; 1239 $charset = $matches[1];
@@ -1221,7 +1241,7 @@ class simple_html_dom
1221 else 1241 else
1222 { 1242 {
1223 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1 1243 // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
1224 if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');} 1244 if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
1225 $charset = 'ISO-8859-1'; 1245 $charset = 'ISO-8859-1';
1226 } 1246 }
1227 } 1247 }
@@ -1231,14 +1251,19 @@ class simple_html_dom
1231 // If we couldn't find a charset above, then lets try to detect one based on the text we got... 1251 // If we couldn't find a charset above, then lets try to detect one based on the text we got...
1232 if (empty($charset)) 1252 if (empty($charset))
1233 { 1253 {
1234 // Have php try to detect the encoding from the text given to us. 1254 // Use this in case mb_detect_charset isn't installed/loaded on this machine.
1235 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); 1255 $charset = false;
1236 if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);} 1256 if (function_exists('mb_detect_encoding'))
1257 {
1258 // Have php try to detect the encoding from the text given to us.
1259 $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
1260 if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
1261 }
1237 1262
1238 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need... 1263 // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
1239 if ($charset === false) 1264 if ($charset === false)
1240 { 1265 {
1241 if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');} 1266 if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
1242 $charset = 'UTF-8'; 1267 $charset = 'UTF-8';
1243 } 1268 }
1244 } 1269 }
@@ -1246,11 +1271,11 @@ class simple_html_dom
1246 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead. 1271 // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
1247 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) 1272 if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
1248 { 1273 {
1249 if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');} 1274 if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
1250 $charset = 'CP1252'; 1275 $charset = 'CP1252';
1251 } 1276 }
1252 1277
1253 if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);} 1278 if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}
1254 1279
1255 return $this->_charset = $charset; 1280 return $this->_charset = $charset;
1256 } 1281 }
@@ -1616,14 +1641,14 @@ class simple_html_dom
1616 protected function remove_noise($pattern, $remove_tag=false) 1641 protected function remove_noise($pattern, $remove_tag=false)
1617 { 1642 {
1618 global $debug_object; 1643 global $debug_object;
1619 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1644 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1620 1645
1621 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE); 1646 $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
1622 1647
1623 for ($i=$count-1; $i>-1; --$i) 1648 for ($i=$count-1; $i>-1; --$i)
1624 { 1649 {
1625 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000); 1650 $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
1626 if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); } 1651 if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }
1627 $idx = ($remove_tag) ? 0 : 1; 1652 $idx = ($remove_tag) ? 0 : 1;
1628 $this->noise[$key] = $matches[$i][$idx][0]; 1653 $this->noise[$key] = $matches[$i][$idx][0];
1629 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0])); 1654 $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
@@ -1641,7 +1666,7 @@ class simple_html_dom
1641 function restore_noise($text) 1666 function restore_noise($text)
1642 { 1667 {
1643 global $debug_object; 1668 global $debug_object;
1644 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1669 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1645 1670
1646 while (($pos=strpos($text, '___noise___'))!==false) 1671 while (($pos=strpos($text, '___noise___'))!==false)
1647 { 1672 {
@@ -1649,7 +1674,7 @@ class simple_html_dom
1649 if (strlen($text) > $pos+15) 1674 if (strlen($text) > $pos+15)
1650 { 1675 {
1651 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15]; 1676 $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
1652 if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); } 1677 if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }
1653 1678
1654 if (isset($this->noise[$key])) 1679 if (isset($this->noise[$key]))
1655 { 1680 {
@@ -1674,7 +1699,7 @@ class simple_html_dom
1674 function search_noise($text) 1699 function search_noise($text)
1675 { 1700 {
1676 global $debug_object; 1701 global $debug_object;
1677 if (is_object($debug_object)) { $debug_object->debugLogEntry(1); } 1702 if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }
1678 1703
1679 foreach($this->noise as $noiseElement) 1704 foreach($this->noise as $noiseElement)
1680 { 1705 {
diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt
new file mode 100644
index 00000000..e1ebaee3
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/.about.com.txt
@@ -0,0 +1,14 @@
1body: //div[@id='articlebody']
2title: //h1
3author: //p[@id='by']//a
4
5next_page_link: //span[@class='next']/a
6# Not the same as below!
7
8prune: yes
9tidy: no
10
11# Annoying 'next' links plainly inside the article body
12strip: //*[text()[contains(.,'Next: ')]]
13
14test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm
diff --git a/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt
new file mode 100644
index 00000000..24c949e9
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt
@@ -0,0 +1,9 @@
1title: //div[@id='header']//h1[1]
2
3body: //div[@id='content']
4
5strip_id_or_class: toc
6
7prune: no
8
9test_url: http://moo.nac.uci.edu/~hjm/HOWTO_move_data.html
diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt
index 121fd5b9..c5302d1b 100644..100755
--- a/inc/3rdparty/site_config/standard/politico.com.txt
+++ b/inc/3rdparty/site_config/standard/politico.com.txt
@@ -4,10 +4,14 @@ body://div[contains(@class,"story-text")]
4# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] 4# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"]
5 5
6next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a 6next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a
7next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a
7date://meta[@name="publish_date"]/@content 8date://meta[@name="publish_date"]/@content
8 9
9strip://div[contains(@class, "breadcrumbs")] 10strip://div[contains(@class, "breadcrumbs")]
10strip://a[contains(@class, "hidden")] 11strip://a[contains(@class, "hidden")]
11strip://div[contains(@class, "story-embed")] 12strip://div[contains(@class, "story-embed")]
12strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. 13strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/..
14strip://div[contains(@class, "story-interrupt")]
15strip://footer[contains(@class, "author-bio")]
16
13test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file 17test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file