diff options
author | nicosomb <nicolas@loeuillet.org> | 2013-04-23 15:19:33 +0200 |
---|---|---|
committer | nicosomb <nicolas@loeuillet.org> | 2013-04-23 15:19:33 +0200 |
commit | 3e7188185d9d4be054ed3807c8b910d1e5f504f8 (patch) | |
tree | 1caacbc5ed979eb4e0266fc35588e4d88c9cb9ed /inc | |
parent | 8d5aab49c185313338245f9c3a878628e16d7c85 (diff) | |
parent | cdcc8d2533d2ed65ac6a89c9a6d0041de7361ce1 (diff) | |
download | wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.tar.gz wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.tar.zst wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.zip |
Merge branch 'memiks-gestion_erreur_readability' into dev
Diffstat (limited to 'inc')
-rw-r--r-- | inc/Readability.php | 7 | ||||
-rw-r--r-- | inc/functions.php | 30 |
2 files changed, 29 insertions, 8 deletions
diff --git a/inc/Readability.php b/inc/Readability.php index c50bf2ef..2ce90f6d 100644 --- a/inc/Readability.php +++ b/inc/Readability.php | |||
@@ -80,7 +80,7 @@ class Readability | |||
80 | public $debug = false; | 80 | public $debug = false; |
81 | protected $body = null; // | 81 | protected $body = null; // |
82 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later | 82 | protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later |
83 | protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set. | 83 | protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set. |
84 | protected $success = false; // indicates whether we were able to extract or not | 84 | protected $success = false; // indicates whether we were able to extract or not |
85 | 85 | ||
86 | /** | 86 | /** |
@@ -90,7 +90,7 @@ class Readability | |||
90 | public $regexps = array( | 90 | public $regexps = array( |
91 | 'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i', | 91 | 'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i', |
92 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', | 92 | 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
93 | 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i', | 93 | 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|attachment/i', |
94 | 'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', | 94 | 'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', |
95 | 'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i', | 95 | 'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i', |
96 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', | 96 | 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', |
@@ -106,7 +106,7 @@ class Readability | |||
106 | * Create instance of Readability | 106 | * Create instance of Readability |
107 | * @param string UTF-8 encoded string | 107 | * @param string UTF-8 encoded string |
108 | * @param string (optional) URL associated with HTML (used for footnotes) | 108 | * @param string (optional) URL associated with HTML (used for footnotes) |
109 | */ | 109 | */ |
110 | function __construct($html, $url=null) | 110 | function __construct($html, $url=null) |
111 | { | 111 | { |
112 | /* Turn all double br's into p's */ | 112 | /* Turn all double br's into p's */ |
@@ -185,6 +185,7 @@ class Readability | |||
185 | $articleContent = $this->dom->createElement('div'); | 185 | $articleContent = $this->dom->createElement('div'); |
186 | $articleContent->setAttribute('id', 'readability-content'); | 186 | $articleContent->setAttribute('id', 'readability-content'); |
187 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; | 187 | $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; |
188 | return $this->success; | ||
188 | } | 189 | } |
189 | 190 | ||
190 | $overlay->setAttribute('id', 'readOverlay'); | 191 | $overlay->setAttribute('id', 'readOverlay'); |
diff --git a/inc/functions.php b/inc/functions.php index 750d430e..b830b616 100644 --- a/inc/functions.php +++ b/inc/functions.php | |||
@@ -39,6 +39,10 @@ function get_external_file($url) | |||
39 | curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); | 39 | curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); |
40 | curl_setopt($curl, CURLOPT_HEADER, false); | 40 | curl_setopt($curl, CURLOPT_HEADER, false); |
41 | 41 | ||
42 | // For SSL: do not verify the peer certificate | ||
43 | curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); | ||
44 | curl_setopt($curl, CURLOPT_AUTOREFERER, TRUE ); | ||
45 | |||
42 | // FeedBurner requires a proper USER-AGENT... | 46 | // FeedBurner requires a proper USER-AGENT... |
43 | curl_setopt($curl, CURL_HTTP_VERSION_1_1, true); | 47 | curl_setopt($curl, CURL_HTTP_VERSION_1_1, true); |
44 | curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate"); | 48 | curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate"); |
@@ -54,7 +58,15 @@ function get_external_file($url) | |||
54 | } else { | 58 | } else { |
55 | 59 | ||
56 | // create http context and add timeout and user-agent | 60 | // create http context and add timeout and user-agent |
57 | $context = stream_context_create(array('http'=>array('timeout' => $timeout,'header'=> "User-Agent: ".$useragent,/*spoot Mozilla Firefox*/'follow_location' => true))); | 61 | $context = stream_context_create(array( |
62 | 'http'=>array('timeout' => $timeout, | ||
63 | 'header'=> "User-Agent: ".$useragent, /*spoof Mozilla Firefox*/ | ||
64 | 'follow_location' => true), | ||
65 | // For SSL: do not verify the peer certificate | ||
66 | 'ssl' => array('verify_peer' => false, | ||
67 | 'allow_self_signed' => true) | ||
68 | ) | ||
69 | ); | ||
58 | 70 | ||
59 | // only download page lesser than 4MB | 71 | // only download page lesser than 4MB |
60 | $data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source. | 72 | $data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source. |
@@ -98,6 +110,8 @@ function get_external_file($url) | |||
98 | */ | 110 | */ |
99 | function prepare_url($url) | 111 | function prepare_url($url) |
100 | { | 112 | { |
113 | global $msg; | ||
114 | |||
101 | $parametres = array(); | 115 | $parametres = array(); |
102 | $url = html_entity_decode(trim($url)); | 116 | $url = html_entity_decode(trim($url)); |
103 | 117 | ||
@@ -108,14 +122,20 @@ function prepare_url($url) | |||
108 | $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i); | 122 | $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i); |
109 | 123 | ||
110 | $title = $url; | 124 | $title = $url; |
111 | if (!preg_match('!^https?://!i', $url)) | 125 | $html = Encoding::toUTF8(get_external_file($url,15)); |
112 | $url = 'http://' . $url; | 126 | // If get_external_file is not able to retrieve HTTPS content, try the same URL with the HTTP protocol |
127 | if (!preg_match('!^https?://!i', $url) && (!isset($html) || strlen($html) <= 0)) { | ||
128 | $url = 'http://' . $url; | ||
129 | $html = Encoding::toUTF8(get_external_file($url,15)); | ||
130 | } | ||
113 | 131 | ||
114 | $html = Encoding::toUTF8(get_external_file($url,15)); | ||
115 | if (isset($html) and strlen($html) > 0) | 132 | if (isset($html) and strlen($html) > 0) |
116 | { | 133 | { |
117 | $r = new Readability($html, $url); | 134 | $r = new Readability($html, $url); |
135 | |||
118 | $r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES; | 136 | $r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES; |
137 | $r->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS; | ||
138 | |||
119 | if($r->init()) | 139 | if($r->init()) |
120 | { | 140 | { |
121 | $content = $r->articleContent->innerHTML; | 141 | $content = $r->articleContent->innerHTML; |
@@ -362,4 +382,4 @@ function logm($message) | |||
362 | { | 382 | { |
363 | $t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n"; | 383 | $t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n"; |
364 | file_put_contents('./log.txt',$t,FILE_APPEND); | 384 | file_put_contents('./log.txt',$t,FILE_APPEND); |
365 | } \ No newline at end of file | 385 | } |