about | summary | refs | log | tree | commit | diff | homepage
path: root/inc
diff options
context:
space:
mode:
author nicosomb <nicolas@loeuillet.org> 2013-04-23 15:19:33 +0200
committer nicosomb <nicolas@loeuillet.org> 2013-04-23 15:19:33 +0200
commit 3e7188185d9d4be054ed3807c8b910d1e5f504f8 (patch)
tree 1caacbc5ed979eb4e0266fc35588e4d88c9cb9ed /inc
parent 8d5aab49c185313338245f9c3a878628e16d7c85 (diff)
parent cdcc8d2533d2ed65ac6a89c9a6d0041de7361ce1 (diff)
download wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.tar.gz
wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.tar.zst
wallabag-3e7188185d9d4be054ed3807c8b910d1e5f504f8.zip
Merge branch 'memiks-gestion_erreur_readability' into dev
Diffstat (limited to 'inc')
-rw-r--r-- inc/Readability.php 7
-rw-r--r-- inc/functions.php 30
2 files changed, 29 insertions, 8 deletions
diff --git a/inc/Readability.php b/inc/Readability.php
index c50bf2ef..2ce90f6d 100644
--- a/inc/Readability.php
+++ b/inc/Readability.php
@@ -80,7 +80,7 @@ class Readability
80 public $debug = false; 80 public $debug = false;
81 protected $body = null; // 81 protected $body = null; //
82 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later 82 protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
83 protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set. 83 protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4; // Start with all flags set.
84 protected $success = false; // indicates whether we were able to extract or not 84 protected $success = false; // indicates whether we were able to extract or not
85 85
86 /** 86 /**
@@ -90,7 +90,7 @@ class Readability
90 public $regexps = array( 90 public $regexps = array(
91 'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i', 91 'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
92 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', 92 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
93 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i', 93 'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|attachment/i',
94 'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i', 94 'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
95 'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i', 95 'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i',
96 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i', 96 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
@@ -106,7 +106,7 @@ class Readability
106 * Create instance of Readability 106 * Create instance of Readability
107 * @param string UTF-8 encoded string 107 * @param string UTF-8 encoded string
108 * @param string (optional) URL associated with HTML (used for footnotes) 108 * @param string (optional) URL associated with HTML (used for footnotes)
109 */ 109 */
110 function __construct($html, $url=null) 110 function __construct($html, $url=null)
111 { 111 {
112 /* Turn all double br's into p's */ 112 /* Turn all double br's into p's */
@@ -185,6 +185,7 @@ class Readability
185 $articleContent = $this->dom->createElement('div'); 185 $articleContent = $this->dom->createElement('div');
186 $articleContent->setAttribute('id', 'readability-content'); 186 $articleContent->setAttribute('id', 'readability-content');
187 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>'; 187 $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';
188 return $this->success;
188 } 189 }
189 190
190 $overlay->setAttribute('id', 'readOverlay'); 191 $overlay->setAttribute('id', 'readOverlay');
diff --git a/inc/functions.php b/inc/functions.php
index 750d430e..b830b616 100644
--- a/inc/functions.php
+++ b/inc/functions.php
@@ -39,6 +39,10 @@ function get_external_file($url)
39 curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); 39 curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
40 curl_setopt($curl, CURLOPT_HEADER, false); 40 curl_setopt($curl, CURLOPT_HEADER, false);
41 41
42 // FOR SSL do not verified certificate
43 curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
44 curl_setopt($curl, CURLOPT_AUTOREFERER, TRUE );
45
42 // FeedBurner requires a proper USER-AGENT... 46 // FeedBurner requires a proper USER-AGENT...
43 curl_setopt($curl, CURL_HTTP_VERSION_1_1, true); 47 curl_setopt($curl, CURL_HTTP_VERSION_1_1, true);
44 curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate"); 48 curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate");
@@ -54,7 +58,15 @@ function get_external_file($url)
54 } else { 58 } else {
55 59
56 // create http context and add timeout and user-agent 60 // create http context and add timeout and user-agent
57 $context = stream_context_create(array('http'=>array('timeout' => $timeout,'header'=> "User-Agent: ".$useragent,/*spoot Mozilla Firefox*/'follow_location' => true))); 61 $context = stream_context_create(array(
62 'http'=>array('timeout' => $timeout,
63 'header'=> "User-Agent: ".$useragent, /*spoot Mozilla Firefox*/
64 'follow_location' => true),
65 // FOR SSL do not verified certificate
66 'ssl' => array('verify_peer' => false,
67 'allow_self_signed' => true)
68 )
69 );
58 70
59 // only download page lesser than 4MB 71 // only download page lesser than 4MB
60 $data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source. 72 $data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source.
@@ -98,6 +110,8 @@ function get_external_file($url)
98 */ 110 */
99function prepare_url($url) 111function prepare_url($url)
100{ 112{
113 global $msg;
114
101 $parametres = array(); 115 $parametres = array();
102 $url = html_entity_decode(trim($url)); 116 $url = html_entity_decode(trim($url));
103 117
@@ -108,14 +122,20 @@ function prepare_url($url)
108 $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i); 122 $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i);
109 123
110 $title = $url; 124 $title = $url;
111 if (!preg_match('!^https?://!i', $url)) 125 $html = Encoding::toUTF8(get_external_file($url,15));
112 $url = 'http://' . $url; 126 // If get_external_file if not able to retrieve HTTPS content try the same URL with HTTP protocol
127 if (!preg_match('!^https?://!i', $url) && (!isset($html) || strlen($html) <= 0)) {
128 $url = 'http://' . $url;
129 $html = Encoding::toUTF8(get_external_file($url,15));
130 }
113 131
114 $html = Encoding::toUTF8(get_external_file($url,15));
115 if (isset($html) and strlen($html) > 0) 132 if (isset($html) and strlen($html) > 0)
116 { 133 {
117 $r = new Readability($html, $url); 134 $r = new Readability($html, $url);
135
118 $r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES; 136 $r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
137 $r->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;
138
119 if($r->init()) 139 if($r->init())
120 { 140 {
121 $content = $r->articleContent->innerHTML; 141 $content = $r->articleContent->innerHTML;
@@ -362,4 +382,4 @@ function logm($message)
362{ 382{
363 $t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n"; 383 $t = strval(date('Y/m/d_H:i:s')).' - '.$_SERVER["REMOTE_ADDR"].' - '.strval($message)."\n";
364 file_put_contents('./log.txt',$t,FILE_APPEND); 384 file_put_contents('./log.txt',$t,FILE_APPEND);
365} \ No newline at end of file 385}