Ajout du global msg et gestion du retour dans Readability

author memiks <memiks@memiks.fr>

Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)

committer memiks <memiks@memiks.fr>

Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)
author memiks <memiks@memiks.fr>
Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)
committer memiks <memiks@memiks.fr>
Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)
diff --git a/inc/Readability.php b/inc/Readability.php

index c50bf2efa1911acd3129e69cd1648e4584fd9d4b..19298c1324ed8828f380672f8bc6bf980c8d29c8 100644 (file)
--- a/inc/Readability.php
+++ b/inc/Readability.php
@@ -80,7 +80,7 @@ class Readability
         public $debug = false;
         protected $body = null; // 
         protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
-       protected $flags = self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4;   // Start with all flags set.
+       protected $flags =  Self::FLAG_CLEAN_CONDITIONALLY; // 1 | 2 | 4;   // Start with all flags set.
         protected $success = false; // indicates whether we were able to extract or not
         
         /**
@@ -90,7 +90,7 @@ class Readability
         public $regexps = array(
                 'unlikelyCandidates' => '/combx|comment|comments|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i',
                 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
-               'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i',
+               'positive' => '/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story|attachment/i',
                 'negative' => '/combx|comment|comments|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
                 'divToPElements' => '/<(a|blockquote|dl|div|ol|p|pre|table|ul)/i',
                 'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
@@ -106,7 +106,7 @@ class Readability
         * Create instance of Readability
         * @param string UTF-8 encoded string
         * @param string (optional) URL associated with HTML (used for footnotes)
-       */      
+       */
         function __construct($html, $url=null)
         {
                 /* Turn all double br's into p's */
@@ -185,6 +185,7 @@ class Readability
                         $articleContent = $this->dom->createElement('div');
                         $articleContent->setAttribute('id', 'readability-content');
                         $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';            
+                       return $this->success;
                 }
                 
                 $overlay->setAttribute('id', 'readOverlay');
diff --git a/inc/functions.php b/inc/functions.php

index 205f3968130dd51da4809679ca5ae2cb758479dc..b27120c59a44d82d053cb73e7ff5dc95f57fe874 100644 (file)
--- a/inc/functions.php
+++ b/inc/functions.php
@@ -39,6 +39,10 @@ function get_external_file($url)
          curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
          curl_setopt($curl, CURLOPT_HEADER, false);
  
+               // FOR SSL do not verified certificate
+        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
+               curl_setopt($curl, CURLOPT_AUTOREFERER, TRUE );
+
          // FeedBurner requires a proper USER-AGENT...
          curl_setopt($curl, CURL_HTTP_VERSION_1_1, true);
          curl_setopt($curl, CURLOPT_ENCODING, "gzip, deflate");
@@ -54,7 +58,15 @@ function get_external_file($url)
      } else {
  
          // create http context and add timeout and user-agent
-        $context = stream_context_create(array('http'=>array('timeout' => $timeout,'header'=> "User-Agent: ".$useragent,/*spoot Mozilla Firefox*/'follow_location' => true)));
+        $context = stream_context_create(array(
+                                                               'http'=>array('timeout' => $timeout,
+                                                                               'header'=> "User-Agent: ".$useragent,   /*spoot Mozilla Firefox*/
+                                                                               'follow_location' => true),
+                                                               // FOR SSL do not verified certificate
+                                                               'ssl' => array('verify_peer' => false,
+                                                                               'allow_self_signed' => true)
+                                                               )
+                                               );
  
          // only download page lesser than 4MB
          $data = @file_get_contents($url, false, $context, -1, 4000000); // We download at most 4 MB from source.
@@ -98,6 +110,8 @@ function get_external_file($url)
   */
  function prepare_url($url)
  {
+    global $msg;
+
      $parametres = array();
      $url        = html_entity_decode(trim($url));
  
@@ -108,14 +122,21 @@ function prepare_url($url)
      $i=strpos($url,'#xtor=RSS-'); if ($i!==false) $url=substr($url,0,$i);
  
      $title = $url;
-    if (!preg_match('!^https?://!i', $url))
-        $url = 'http://' . $url;
+       $html = Encoding::toUTF8(get_external_file($url,15));
+       // If get_external_file if not able to retrieve HTTPS content try the same URL with HTTP protocol
+       if (!preg_match('!^https?://!i', $url) && (!isset($html) || strlen($html) <= 0)) {
+                       $url = 'http://' . $url;
+                       $html = Encoding::toUTF8(get_external_file($url,15));
+       }
  
-    $html = Encoding::toUTF8(get_external_file($url,15));
      if (isset($html) and strlen($html) > 0)
      {
          $r = new Readability($html, $url);
+               
          $r->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
+               $r->debug=true;
+               $r->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;
+
          if($r->init())
          {
              $content = $r->articleContent->innerHTML;
author	memiks <memiks@memiks.fr>
	Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)
committer	memiks <memiks@memiks.fr>
	Tue, 23 Apr 2013 12:22:19 +0000 (07:22 -0500)
inc/Readability.php		patch | blob | blame | history
inc/functions.php		patch | blob | blame | history