Fixes #410 - Retrieve title fails in multiple cases

* `get_http_url()` renamed to `get_http_response()`. * Use the same HTTP context to retrieve response headers and content. * Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections). * Add `LinkUtils` to extract titles and charset. * Try to retrieve charset from HTTP headers first (new), then HTML content. * Use mb_string to re-encode title if necessary.
author: ArthurHoaro <arthur@hoa.ro> 2016-01-04 10:45:54 +0100
committer: ArthurHoaro <arthur@hoa.ro> 2016-01-11 21:19:31 +0100
commit: 1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree: 787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5
parent: c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
download: Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip
8 files changed, 285 insertions, 72 deletions
diff --git a/application/ApplicationUtils.php b/application/ApplicationUtils.php
index 274331e1..978fc9da 100644
--- a/application/ApplicationUtils.php
+++ b/application/ApplicationUtils.php
@@ -19,7 +19,7 @@ class ApplicationUtils
     */
    public static function getLatestGitVersionCode($url, $timeout=2)
    {
-        list($headers, $data) = get_http_url($url, $timeout);
+        list($headers, $data) = get_http_response($url, $timeout);
        if (strpos($headers[0], '200 OK') === false) {
            error_log('Failed to retrieve ' . $url);
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 499220c5..e2c1cb47 100644..100755
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -13,7 +13,7 @@
 *  [1] = URL content (downloaded data)
 *
 * Example:
- *  list($headers, $data) = get_http_url('http://sebauvage.net/');
+ *  list($headers, $data) = get_http_response('http://sebauvage.net/');
 *  if (strpos($headers[0], '200 OK') !== false) {
 *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
 *  } else {
@@ -24,31 +24,66 @@
 * @see http://php.net/manual/en/function.stream-context-create.php
 * @see http://php.net/manual/en/function.get-headers.php
 */
-function get_http_url($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
 {
+    $urlObj = new Url($url);
+    if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
+        return array(array(0 => 'Invalid HTTP Url'), false);
+    }
    $options = array(
        'http' => array(
            'method' => 'GET',
            'timeout' => $timeout,
            'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)'
-                         .' Gecko/20100101 Firefox/23.0'
+                         .' Gecko/20100101 Firefox/23.0',
+            'request_fulluri' => true,
        )
    );
    $context = stream_context_create($options);
+    stream_context_set_default($options);
+    list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup());
+    if (! $headers || strpos($headers[0], '200 OK') === false) {
+        return array($headers, false);
+    }
    try {
        // TODO: catch Exception in calling code (thumbnailer)
-        $content = file_get_contents($url, false, $context, -1, $maxBytes);
+        $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
    } catch (Exception $exc) {
        return array(array(0 => 'HTTP Error'), $exc->getMessage());
    }
-    if (!$content) {
+    return array($headers, $content);
-        return array(array(0 => 'HTTP Error'), '');
+}
+/**
+ * Retrieve HTTP headers, following n redirections (temporary and permanent).
+ *
+ * @param string $url initial URL to reach.
+ * @param int $redirectionLimit max redirection follow..
+ *
+ * @return array
+ */
+function get_redirected_headers($url, $redirectionLimit = 3)
+{
+    $headers = get_headers($url, 1);
+    // Headers found, redirection found, and limit not reached.
+    if ($redirectionLimit-- > 0
+        && !empty($headers)
+        && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
+        && !empty($headers['Location'])) {
+        $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
+        if ($redirection != $url) {
+            return get_redirected_headers($redirection, $redirectionLimit);
+        }
    }
-    return array(get_headers($url, 1), $content);
+    return array($headers, $url);
 }
 /**
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
new file mode 100755
index 00000000..26dd6b67
--- /dev/null
+++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
+<?php
+/**
+ * Extract title from an HTML document.
+ *
+ * @param string $html HTML content where to look for a title.
+ *
+ * @return bool|string Extracted title if found, false otherwise.
+ */
+function html_extract_title($html)
+{
+    if (preg_match('!<title>(.*)</title>!is', $html, $matches)) {
+        return trim(str_replace("\n", ' ', $matches[1]));
+    }
+    return false;
+}
+/**
+ * Determine charset from downloaded page.
+ * Priority:
+ *   1. HTTP headers (Content type).
+ *   2. HTML content page (tag <meta charset>).
+ *   3. Use a default charset (default: UTF-8).
+ *
+ * @param array  $headers           HTTP headers array.
+ * @param string $htmlContent       HTML content where to look for charset.
+ * @param string $defaultCharset    Default charset to apply if other methods failed.
+ *
+ * @return string Determined charset.
+ */
+function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+{
+    if ($charset = headers_extract_charset($headers)) {
+        return $charset;
+    }
+    if ($charset = html_extract_charset($htmlContent)) {
+        return $charset;
+    }
+    return $defaultCharset;
+}
+/**
+ * Extract charset from HTTP headers if it's defined.
+ *
+ * @param array $headers HTTP headers array.
+ *
+ * @return bool|string Charset string if found (lowercase), false otherwise.
+ */
+function headers_extract_charset($headers)
+{
+    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
+        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
+        if (! empty($match[1])) {
+            return strtolower(trim($match[1]));
+        }
+    }
+    return false;
+}
+/**
+ * Extract charset HTML content (tag <meta charset>).
+ *
+ * @param string $html HTML content where to look for charset.
+ *
+ * @return bool|string Charset string if found, false otherwise.
+ */
+function html_extract_charset($html)
+{
+    // Get encoding specified in HTML header.
+    preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc);
+    if (!empty($enc[1])) {
+        return strtolower($enc[1]);
+    }
+    return false;
+}
diff --git a/application/Url.php b/application/Url.php
index d80c9c58..a4ac2e73 100644..100755
--- a/application/Url.php
+++ b/application/Url.php
@@ -118,7 +118,7 @@ class Url
     */
    public function __construct($url)
    {
-        $this->parts = parse_url($url);
+        $this->parts = parse_url(trim($url));
        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
@@ -201,4 +201,13 @@ class Url
        }
        return $this->parts['scheme'];
    }
+    /**
+     * Test if the Url is an HTTP one.
+     *
+     * @return true is HTTP, false otherwise.
+     */
+    public function isHttp() {
+        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
+    }
 }
diff --git a/index.php b/index.php
index cd83600b..600b2f55 100644
--- a/index.php
+++ b/index.php
@@ -152,6 +152,7 @@ require_once 'application/FileUtils.php';
 require_once 'application/HttpUtils.php';
 require_once 'application/LinkDB.php';
 require_once 'application/LinkFilter.php';
+require_once 'application/LinkUtils.php';
 require_once 'application/TimeZone.php';
 require_once 'application/Url.php';
 require_once 'application/Utils.php';
@@ -578,13 +579,6 @@ function linkdate2iso8601($linkdate)
    return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format.
 }
-// Extract title from an HTML document.
-// (Returns an empty string if not found.)
-function html_extract_title($html)
-{
-  return preg_match('!<title>(.*?)</title>!is', $html, $matches) ? trim(str_replace("\n",' ', $matches[1])) : '' ;
-}
 // ------------------------------------------------------------------------------------------
 // Token management for XSRF protection
 // Token should be used in any form which acts on data (create,update,delete,import...).
@@ -1642,7 +1636,7 @@ function renderPage()
    // -------- User want to post a new link: Display link edit form.
    if (isset($_GET['post'])) {
-        $url = cleanup_url($_GET['post']);
+        $url = cleanup_url(escape($_GET['post']));
        $link_is_new = false;
        // Check if URL is not already in database (in this case, we will edit the existing link)
@@ -1660,35 +1654,24 @@ function renderPage()
            // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
            if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
                // Short timeout to keep the application responsive
-                list($headers, $data) = get_http_url($url, 4);
+                list($headers, $content) = get_http_response($url, 4);
-                // FIXME: Decode charset according to specified in either 1) HTTP response headers or 2) <head> in html
                if (strpos($headers[0], '200 OK') !== false) {
-                    // Look for charset in html header.
+                    // Retrieve charset.
-                    preg_match('#<meta .*charset=.*>#Usi', $data, $meta);
+                    $charset = get_charset($headers, $content);
+                    // Extract title.
-                    // If found, extract encoding.
+                    $title = html_extract_title($content);
-                    if (!empty($meta[0])) {
+                    // Re-encode title in utf-8 if necessary.
-                        // Get encoding specified in header.
+                    if (! empty($title) && $charset != 'utf-8') {
-                        preg_match('#charset="?(.*)"#si', $meta[0], $enc);
+                        $title = mb_convert_encoding($title, $charset, 'utf-8');
-                        // If charset not found, use utf-8.
-                        $html_charset = (!empty($enc[1])) ? strtolower($enc[1]) : 'utf-8';
-                    }
-                    else {
-                        $html_charset = 'utf-8';
-                    }
-                    // Extract title
-                    $title = html_extract_title($data);
-                    if (!empty($title)) {
-                        // Re-encode title in utf-8 if necessary.
-                        $title = ($html_charset == 'iso-8859-1') ? utf8_encode($title) : $title;
                    }
                }
            }
            if ($url == '') {
                $url = '?' . smallHash($linkdate);
                $title = 'Note: ';
            }
            $link = array(
                'linkdate' => $linkdate,
                'title' => $title,
@@ -2314,11 +2297,11 @@ function genThumbnail()
        else // This is a flickr page (html)
        {
            // Get the flickr html page.
-            list($headers, $data) = get_http_url($url, 20);
+            list($headers, $content) = get_http_response($url, 20);
            if (strpos($headers[0], '200 OK') !== false)
            {
                // flickr now nicely provides the URL of the thumbnail in each flickr page.
-                preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!',$data,$matches);
+                preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!', $content, $matches);
                if (!empty($matches[1])) $imageurl=$matches[1];
                // In albums (and some other pages), the link rel="image_src" is not provided,
@@ -2326,7 +2309,7 @@ function genThumbnail()
                // <meta property="og:image" content="http://farm4.staticflickr.com/3398/3239339068_25d13535ff_z.jpg" />
                if ($imageurl=='')
                {
-                    preg_match('!<meta property=\"og:image\" content=\"(.+?)\"!',$data,$matches);
+                    preg_match('!<meta property=\"og:image\" content=\"(.+?)\"!', $content, $matches);
                    if (!empty($matches[1])) $imageurl=$matches[1];
                }
            }
@@ -2335,11 +2318,12 @@ function genThumbnail()
        if ($imageurl!='')
        {   // Let's download the image.
            // Image is 240x120, so 10 seconds to download should be enough.
-            list($headers, $data) = get_http_url($imageurl, 10);
+            list($headers, $content) = get_http_response($imageurl, 10);
            if (strpos($headers[0], '200 OK') !== false) {
-                file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache.
+                // Save image to cache.
+                file_put_contents($GLOBALS['config']['CACHEDIR'].'/' . $thumbname, $content);
                header('Content-Type: image/jpeg');
-                echo $data;
+                echo $content;
                return;
            }
        }
@@ -2350,16 +2334,17 @@ function genThumbnail()
        // This is more complex: we have to perform a HTTP request, then parse the result.
        // Maybe we should deport this to JavaScript ? Example: http://stackoverflow.com/questions/1361149/get-img-thumbnails-from-vimeo/4285098#4285098
        $vid = substr(parse_url($url,PHP_URL_PATH),1);
-        list($headers, $data) = get_http_url('https://vimeo.com/api/v2/video/'.escape($vid).'.php', 5);
+        list($headers, $content) = get_http_response('https://vimeo.com/api/v2/video/'.escape($vid).'.php', 5);
        if (strpos($headers[0], '200 OK') !== false) {
-            $t = unserialize($data);
+            $t = unserialize($content);
            $imageurl = $t[0]['thumbnail_medium'];
            // Then we download the image and serve it to our client.
-            list($headers, $data) = get_http_url($imageurl, 10);
+            list($headers, $content) = get_http_response($imageurl, 10);
            if (strpos($headers[0], '200 OK') !== false) {
-                file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache.
+                // Save image to cache.
+                file_put_contents($GLOBALS['config']['CACHEDIR'] . '/' . $thumbname, $content);
                header('Content-Type: image/jpeg');
-                echo $data;
+                echo $content;
                return;
            }
        }
@@ -2370,18 +2355,18 @@ function genThumbnail()
        // The thumbnail for TED talks is located in the <link rel="image_src" [...]> tag on that page
        // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html
        // <link rel="image_src" href="http://images.ted.com/images/ted/28bced335898ba54d4441809c5b1112ffaf36781_389x292.jpg" />
-        list($headers, $data) = get_http_url($url, 5);
+        list($headers, $content) = get_http_response($url, 5);
        if (strpos($headers[0], '200 OK') !== false) {
            // Extract the link to the thumbnail
-            preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!',$data,$matches);
+            preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!', $content, $matches);
            if (!empty($matches[1]))
            {   // Let's download the image.
                $imageurl=$matches[1];
                // No control on image size, so wait long enough
-                list($headers, $data) = get_http_url($imageurl, 20);
+                list($headers, $content) = get_http_response($imageurl, 20);
                if (strpos($headers[0], '200 OK') !== false) {
                    $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname;
-                    file_put_contents($filepath,$data); // Save image to cache.
+                    file_put_contents($filepath, $content); // Save image to cache.
                    if (resizeImage($filepath))
                    {
                        header('Content-Type: image/jpeg');
@@ -2398,18 +2383,19 @@ function genThumbnail()
        // There is no thumbnail available for xkcd comics, so download the whole image and resize it.
        // http://xkcd.com/327/
        // <img src="http://imgs.xkcd.com/comics/exploits_of_a_mom.png" title="<BLABLA>" alt="<BLABLA>" />
-        list($headers, $data) = get_http_url($url, 5);
+        list($headers, $content) = get_http_response($url, 5);
        if (strpos($headers[0], '200 OK') !== false) {
            // Extract the link to the thumbnail
-            preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!',$data,$matches);
+            preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!', $content, $matches);
            if (!empty($matches[1]))
            {   // Let's download the image.
                $imageurl=$matches[1];
                // No control on image size, so wait long enough
-                list($headers, $data) = get_http_url($imageurl, 20);
+                list($headers, $content) = get_http_response($imageurl, 20);
                if (strpos($headers[0], '200 OK') !== false) {
                    $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname;
-                    file_put_contents($filepath,$data); // Save image to cache.
+                    // Save image to cache.
+                    file_put_contents($filepath, $content);
                    if (resizeImage($filepath))
                    {
                        header('Content-Type: image/jpeg');
@@ -2425,10 +2411,11 @@ function genThumbnail()
    {
        // For all other domains, we try to download the image and make a thumbnail.
        // We allow 30 seconds max to download (and downloads are limited to 4 Mb)
-        list($headers, $data) = get_http_url($url, 30);
+        list($headers, $content) = get_http_response($url, 30);
        if (strpos($headers[0], '200 OK') !== false) {
            $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname;
-            file_put_contents($filepath,$data); // Save image to cache.
+            // Save image to cache.
+            file_put_contents($filepath, $content);
            if (resizeImage($filepath))
            {
                header('Content-Type: image/jpeg');
diff --git a/tests/HttpUtils/GetHttpUrlTest.php b/tests/HttpUtils/GetHttpUrlTest.php
index 76092b80..fd293505 100644
--- a/tests/HttpUtils/GetHttpUrlTest.php
+++ b/tests/HttpUtils/GetHttpUrlTest.php
@@ -6,7 +6,7 @@
 require_once 'application/HttpUtils.php';
 /**
- * Unitary tests for get_http_url()
+ * Unitary tests for get_http_response()
 */
 class GetHttpUrlTest extends PHPUnit_Framework_TestCase
 {
@@ -15,12 +15,15 @@ class GetHttpUrlTest extends PHPUnit_Framework_TestCase
     */
    public function testGetInvalidLocalUrl()
    {
-        list($headers, $content) = get_http_url('/non/existent', 1);
+        // Local
-        $this->assertEquals('HTTP Error', $headers[0]);
+        list($headers, $content) = get_http_response('/non/existent', 1);
-        $this->assertRegexp(
+        $this->assertEquals('Invalid HTTP Url', $headers[0]);
-            '/failed to open stream: No such file or directory/',
+        $this->assertFalse($content);
-            $content
-        );
+        // Non HTTP
+        list($headers, $content) = get_http_response('ftp://save.tld/mysave', 1);
+        $this->assertEquals('Invalid HTTP Url', $headers[0]);
+        $this->assertFalse($content);
    }
    /**
@@ -28,11 +31,8 @@ class GetHttpUrlTest extends PHPUnit_Framework_TestCase
     */
    public function testGetInvalidRemoteUrl()
    {
-        list($headers, $content) = get_http_url('http://non.existent', 1);
+        list($headers, $content) = @get_http_response('http://non.existent', 1);
-        $this->assertEquals('HTTP Error', $headers[0]);
+        $this->assertFalse($headers);
-        $this->assertRegexp(
+        $this->assertFalse($content);
-            '/Name or service not known/',
-            $content
-        );
    }
 }
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php
new file mode 100644
index 00000000..c2257590
--- /dev/null
+++ b/tests/LinkUtilsTest.php
@@ -0,0 +1,85 @@
+<?php
+require_once 'application/LinkUtils.php';
+/**
+* Class LinkUtilsTest.
+*/
+class LinkUtilsTest extends PHPUnit_Framework_TestCase
+{
+    /**
+     * Test html_extract_title() when the title is found.
+     */
+    public function testHtmlExtractExistentTitle()
+    {
+        $title = 'Read me please.';
+        $html = '<html><meta>stuff</meta><title>'. $title .'</title></html>';
+        $this->assertEquals($title, html_extract_title($html));
+    }
+    /**
+     * Test html_extract_title() when the title is not found.
+     */
+    public function testHtmlExtractNonExistentTitle()
+    {
+        $html = '<html><meta>stuff</meta></html>';
+        $this->assertFalse(html_extract_title($html));
+    }
+    /**
+     * Test get_charset() with all priorities.
+     */
+    public function testGetCharset()
+    {
+        $headers = array('Content-Type' => 'text/html; charset=Headers');
+        $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
+        $default = 'default';
+        $this->assertEquals('headers', get_charset($headers, $html, $default));
+        $this->assertEquals('html', get_charset(array(), $html, $default));
+        $this->assertEquals($default, get_charset(array(), '', $default));
+        $this->assertEquals('utf-8', get_charset(array(), ''));
+    }
+    /**
+     * Test headers_extract_charset() when the charset is found.
+     */
+    public function testHeadersExtractExistentCharset()
+    {
+        $charset = 'x-MacCroatian';
+        $headers = array('Content-Type' => 'text/html; charset='. $charset);
+        $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+    }
+    /**
+     * Test headers_extract_charset() when the charset is not found.
+     */
+    public function testHeadersExtractNonExistentCharset()
+    {
+        $headers = array();
+        $this->assertFalse(headers_extract_charset($headers));
+        $headers = array('Content-Type' => 'text/html');
+        $this->assertFalse(headers_extract_charset($headers));
+    }
+    /**
+     * Test html_extract_charset() when the charset is found.
+     */
+    public function testHtmlExtractExistentCharset()
+    {
+        $charset = 'x-MacCroatian';
+        $html = '<html><meta>stuff2</meta><meta charset="'. $charset .'"/></html>';
+        $this->assertEquals(strtolower($charset), html_extract_charset($html));
+    }
+    /**
+     * Test html_extract_charset() when the charset is not found.
+     */
+    public function testHtmlExtractNonExistentCharset()
+    {
+        $html = '<html><meta>stuff</meta></html>';
+        $this->assertFalse(html_extract_charset($html));
+        $html = '<html><meta>stuff</meta><meta charset=""/></html>';
+        $this->assertFalse(html_extract_charset($html));
+    }
+}
diff --git a/tests/Url/UrlTest.php b/tests/Url/UrlTest.php
index af6daaa4..425327ed 100644
--- a/tests/Url/UrlTest.php
+++ b/tests/Url/UrlTest.php
@@ -156,4 +156,22 @@ class UrlTest extends PHPUnit_Framework_TestCase
        $this->assertEquals($strOn, add_trailing_slash($strOn));
        $this->assertEquals($strOn, add_trailing_slash($strOff));
    }
+    /**
+     * Test valid HTTP url.
+     */
+    function testUrlIsHttp()
+    {
+        $url = new Url(self::$baseUrl);
+        $this->assertTrue($url->isHttp());
+    }
+    /**
+     * Test non HTTP url.
+     */
+    function testUrlIsNotHttp()
+    {
+        $url = new Url('ftp://save.tld/mysave');
+        $this->assertFalse($url->isHttp());
+    }
 }
author	ArthurHoaro <arthur@hoa.ro>	2016-01-04 10:45:54 +0100
committer	ArthurHoaro <arthur@hoa.ro>	2016-01-11 21:19:31 +0100
commit	1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree	787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5
parent	c0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
download	Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip