Merge pull request #977 from ArthurHoaro/feature/dl-filter

Extract the title/charset during page download, and check content type
author: ArthurHoaro <arthur@hoa.ro> 2018-01-23 18:41:38 +0100
committer: GitHub <noreply@github.com> 2018-01-23 18:41:38 +0100
commit: d449f79a0d7ca808b891baf73b9e25ce7f7e48fe (patch)
tree: 66e40b38bfce1475b745d6a1227f478f8e99ab75
parent: 5f8c3f532ed16ad5b789f75e9ff745e5329271c3 (diff)
parent: d65342e304f92643ba922200953cfebc51e1e482 (diff)
download: Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.gz
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.zst
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.zip
4 files changed, 293 insertions, 68 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index c9371b55..83a4c5e2 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method 
 *
- * @param string $url      URL to get (http://...)
+ * @param string          $url               URL to get (http://...)
- * @param int    $timeout  network timeout (in seconds)
+ * @param int             $timeout           network timeout (in seconds)
- * @param int    $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                           Can be used to add download conditions on the headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@@ -29,7 +31,7 @@
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
 {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
    curl_setopt($ch, CURLOPT_TIMEOUT,           $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT,         $userAgent);
+    if (is_callable($curlWriteFunction)) {
+        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+    }
    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE,        1024*16);
    curl_setopt($ch, CURLOPT_NOPROGRESS,        false);
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index e3d95d08..3705f7e9 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
 <?php
 /**
- * Extract title from an HTML document.
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
- * @param string $html HTML content where to look for a title.
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionnaly overrides curl_getinfo function
 *
- * @return bool|string Extracted title if found, false otherwise.
+ * @return Closure
 */
-function html_extract_title($html)
+function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
 {
-    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
+    /**
-        return trim(str_replace("\n", '', $matches[1]));
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
-    }
+     *
-    return false;
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        if (!empty($responseCode) && $responseCode != 200) {
+            return false;
+        }
+        $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+        }
+        // We got everything we want, stop the download.
+        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+            return false;
+        }
+        return strlen($data);
+    };
 }
 /**
- * Determine charset from downloaded page.
+ * Extract title from an HTML document.
- * Priority:
- *   1. HTTP headers (Content type).
- *   2. HTML content page (tag <meta charset>).
- *   3. Use a default charset (default: UTF-8).
 *
- * @param array  $headers           HTTP headers array.
+ * @param string $html HTML content where to look for a title.
- * @param string $htmlContent       HTML content where to look for charset.
- * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
- * @return string Determined charset.
+ * @return bool|string Extracted title if found, false otherwise.
 */
-function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+function html_extract_title($html)
 {
-    if ($charset = headers_extract_charset($headers)) {
+    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
-        return $charset;
+        return trim(str_replace("\n", '', $matches[1]));
-    }
-    if ($charset = html_extract_charset($htmlContent)) {
-        return $charset;
    }
+    return false;
-    return $defaultCharset;
 }
 /**
- * Extract charset from HTTP headers if it's defined.
+ * Extract charset from HTTP header if it's defined.
 *
- * @param array $headers HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
-function headers_extract_charset($headers)
+function header_extract_charset($header)
 {
-    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
+    preg_match('/charset="?([^; ]+)/i', $header, $match);
-        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
+    if (! empty($match[1])) {
-        if (! empty($match[1])) {
+        return strtolower(trim($match[1]));
-            return strtolower(trim($match[1]));
-        }
    }
    return false;
diff --git a/index.php b/index.php
index 27335a36..d57789e6 100644
--- a/index.php
+++ b/index.php
@@ -1425,16 +1425,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager)
            // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
            if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
                // Short timeout to keep the application responsive
-                list($headers, $content) = get_http_response($url, 4);
+                // The callback will fill $charset and $title with data from the downloaded page.
-                if (strpos($headers[0], '200 OK') !== false) {
+                get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
-                    // Retrieve charset.
+                if (! empty($title) && strtolower($charset) != 'utf-8') {
-                    $charset = get_charset($headers, $content);
+                    $title = mb_convert_encoding($title, 'utf-8', $charset);
-                    // Extract title.
-                    $title = html_extract_title($content);
-                    // Re-encode title in utf-8 if necessary.
-                    if (! empty($title) && strtolower($charset) != 'utf-8') {
-                        $title = mb_convert_encoding($title, 'utf-8', $charset);
-                    }
                }
            }
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php
index 99679320..7fbd59b0 100644
--- a/tests/LinkUtilsTest.php
+++ b/tests/LinkUtilsTest.php
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
    }
    /**
-     * Test get_charset() with all priorities.
-     */
-    public function testGetCharset()
-    {
-        $headers = array('Content-Type' => 'text/html; charset=Headers');
-        $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
-        $default = 'default';
-        $this->assertEquals('headers', get_charset($headers, $html, $default));
-        $this->assertEquals('html', get_charset(array(), $html, $default));
-        $this->assertEquals($default, get_charset(array(), '', $default));
-        $this->assertEquals('utf-8', get_charset(array(), ''));
-    }
-    /**
     * Test headers_extract_charset() when the charset is found.
     */
    public function testHeadersExtractExistentCharset()
    {
        $charset = 'x-MacCroatian';
-        $headers = array('Content-Type' => 'text/html; charset='. $charset);
+        $headers = 'text/html; charset='. $charset;
-        $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+        $this->assertEquals(strtolower($charset), header_extract_charset($headers));
    }
    /**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
     */
    public function testHeadersExtractNonExistentCharset()
    {
-        $headers = array();
+        $headers = '';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
-        $headers = array('Content-Type' => 'text/html');
+        $headers = 'text/html';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
    }
    /**
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
    }
    /**
+     * Test the download callback with valid value
+     */
+    public function testCurlDownloadCallbackOk()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'Server: GitHub.com',
+            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
+            'Content-Type: text/html; charset=utf-8',
+            'Status: 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and no charset
+     */
+    public function testCurlDownloadCallbackOkNoCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEmpty($charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and no charset
+     */
+    public function testCurlDownloadCallbackOkHtmlCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and no title
+     */
+    public function testCurlDownloadCallbackOkNoTitle()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
+            'ignored',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid content type.
+     */
+    public function testCurlDownloadCallbackInvalidContentType()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid response code.
+     */
+    public function testCurlDownloadCallbackInvalidResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid content type and response code.
+     */
+    public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
     * Test count_private.
     */
    public function testCountPrivateLinks()
@@ -207,3 +318,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
        return str_replace('$1', $hashtag, $hashtagLink);
    }
 }
+// old style mock: PHPUnit doesn't allow function mock
+/**
+ * Returns code 200 or html content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_ok($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+/**
+ * Returns code 200 or html content type without charset.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_no_charset($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html';
+    }
+}
+/**
+ * Invalid response code.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/html'
+ */
+function ut_curl_getinfo_rc_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+/**
+ * Invalid content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/plain'
+ */
+function ut_curl_getinfo_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
+/**
+ * Invalid response code and content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/plain'
+ */
+function ut_curl_getinfo_rs_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
author	ArthurHoaro <arthur@hoa.ro>	2018-01-23 18:41:38 +0100
committer	GitHub <noreply@github.com>	2018-01-23 18:41:38 +0100
commit	d449f79a0d7ca808b891baf73b9e25ce7f7e48fe (patch)
tree	66e40b38bfce1475b745d6a1227f478f8e99ab75
parent	5f8c3f532ed16ad5b789f75e9ff745e5329271c3 (diff)
parent	d65342e304f92643ba922200953cfebc51e1e482 (diff)
download	Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.gz Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.zst Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.zip

diff --git a/application/HttpUtils.php b/application/HttpUtils.php index c9371b55..83a4c5e2 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
3	* GET an HTTP URL to retrieve its content	3	* GET an HTTP URL to retrieve its content
4	* Uses the cURL library or a fallback method	4	* Uses the cURL library or a fallback method
5	*	5	*
6	* @param string $url URL to get (http://...)	6	* @param string $url URL to get (http://...)
7	* @param int $timeout network timeout (in seconds)	7	* @param int $timeout network timeout (in seconds)
8	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)	8	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
		9	* @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
		10	* Can be used to add download conditions on the headers (response code, content type, etc.).
9	*	11	*
10	* @return array HTTP response headers, downloaded content	12	* @return array HTTP response headers, downloaded content
11	*	13	*
@@ -29,7 +31,7 @@
29	* @see http://stackoverflow.com/q/9183178	31	* @see http://stackoverflow.com/q/9183178
30	* @see http://stackoverflow.com/q/1462720	32	* @see http://stackoverflow.com/q/1462720
31	*/	33	*/
32	function get_http_response($url, $timeout = 30, $maxBytes = 4194304)	34	function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
33	{	35	{
34	$urlObj = new Url($url);	36	$urlObj = new Url($url);
35	$cleanUrl = $urlObj->idnToAscii();	37	$cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
75	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);	77	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
76	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);	78	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
77		79
		80	if (is_callable($curlWriteFunction)) {
		81	curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
		82	}
		83
78	// Max download size management	84	// Max download size management
79	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);	85	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
80	curl_setopt($ch, CURLOPT_NOPROGRESS, false);	86	curl_setopt($ch, CURLOPT_NOPROGRESS, false);


diff --git a/application/LinkUtils.php b/application/LinkUtils.php index e3d95d08..3705f7e9 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1	<?php	1	<?php
2		2
3	/**	3	/**
4	* Extract title from an HTML document.	4	* Get cURL callback function for CURLOPT_WRITEFUNCTION
5	*	5	*
6	* @param string $html HTML content where to look for a title.	6	* @param string $charset to extract from the downloaded page (reference)
		7	* @param string $title to extract from the downloaded page (reference)
		8	* @param string $curlGetInfo Optionnaly overrides curl_getinfo function
7	*	9	*
8	* @return bool|string Extracted title if found, false otherwise.	10	* @return Closure
9	*/	11	*/
10	function html_extract_title($html)	12	function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11	{	13	{
12	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {	14	/**
13	return trim(str_replace("\n", '', $matches[1]));	15	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14	}	16	*
15	return false;	17	* While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
		18	* Then we extract the title and the charset and stop the download when it's done.
		19	*
		20	* @param resource $ch cURL resource
		21	* @param string $data chunk of data being downloaded
		22	*
		23	* @return int|bool length of $data or false if we need to stop the download
		24	*/
		25	return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
		26	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
		27	if (!empty($responseCode) && $responseCode != 200) {
		28	return false;
		29	}
		30	$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
		31	if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
		32	return false;
		33	}
		34	if (empty($charset)) {
		35	$charset = header_extract_charset($contentType);
		36	}
		37	if (empty($charset)) {
		38	$charset = html_extract_charset($data);
		39	}
		40	if (empty($title)) {
		41	$title = html_extract_title($data);
		42	}
		43	// We got everything we want, stop the download.
		44	if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
		45	return false;
		46	}
		47
		48	return strlen($data);
		49	};
16	}	50	}
17		51
18	/**	52	/**
19	* Determine charset from downloaded page.	53	* Extract title from an HTML document.
20	* Priority:
21	* 1. HTTP headers (Content type).
22	* 2. HTML content page (tag <meta charset>).
23	* 3. Use a default charset (default: UTF-8).
24	*	54	*
25	* @param array $headers HTTP headers array.	55	* @param string $html HTML content where to look for a title.
26	* @param string $htmlContent HTML content where to look for charset.
27	* @param string $defaultCharset Default charset to apply if other methods failed.
28	*	56	*
29	* @return string Determined charset.	57	* @return bool|string Extracted title if found, false otherwise.
30	*/	58	*/
31	function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')	59	function html_extract_title($html)
32	{	60	{
33	if ($charset = headers_extract_charset($headers)) {	61	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34	return $charset;	62	return trim(str_replace("\n", '', $matches[1]));
35	}
36
37	if ($charset = html_extract_charset($htmlContent)) {
38	return $charset;
39	}	63	}
40		64	return false;
41	return $defaultCharset;
42	}	65	}
43		66
44	/**	67	/**
45	* Extract charset from HTTP headers if it's defined.	68	* Extract charset from HTTP header if it's defined.
46	*	69	*
47	* @param array $headers HTTP headers array.	70	* @param string $header HTTP header Content-Type line.
48	*	71	*
49	* @return bool|string Charset string if found (lowercase), false otherwise.	72	* @return bool|string Charset string if found (lowercase), false otherwise.
50	*/	73	*/
51	function headers_extract_charset($headers)	74	function header_extract_charset($header)
52	{	75	{
53	if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {	76	preg_match('/charset="?([^; ]+)/i', $header, $match);
54	preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);	77	if (! empty($match[1])) {
55	if (! empty($match[1])) {	78	return strtolower(trim($match[1]));
56	return strtolower(trim($match[1]));
57	}
58	}	79	}
59		80
60	return false;	81	return false;


diff --git a/index.php b/index.php index 27335a36..d57789e6 100644 --- a/index.php +++ b/index.php
@@ -1425,16 +1425,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager)
1425	// If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)	1425	// If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
1426	if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {	1426	if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
1427	// Short timeout to keep the application responsive	1427	// Short timeout to keep the application responsive
1428	list($headers, $content) = get_http_response($url, 4);	1428	// The callback will fill $charset and $title with data from the downloaded page.
1429	if (strpos($headers[0], '200 OK') !== false) {	1429	get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
1430	// Retrieve charset.	1430	if (! empty($title) && strtolower($charset) != 'utf-8') {
1431	$charset = get_charset($headers, $content);	1431	$title = mb_convert_encoding($title, 'utf-8', $charset);
1432	// Extract title.
1433	$title = html_extract_title($content);
1434	// Re-encode title in utf-8 if necessary.
1435	if (! empty($title) && strtolower($charset) != 'utf-8') {
1436	$title = mb_convert_encoding($title, 'utf-8', $charset);
1437	}
1438	}	1432	}
1439	}	1433	}
1440		1434


diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php index 99679320..7fbd59b0 100644 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
29	}	29	}
30		30
31	/**	31	/**
32	* Test get_charset() with all priorities.
33	*/
34	public function testGetCharset()
35	{
36	$headers = array('Content-Type' => 'text/html; charset=Headers');
37	$html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
38	$default = 'default';
39	$this->assertEquals('headers', get_charset($headers, $html, $default));
40	$this->assertEquals('html', get_charset(array(), $html, $default));
41	$this->assertEquals($default, get_charset(array(), '', $default));
42	$this->assertEquals('utf-8', get_charset(array(), ''));
43	}
44
45	/**
46	* Test headers_extract_charset() when the charset is found.	32	* Test headers_extract_charset() when the charset is found.
47	*/	33	*/
48	public function testHeadersExtractExistentCharset()	34	public function testHeadersExtractExistentCharset()
49	{	35	{
50	$charset = 'x-MacCroatian';	36	$charset = 'x-MacCroatian';
51	$headers = array('Content-Type' => 'text/html; charset='. $charset);	37	$headers = 'text/html; charset='. $charset;
52	$this->assertEquals(strtolower($charset), headers_extract_charset($headers));	38	$this->assertEquals(strtolower($charset), header_extract_charset($headers));
53	}	39	}
54		40
55	/**	41	/**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
57	*/	43	*/
58	public function testHeadersExtractNonExistentCharset()	44	public function testHeadersExtractNonExistentCharset()
59	{	45	{
60	$headers = array();	46	$headers = '';
61	$this->assertFalse(headers_extract_charset($headers));	47	$this->assertFalse(header_extract_charset($headers));
62		48
63	$headers = array('Content-Type' => 'text/html');	49	$headers = 'text/html';
64	$this->assertFalse(headers_extract_charset($headers));	50	$this->assertFalse(header_extract_charset($headers));
65	}	51	}
66		52
67	/**	53	/**
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
86	}	72	}
87		73
88	/**	74	/**
		75	* Test the download callback with valid value
		76	*/
		77	public function testCurlDownloadCallbackOk()
		78	{
		79	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
		80	$data = [
		81	'HTTP/1.1 200 OK',
		82	'Server: GitHub.com',
		83	'Date: Sat, 28 Oct 2017 12:01:33 GMT',
		84	'Content-Type: text/html; charset=utf-8',
		85	'Status: 200 OK',
		86	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		87	'<title>ignored</title>',
		88	];
		89	foreach ($data as $key => $line) {
		90	$ignore = null;
		91	$expected = $key !== 'end' ? strlen($line) : false;
		92	$this->assertEquals($expected, $callback($ignore, $line));
		93	if ($expected === false) {
		94	break;
		95	}
		96	}
		97	$this->assertEquals('utf-8', $charset);
		98	$this->assertEquals('Refactoring · GitHub', $title);
		99	}
		100
		101	/**
		102	* Test the download callback with valid values and no charset
		103	*/
		104	public function testCurlDownloadCallbackOkNoCharset()
		105	{
		106	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
		107	$data = [
		108	'HTTP/1.1 200 OK',
		109	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		110	'<title>ignored</title>',
		111	];
		112	foreach ($data as $key => $line) {
		113	$ignore = null;
		114	$this->assertEquals(strlen($line), $callback($ignore, $line));
		115	}
		116	$this->assertEmpty($charset);
		117	$this->assertEquals('Refactoring · GitHub', $title);
		118	}
		119
		120	/**
		121	* Test the download callback with valid values and no charset
		122	*/
		123	public function testCurlDownloadCallbackOkHtmlCharset()
		124	{
		125	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
		126	$data = [
		127	'HTTP/1.1 200 OK',
		128	'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
		129	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		130	'<title>ignored</title>',
		131	];
		132	foreach ($data as $key => $line) {
		133	$ignore = null;
		134	$expected = $key !== 'end' ? strlen($line) : false;
		135	$this->assertEquals($expected, $callback($ignore, $line));
		136	if ($expected === false) {
		137	break;
		138	}
		139	}
		140	$this->assertEquals('utf-8', $charset);
		141	$this->assertEquals('Refactoring · GitHub', $title);
		142	}
		143
		144	/**
		145	* Test the download callback with valid values and no title
		146	*/
		147	public function testCurlDownloadCallbackOkNoTitle()
		148	{
		149	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
		150	$data = [
		151	'HTTP/1.1 200 OK',
		152	'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
		153	'ignored',
		154	];
		155	foreach ($data as $key => $line) {
		156	$ignore = null;
		157	$this->assertEquals(strlen($line), $callback($ignore, $line));
		158	}
		159	$this->assertEquals('utf-8', $charset);
		160	$this->assertEmpty($title);
		161	}
		162
		163	/**
		164	* Test the download callback with an invalid content type.
		165	*/
		166	public function testCurlDownloadCallbackInvalidContentType()
		167	{
		168	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
		169	$ignore = null;
		170	$this->assertFalse($callback($ignore, ''));
		171	$this->assertEmpty($charset);
		172	$this->assertEmpty($title);
		173	}
		174
		175	/**
		176	* Test the download callback with an invalid response code.
		177	*/
		178	public function testCurlDownloadCallbackInvalidResponseCode()
		179	{
		180	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
		181	$ignore = null;
		182	$this->assertFalse($callback($ignore, ''));
		183	$this->assertEmpty($charset);
		184	$this->assertEmpty($title);
		185	}
		186
		187	/**
		188	* Test the download callback with an invalid content type and response code.
		189	*/
		190	public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
		191	{
		192	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
		193	$ignore = null;
		194	$this->assertFalse($callback($ignore, ''));
		195	$this->assertEmpty($charset);
		196	$this->assertEmpty($title);
		197	}
		198
		199	/**
89	* Test count_private.	200	* Test count_private.
90	*/	201	*/
91	public function testCountPrivateLinks()	202	public function testCountPrivateLinks()
@@ -207,3 +318,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
207	return str_replace('$1', $hashtag, $hashtagLink);	318	return str_replace('$1', $hashtag, $hashtagLink);
208	}	319	}
209	}	320	}
		321
		322	// old style mock: PHPUnit doesn't allow function mock
		323
		324	/**
		325	* Returns code 200 or html content type.
		326	*
		327	* @param resource $ch cURL resource
		328	* @param int $type cURL info type
		329	*
		330	* @return int|string 200 or 'text/html'
		331	*/
		332	function ut_curl_getinfo_ok($ch, $type)
		333	{
		334	switch ($type) {
		335	case CURLINFO_RESPONSE_CODE:
		336	return 200;
		337	case CURLINFO_CONTENT_TYPE:
		338	return 'text/html; charset=utf-8';
		339	}
		340	}
		341
		342	/**
		343	* Returns code 200 or html content type without charset.
		344	*
		345	* @param resource $ch cURL resource
		346	* @param int $type cURL info type
		347	*
		348	* @return int|string 200 or 'text/html'
		349	*/
		350	function ut_curl_getinfo_no_charset($ch, $type)
		351	{
		352	switch ($type) {
		353	case CURLINFO_RESPONSE_CODE:
		354	return 200;
		355	case CURLINFO_CONTENT_TYPE:
		356	return 'text/html';
		357	}
		358	}
		359
		360	/**
		361	* Invalid response code.
		362	*
		363	* @param resource $ch cURL resource
		364	* @param int $type cURL info type
		365	*
		366	* @return int|string 404 or 'text/html'
		367	*/
		368	function ut_curl_getinfo_rc_ko($ch, $type)
		369	{
		370	switch ($type) {
		371	case CURLINFO_RESPONSE_CODE:
		372	return 404;
		373	case CURLINFO_CONTENT_TYPE:
		374	return 'text/html; charset=utf-8';
		375	}
		376	}
		377
		378	/**
		379	* Invalid content type.
		380	*
		381	* @param resource $ch cURL resource
		382	* @param int $type cURL info type
		383	*
		384	* @return int|string 200 or 'text/plain'
		385	*/
		386	function ut_curl_getinfo_ct_ko($ch, $type)
		387	{
		388	switch ($type) {
		389	case CURLINFO_RESPONSE_CODE:
		390	return 200;
		391	case CURLINFO_CONTENT_TYPE:
		392	return 'text/plain';
		393	}
		394	}
		395
		396	/**
		397	* Invalid response code and content type.
		398	*
		399	* @param resource $ch cURL resource
		400	* @param int $type cURL info type
		401	*
		402	* @return int|string 404 or 'text/plain'
		403	*/
		404	function ut_curl_getinfo_rs_ct_ko($ch, $type)
		405	{
		406	switch ($type) {
		407	case CURLINFO_RESPONSE_CODE:
		408	return 404;
		409	case CURLINFO_CONTENT_TYPE:
		410	return 'text/plain';
		411	}
		412	}
		413