Extract the title/charset during page download, and check content type

Use CURLOPT_WRITEFUNCTION to check the response code and content type (only HTML is allowed). Also extract the title and charset while each chunk of data is being downloaded, and stop the download once everything has been extracted. Closes #579
author: ArthurHoaro <arthur@hoa.ro> 2017-09-30 11:04:13 +0200
committer: ArthurHoaro <arthur@hoa.ro> 2017-10-28 14:35:49 +0200
commit: d65342e304f92643ba922200953cfebc51e1e482 (patch)
tree: 3097c77bb4dd0590c4644422b5dc4369a4186eb7
parent: a59bbf50d7530d7e82a91896a210b9da49cb1568 (diff)
download: Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.gz
Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.zst
Shaarli-d65342e304f92643ba922200953cfebc51e1e482.zip
4 files changed, 293 insertions, 68 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 00835966..2edf5ce2 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method 
 *
- * @param string $url      URL to get (http://...)
+ * @param string          $url               URL to get (http://...)
- * @param int    $timeout  network timeout (in seconds)
+ * @param int             $timeout           network timeout (in seconds)
- * @param int    $maxBytes maximum downloaded bytes (default: 4 MiB)
+ * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                           Can be used to add download conditions on the headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@@ -29,7 +31,7 @@
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
+function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
 {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
    curl_setopt($ch, CURLOPT_TIMEOUT,           $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT,         $userAgent);
+    if (is_callable($curlWriteFunction)) {
+        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
+    }
    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE,        1024);
    curl_setopt($ch, CURLOPT_NOPROGRESS,        false);
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index 976474de..c0dd32a6 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
 <?php
 /**
- * Extract title from an HTML document.
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
- * @param string $html HTML content where to look for a title.
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
 *
- * @return bool|string Extracted title if found, false otherwise.
+ * @return Closure
 */
-function html_extract_title($html)
+function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
 {
-    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
+    /**
-        return trim(str_replace("\n", '', $matches[1]));
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
-    }
+     *
-    return false;
+     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
+     * Then we extract the title and the charset and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        if (!empty($responseCode) && $responseCode != 200) {
+            return false;
+        }
+        $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+        }
+        // We got everything we want, stop the download.
+        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+            return false;
+        }
+        return strlen($data);
+    };
 }
 /**
- * Determine charset from downloaded page.
+ * Extract title from an HTML document.
- * Priority:
- *   1. HTTP headers (Content type).
- *   2. HTML content page (tag <meta charset>).
- *   3. Use a default charset (default: UTF-8).
 *
- * @param array  $headers           HTTP headers array.
+ * @param string $html HTML content where to look for a title.
- * @param string $htmlContent       HTML content where to look for charset.
- * @param string $defaultCharset    Default charset to apply if other methods failed.
 *
- * @return string Determined charset.
+ * @return bool|string Extracted title if found, false otherwise.
 */
-function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
+function html_extract_title($html)
 {
-    if ($charset = headers_extract_charset($headers)) {
+    if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
-        return $charset;
+        return trim(str_replace("\n", '', $matches[1]));
-    }
-    if ($charset = html_extract_charset($htmlContent)) {
-        return $charset;
    }
+    return false;
-    return $defaultCharset;
 }
 /**
- * Extract charset from HTTP headers if it's defined.
+ * Extract charset from HTTP header if it's defined.
 *
- * @param array $headers HTTP headers array.
+ * @param string $header HTTP header Content-Type line.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
-function headers_extract_charset($headers)
+function header_extract_charset($header)
 {
-    if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {
+    preg_match('/charset="?([^; ]+)/i', $header, $match);
-        preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);
+    if (! empty($match[1])) {
-        if (! empty($match[1])) {
+        return strtolower(trim($match[1]));
-            return strtolower(trim($match[1]));
-        }
    }
    return false;
diff --git a/index.php b/index.php
index fb00a9fa..ac51038d 100644
--- a/index.php
+++ b/index.php
@@ -1428,16 +1428,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history)
            // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
            if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
                // Short timeout to keep the application responsive
-                list($headers, $content) = get_http_response($url, 4);
+                // The callback will fill $charset and $title with data from the downloaded page.
-                if (strpos($headers[0], '200 OK') !== false) {
+                get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
-                    // Retrieve charset.
+                if (! empty($title) && strtolower($charset) != 'utf-8') {
-                    $charset = get_charset($headers, $content);
+                    $title = mb_convert_encoding($title, 'utf-8', $charset);
-                    // Extract title.
-                    $title = html_extract_title($content);
-                    // Re-encode title in utf-8 if necessary.
-                    if (! empty($title) && strtolower($charset) != 'utf-8') {
-                        $title = mb_convert_encoding($title, 'utf-8', $charset);
-                    }
                }
            }
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php
index 7c0d4b0b..ef650f44 100644
--- a/tests/LinkUtilsTest.php
+++ b/tests/LinkUtilsTest.php
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
    }
    /**
-     * Test get_charset() with all priorities.
-     */
-    public function testGetCharset()
-    {
-        $headers = array('Content-Type' => 'text/html; charset=Headers');
-        $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
-        $default = 'default';
-        $this->assertEquals('headers', get_charset($headers, $html, $default));
-        $this->assertEquals('html', get_charset(array(), $html, $default));
-        $this->assertEquals($default, get_charset(array(), '', $default));
-        $this->assertEquals('utf-8', get_charset(array(), ''));
-    }
-    /**
     * Test headers_extract_charset() when the charset is found.
     */
    public function testHeadersExtractExistentCharset()
    {
        $charset = 'x-MacCroatian';
-        $headers = array('Content-Type' => 'text/html; charset='. $charset);
+        $headers = 'text/html; charset='. $charset;
-        $this->assertEquals(strtolower($charset), headers_extract_charset($headers));
+        $this->assertEquals(strtolower($charset), header_extract_charset($headers));
    }
    /**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
     */
    public function testHeadersExtractNonExistentCharset()
    {
-        $headers = array();
+        $headers = '';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
-        $headers = array('Content-Type' => 'text/html');
+        $headers = 'text/html';
-        $this->assertFalse(headers_extract_charset($headers));
+        $this->assertFalse(header_extract_charset($headers));
    }
    /**
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
    }
    /**
+     * Test the download callback with valid value
+     */
+    public function testCurlDownloadCallbackOk()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'Server: GitHub.com',
+            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
+            'Content-Type: text/html; charset=utf-8',
+            'Status: 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and no charset
+     */
+    public function testCurlDownloadCallbackOkNoCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEmpty($charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and a charset declared only in the HTML content
+     */
+    public function testCurlDownloadCallbackOkHtmlCharset()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
+        $data = [
+            'HTTP/1.1 200 OK',
+            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
+            'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
+            '<title>ignored</title>',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $expected = $key !== 'end' ? strlen($line) : false;
+            $this->assertEquals($expected, $callback($ignore, $line));
+            if ($expected === false) {
+                break;
+            }
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEquals('Refactoring · GitHub', $title);
+    }
+    /**
+     * Test the download callback with valid values and no title
+     */
+    public function testCurlDownloadCallbackOkNoTitle()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
+        $data = [
+            'HTTP/1.1 200 OK',
+            'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
+            'ignored',
+        ];
+        foreach ($data as $key => $line) {
+            $ignore = null;
+            $this->assertEquals(strlen($line), $callback($ignore, $line));
+        }
+        $this->assertEquals('utf-8', $charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid content type.
+     */
+    public function testCurlDownloadCallbackInvalidContentType()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid response code.
+     */
+    public function testCurlDownloadCallbackInvalidResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
+     * Test the download callback with an invalid content type and response code.
+     */
+    public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
+    {
+        $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
+        $ignore = null;
+        $this->assertFalse($callback($ignore, ''));
+        $this->assertEmpty($charset);
+        $this->assertEmpty($title);
+    }
+    /**
     * Test count_private.
     */
    public function testCountPrivateLinks()
@@ -182,3 +293,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
        return str_replace('$1', $hashtag, $hashtagLink);
    }
 }
+// old style mock: PHPUnit doesn't allow function mock
+/**
+ * Returns code 200 or html content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_ok($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+/**
+ * Returns code 200 or html content type without charset.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/html'
+ */
+function ut_curl_getinfo_no_charset($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html';
+    }
+}
+/**
+ * Invalid response code.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/html'
+ */
+function ut_curl_getinfo_rc_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/html; charset=utf-8';
+    }
+}
+/**
+ * Invalid content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 200 or 'text/plain'
+ */
+function ut_curl_getinfo_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 200;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
+/**
+ * Invalid response code and content type.
+ *
+ * @param resource $ch   cURL resource
+ * @param int      $type cURL info type
+ *
+ * @return int|string 404 or 'text/plain'
+ */
+function ut_curl_getinfo_rs_ct_ko($ch, $type)
+{
+    switch ($type) {
+        case CURLINFO_RESPONSE_CODE:
+            return 404;
+        case CURLINFO_CONTENT_TYPE:
+            return 'text/plain';
+    }
+}
author	ArthurHoaro <arthur@hoa.ro>	2017-09-30 11:04:13 +0200
committer	ArthurHoaro <arthur@hoa.ro>	2017-10-28 14:35:49 +0200
commit	d65342e304f92643ba922200953cfebc51e1e482 (patch)
tree	3097c77bb4dd0590c4644422b5dc4369a4186eb7
parent	a59bbf50d7530d7e82a91896a210b9da49cb1568 (diff)
download	Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.gz Shaarli-d65342e304f92643ba922200953cfebc51e1e482.tar.zst Shaarli-d65342e304f92643ba922200953cfebc51e1e482.zip

diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 00835966..2edf5ce2 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
3	* GET an HTTP URL to retrieve its content	3	* GET an HTTP URL to retrieve its content
4	* Uses the cURL library or a fallback method	4	* Uses the cURL library or a fallback method
5	*	5	*
6	* @param string $url URL to get (http://...)	6	* @param string $url URL to get (http://...)
7	* @param int $timeout network timeout (in seconds)	7	* @param int $timeout network timeout (in seconds)
8	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)	8	* @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
		9	* @param callable\|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
		10	* Can be used to add download conditions on the headers (response code, content type, etc.).
9	*	11	*
10	* @return array HTTP response headers, downloaded content	12	* @return array HTTP response headers, downloaded content
11	*	13	*
@@ -29,7 +31,7 @@
29	* @see http://stackoverflow.com/q/9183178	31	* @see http://stackoverflow.com/q/9183178
30	* @see http://stackoverflow.com/q/1462720	32	* @see http://stackoverflow.com/q/1462720
31	*/	33	*/
32	function get_http_response($url, $timeout = 30, $maxBytes = 4194304)	34	function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
33	{	35	{
34	$urlObj = new Url($url);	36	$urlObj = new Url($url);
35	$cleanUrl = $urlObj->idnToAscii();	37	$cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
75	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);	77	curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
76	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);	78	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
77		79
		80	if (is_callable($curlWriteFunction)) {
		81	curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
		82	}
		83
78	// Max download size management	84	// Max download size management
79	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);	85	curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024);
80	curl_setopt($ch, CURLOPT_NOPROGRESS, false);	86	curl_setopt($ch, CURLOPT_NOPROGRESS, false);


diff --git a/application/LinkUtils.php b/application/LinkUtils.php index 976474de..c0dd32a6 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1	<?php	1	<?php
2		2
3	/**	3	/**
4	* Extract title from an HTML document.	4	* Get cURL callback function for CURLOPT_WRITEFUNCTION
5	*	5	*
6	* @param string $html HTML content where to look for a title.	6	* @param string $charset to extract from the downloaded page (reference)
		7	* @param string $title to extract from the downloaded page (reference)
		8	* @param string $curlGetInfo Optionally overrides curl_getinfo function
7	*	9	*
8	* @return bool\|string Extracted title if found, false otherwise.	10	* @return Closure
9	*/	11	*/
10	function html_extract_title($html)	12	function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11	{	13	{
12	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {	14	/**
13	return trim(str_replace("\n", '', $matches[1]));	15	* cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14	}	16	*
15	return false;	17	* While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
		18	* Then we extract the title and the charset and stop the download when it's done.
		19	*
		20	* @param resource $ch cURL resource
		21	* @param string $data chunk of data being downloaded
		22	*
		23	* @return int\|bool length of $data or false if we need to stop the download
		24	*/
		25	return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
		26	$responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
		27	if (!empty($responseCode) && $responseCode != 200) {
		28	return false;
		29	}
		30	$contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
		31	if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
		32	return false;
		33	}
		34	if (empty($charset)) {
		35	$charset = header_extract_charset($contentType);
		36	}
		37	if (empty($charset)) {
		38	$charset = html_extract_charset($data);
		39	}
		40	if (empty($title)) {
		41	$title = html_extract_title($data);
		42	}
		43	// We got everything we want, stop the download.
		44	if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
		45	return false;
		46	}
		47
		48	return strlen($data);
		49	};
16	}	50	}
17		51
18	/**	52	/**
19	* Determine charset from downloaded page.	53	* Extract title from an HTML document.
20	* Priority:
21	* 1. HTTP headers (Content type).
22	* 2. HTML content page (tag <meta charset>).
23	* 3. Use a default charset (default: UTF-8).
24	*	54	*
25	* @param array $headers HTTP headers array.	55	* @param string $html HTML content where to look for a title.
26	* @param string $htmlContent HTML content where to look for charset.
27	* @param string $defaultCharset Default charset to apply if other methods failed.
28	*	56	*
29	* @return string Determined charset.	57	* @return bool\|string Extracted title if found, false otherwise.
30	*/	58	*/
31	function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')	59	function html_extract_title($html)
32	{	60	{
33	if ($charset = headers_extract_charset($headers)) {	61	if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34	return $charset;	62	return trim(str_replace("\n", '', $matches[1]));
35	}
36
37	if ($charset = html_extract_charset($htmlContent)) {
38	return $charset;
39	}	63	}
40		64	return false;
41	return $defaultCharset;
42	}	65	}
43		66
44	/**	67	/**
45	* Extract charset from HTTP headers if it's defined.	68	* Extract charset from HTTP header if it's defined.
46	*	69	*
47	* @param array $headers HTTP headers array.	70	* @param string $header HTTP header Content-Type line.
48	*	71	*
49	* @return bool\|string Charset string if found (lowercase), false otherwise.	72	* @return bool\|string Charset string if found (lowercase), false otherwise.
50	*/	73	*/
51	function headers_extract_charset($headers)	74	function header_extract_charset($header)
52	{	75	{
53	if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) {	76	preg_match('/charset="?([^; ]+)/i', $header, $match);
54	preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match);	77	if (! empty($match[1])) {
55	if (! empty($match[1])) {	78	return strtolower(trim($match[1]));
56	return strtolower(trim($match[1]));
57	}
58	}	79	}
59		80
60	return false;	81	return false;


diff --git a/index.php b/index.php index fb00a9fa..ac51038d 100644 --- a/index.php +++ b/index.php
@@ -1428,16 +1428,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history)
1428	// If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)	1428	// If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.)
1429	if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {	1429	if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
1430	// Short timeout to keep the application responsive	1430	// Short timeout to keep the application responsive
1431	list($headers, $content) = get_http_response($url, 4);	1431	// The callback will fill $charset and $title with data from the downloaded page.
1432	if (strpos($headers[0], '200 OK') !== false) {	1432	get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
1433	// Retrieve charset.	1433	if (! empty($title) && strtolower($charset) != 'utf-8') {
1434	$charset = get_charset($headers, $content);	1434	$title = mb_convert_encoding($title, 'utf-8', $charset);
1435	// Extract title.
1436	$title = html_extract_title($content);
1437	// Re-encode title in utf-8 if necessary.
1438	if (! empty($title) && strtolower($charset) != 'utf-8') {
1439	$title = mb_convert_encoding($title, 'utf-8', $charset);
1440	}
1441	}	1435	}
1442	}	1436	}
1443		1437


diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php index 7c0d4b0b..ef650f44 100644 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
29	}	29	}
30		30
31	/**	31	/**
32	* Test get_charset() with all priorities.
33	*/
34	public function testGetCharset()
35	{
36	$headers = array('Content-Type' => 'text/html; charset=Headers');
37	$html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
38	$default = 'default';
39	$this->assertEquals('headers', get_charset($headers, $html, $default));
40	$this->assertEquals('html', get_charset(array(), $html, $default));
41	$this->assertEquals($default, get_charset(array(), '', $default));
42	$this->assertEquals('utf-8', get_charset(array(), ''));
43	}
44
45	/**
46	* Test headers_extract_charset() when the charset is found.	32	* Test headers_extract_charset() when the charset is found.
47	*/	33	*/
48	public function testHeadersExtractExistentCharset()	34	public function testHeadersExtractExistentCharset()
49	{	35	{
50	$charset = 'x-MacCroatian';	36	$charset = 'x-MacCroatian';
51	$headers = array('Content-Type' => 'text/html; charset='. $charset);	37	$headers = 'text/html; charset='. $charset;
52	$this->assertEquals(strtolower($charset), headers_extract_charset($headers));	38	$this->assertEquals(strtolower($charset), header_extract_charset($headers));
53	}	39	}
54		40
55	/**	41	/**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
57	*/	43	*/
58	public function testHeadersExtractNonExistentCharset()	44	public function testHeadersExtractNonExistentCharset()
59	{	45	{
60	$headers = array();	46	$headers = '';
61	$this->assertFalse(headers_extract_charset($headers));	47	$this->assertFalse(header_extract_charset($headers));
62		48
63	$headers = array('Content-Type' => 'text/html');	49	$headers = 'text/html';
64	$this->assertFalse(headers_extract_charset($headers));	50	$this->assertFalse(header_extract_charset($headers));
65	}	51	}
66		52
67	/**	53	/**
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
86	}	72	}
87		73
88	/**	74	/**
		75	* Test the download callback with valid value
		76	*/
		77	public function testCurlDownloadCallbackOk()
		78	{
		79	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
		80	$data = [
		81	'HTTP/1.1 200 OK',
		82	'Server: GitHub.com',
		83	'Date: Sat, 28 Oct 2017 12:01:33 GMT',
		84	'Content-Type: text/html; charset=utf-8',
		85	'Status: 200 OK',
		86	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		87	'<title>ignored</title>',
		88	];
		89	foreach ($data as $key => $line) {
		90	$ignore = null;
		91	$expected = $key !== 'end' ? strlen($line) : false;
		92	$this->assertEquals($expected, $callback($ignore, $line));
		93	if ($expected === false) {
		94	break;
		95	}
		96	}
		97	$this->assertEquals('utf-8', $charset);
		98	$this->assertEquals('Refactoring · GitHub', $title);
		99	}
		100
		101	/**
		102	* Test the download callback with valid values and no charset
		103	*/
		104	public function testCurlDownloadCallbackOkNoCharset()
		105	{
		106	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
		107	$data = [
		108	'HTTP/1.1 200 OK',
		109	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		110	'<title>ignored</title>',
		111	];
		112	foreach ($data as $key => $line) {
		113	$ignore = null;
		114	$this->assertEquals(strlen($line), $callback($ignore, $line));
		115	}
		116	$this->assertEmpty($charset);
		117	$this->assertEquals('Refactoring · GitHub', $title);
		118	}
		119
		120	/**
		121	* Test the download callback with valid values and a charset declared only in the HTML content
		122	*/
		123	public function testCurlDownloadCallbackOkHtmlCharset()
		124	{
		125	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
		126	$data = [
		127	'HTTP/1.1 200 OK',
		128	'<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
		129	'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
		130	'<title>ignored</title>',
		131	];
		132	foreach ($data as $key => $line) {
		133	$ignore = null;
		134	$expected = $key !== 'end' ? strlen($line) : false;
		135	$this->assertEquals($expected, $callback($ignore, $line));
		136	if ($expected === false) {
		137	break;
		138	}
		139	}
		140	$this->assertEquals('utf-8', $charset);
		141	$this->assertEquals('Refactoring · GitHub', $title);
		142	}
		143
		144	/**
		145	* Test the download callback with valid values and no title
		146	*/
		147	public function testCurlDownloadCallbackOkNoTitle()
		148	{
		149	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
		150	$data = [
		151	'HTTP/1.1 200 OK',
		152	'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
		153	'ignored',
		154	];
		155	foreach ($data as $key => $line) {
		156	$ignore = null;
		157	$this->assertEquals(strlen($line), $callback($ignore, $line));
		158	}
		159	$this->assertEquals('utf-8', $charset);
		160	$this->assertEmpty($title);
		161	}
		162
		163	/**
		164	* Test the download callback with an invalid content type.
		165	*/
		166	public function testCurlDownloadCallbackInvalidContentType()
		167	{
		168	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
		169	$ignore = null;
		170	$this->assertFalse($callback($ignore, ''));
		171	$this->assertEmpty($charset);
		172	$this->assertEmpty($title);
		173	}
		174
		175	/**
		176	* Test the download callback with an invalid response code.
		177	*/
		178	public function testCurlDownloadCallbackInvalidResponseCode()
		179	{
		180	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
		181	$ignore = null;
		182	$this->assertFalse($callback($ignore, ''));
		183	$this->assertEmpty($charset);
		184	$this->assertEmpty($title);
		185	}
		186
		187	/**
		188	* Test the download callback with an invalid content type and response code.
		189	*/
		190	public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
		191	{
		192	$callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
		193	$ignore = null;
		194	$this->assertFalse($callback($ignore, ''));
		195	$this->assertEmpty($charset);
		196	$this->assertEmpty($title);
		197	}
		198
		199	/**
89	* Test count_private.	200	* Test count_private.
90	*/	201	*/
91	public function testCountPrivateLinks()	202	public function testCountPrivateLinks()
@@ -182,3 +293,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
182	return str_replace('$1', $hashtag, $hashtagLink);	293	return str_replace('$1', $hashtag, $hashtagLink);
183	}	294	}
184	}	295	}
		296
		297	// old style mock: PHPUnit doesn't allow function mock
		298
		299	/**
		300	* Returns code 200 or html content type.
		301	*
		302	* @param resource $ch cURL resource
		303	* @param int $type cURL info type
		304	*
		305	* @return int\|string 200 or 'text/html'
		306	*/
		307	function ut_curl_getinfo_ok($ch, $type)
		308	{
		309	switch ($type) {
		310	case CURLINFO_RESPONSE_CODE:
		311	return 200;
		312	case CURLINFO_CONTENT_TYPE:
		313	return 'text/html; charset=utf-8';
		314	}
		315	}
		316
		317	/**
		318	* Returns code 200 or html content type without charset.
		319	*
		320	* @param resource $ch cURL resource
		321	* @param int $type cURL info type
		322	*
		323	* @return int\|string 200 or 'text/html'
		324	*/
		325	function ut_curl_getinfo_no_charset($ch, $type)
		326	{
		327	switch ($type) {
		328	case CURLINFO_RESPONSE_CODE:
		329	return 200;
		330	case CURLINFO_CONTENT_TYPE:
		331	return 'text/html';
		332	}
		333	}
		334
		335	/**
		336	* Invalid response code.
		337	*
		338	* @param resource $ch cURL resource
		339	* @param int $type cURL info type
		340	*
		341	* @return int\|string 404 or 'text/html'
		342	*/
		343	function ut_curl_getinfo_rc_ko($ch, $type)
		344	{
		345	switch ($type) {
		346	case CURLINFO_RESPONSE_CODE:
		347	return 404;
		348	case CURLINFO_CONTENT_TYPE:
		349	return 'text/html; charset=utf-8';
		350	}
		351	}
		352
		353	/**
		354	* Invalid content type.
		355	*
		356	* @param resource $ch cURL resource
		357	* @param int $type cURL info type
		358	*
		359	* @return int\|string 200 or 'text/plain'
		360	*/
		361	function ut_curl_getinfo_ct_ko($ch, $type)
		362	{
		363	switch ($type) {
		364	case CURLINFO_RESPONSE_CODE:
		365	return 200;
		366	case CURLINFO_CONTENT_TYPE:
		367	return 'text/plain';
		368	}
		369	}
		370
		371	/**
		372	* Invalid response code and content type.
		373	*
		374	* @param resource $ch cURL resource
		375	* @param int $type cURL info type
		376	*
		377	* @return int\|string 404 or 'text/plain'
		378	*/
		379	function ut_curl_getinfo_rs_ct_ko($ch, $type)
		380	{
		381	switch ($type) {
		382	case CURLINFO_RESPONSE_CODE:
		383	return 404;
		384	case CURLINFO_CONTENT_TYPE:
		385	return 'text/plain';
		386	}
		387	}
		388