diff options
author | ArthurHoaro <arthur@hoa.ro> | 2018-01-23 18:41:38 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-23 18:41:38 +0100 |
commit | d449f79a0d7ca808b891baf73b9e25ce7f7e48fe (patch) | |
tree | 66e40b38bfce1475b745d6a1227f478f8e99ab75 | |
parent | 5f8c3f532ed16ad5b789f75e9ff745e5329271c3 (diff) | |
parent | d65342e304f92643ba922200953cfebc51e1e482 (diff) | |
download | Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.gz Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.zst Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.zip |
Merge pull request #977 from ArthurHoaro/feature/dl-filter
Extract the title/charset during page download, and check content type
-rw-r--r-- | application/HttpUtils.php | 14 | ||||
-rw-r--r-- | application/LinkUtils.php | 89 | ||||
-rw-r--r-- | index.php | 14 | ||||
-rw-r--r-- | tests/LinkUtilsTest.php | 244 |
4 files changed, 293 insertions, 68 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index c9371b55..83a4c5e2 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -3,9 +3,11 @@ | |||
3 | * GET an HTTP URL to retrieve its content | 3 | * GET an HTTP URL to retrieve its content |
4 | * Uses the cURL library or a fallback method | 4 | * Uses the cURL library or a fallback method |
5 | * | 5 | * |
6 | * @param string $url URL to get (http://...) | 6 | * @param string $url URL to get (http://...) |
7 | * @param int $timeout network timeout (in seconds) | 7 | * @param int $timeout network timeout (in seconds) |
8 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) | 8 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) |
9 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). | ||
10 | * Can be used to add download conditions on the headers (response code, content type, etc.). | ||
9 | * | 11 | * |
10 | * @return array HTTP response headers, downloaded content | 12 | * @return array HTTP response headers, downloaded content |
11 | * | 13 | * |
@@ -29,7 +31,7 @@ | |||
29 | * @see http://stackoverflow.com/q/9183178 | 31 | * @see http://stackoverflow.com/q/9183178 |
30 | * @see http://stackoverflow.com/q/1462720 | 32 | * @see http://stackoverflow.com/q/1462720 |
31 | */ | 33 | */ |
32 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | 34 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) |
33 | { | 35 | { |
34 | $urlObj = new Url($url); | 36 | $urlObj = new Url($url); |
35 | $cleanUrl = $urlObj->idnToAscii(); | 37 | $cleanUrl = $urlObj->idnToAscii(); |
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | |||
75 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | 77 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); |
76 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | 78 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); |
77 | 79 | ||
80 | if (is_callable($curlWriteFunction)) { | ||
81 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
82 | } | ||
83 | |||
78 | // Max download size management | 84 | // Max download size management |
79 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); | 85 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); |
80 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | 86 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); |
diff --git a/application/LinkUtils.php b/application/LinkUtils.php index e3d95d08..3705f7e9 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php | |||
@@ -1,60 +1,81 @@ | |||
1 | <?php | 1 | <?php |
2 | 2 | ||
3 | /** | 3 | /** |
4 | * Extract title from an HTML document. | 4 | * Get cURL callback function for CURLOPT_WRITEFUNCTION |
5 | * | 5 | * |
6 | * @param string $html HTML content where to look for a title. | 6 | * @param string $charset to extract from the downloaded page (reference) |
7 | * @param string $title to extract from the downloaded page (reference) | ||
8 | * @param string $curlGetInfo Optionnaly overrides curl_getinfo function | ||
7 | * | 9 | * |
8 | * @return bool|string Extracted title if found, false otherwise. | 10 | * @return Closure |
9 | */ | 11 | */ |
10 | function html_extract_title($html) | 12 | function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') |
11 | { | 13 | { |
12 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { | 14 | /** |
13 | return trim(str_replace("\n", '', $matches[1])); | 15 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). |
14 | } | 16 | * |
15 | return false; | 17 | * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text' |
18 | * Then we extract the title and the charset and stop the download when it's done. | ||
19 | * | ||
20 | * @param resource $ch cURL resource | ||
21 | * @param string $data chunk of data being downloaded | ||
22 | * | ||
23 | * @return int|bool length of $data or false if we need to stop the download | ||
24 | */ | ||
25 | return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) { | ||
26 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
27 | if (!empty($responseCode) && $responseCode != 200) { | ||
28 | return false; | ||
29 | } | ||
30 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
31 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
32 | return false; | ||
33 | } | ||
34 | if (empty($charset)) { | ||
35 | $charset = header_extract_charset($contentType); | ||
36 | } | ||
37 | if (empty($charset)) { | ||
38 | $charset = html_extract_charset($data); | ||
39 | } | ||
40 | if (empty($title)) { | ||
41 | $title = html_extract_title($data); | ||
42 | } | ||
43 | // We got everything we want, stop the download. | ||
44 | if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { | ||
45 | return false; | ||
46 | } | ||
47 | |||
48 | return strlen($data); | ||
49 | }; | ||
16 | } | 50 | } |
17 | 51 | ||
18 | /** | 52 | /** |
19 | * Determine charset from downloaded page. | 53 | * Extract title from an HTML document. |
20 | * Priority: | ||
21 | * 1. HTTP headers (Content type). | ||
22 | * 2. HTML content page (tag <meta charset>). | ||
23 | * 3. Use a default charset (default: UTF-8). | ||
24 | * | 54 | * |
25 | * @param array $headers HTTP headers array. | 55 | * @param string $html HTML content where to look for a title. |
26 | * @param string $htmlContent HTML content where to look for charset. | ||
27 | * @param string $defaultCharset Default charset to apply if other methods failed. | ||
28 | * | 56 | * |
29 | * @return string Determined charset. | 57 | * @return bool|string Extracted title if found, false otherwise. |
30 | */ | 58 | */ |
31 | function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') | 59 | function html_extract_title($html) |
32 | { | 60 | { |
33 | if ($charset = headers_extract_charset($headers)) { | 61 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { |
34 | return $charset; | 62 | return trim(str_replace("\n", '', $matches[1])); |
35 | } | ||
36 | |||
37 | if ($charset = html_extract_charset($htmlContent)) { | ||
38 | return $charset; | ||
39 | } | 63 | } |
40 | 64 | return false; | |
41 | return $defaultCharset; | ||
42 | } | 65 | } |
43 | 66 | ||
44 | /** | 67 | /** |
45 | * Extract charset from HTTP headers if it's defined. | 68 | * Extract charset from HTTP header if it's defined. |
46 | * | 69 | * |
47 | * @param array $headers HTTP headers array. | 70 | * @param string $header HTTP header Content-Type line. |
48 | * | 71 | * |
49 | * @return bool|string Charset string if found (lowercase), false otherwise. | 72 | * @return bool|string Charset string if found (lowercase), false otherwise. |
50 | */ | 73 | */ |
51 | function headers_extract_charset($headers) | 74 | function header_extract_charset($header) |
52 | { | 75 | { |
53 | if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { | 76 | preg_match('/charset="?([^; ]+)/i', $header, $match); |
54 | preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); | 77 | if (! empty($match[1])) { |
55 | if (! empty($match[1])) { | 78 | return strtolower(trim($match[1])); |
56 | return strtolower(trim($match[1])); | ||
57 | } | ||
58 | } | 79 | } |
59 | 80 | ||
60 | return false; | 81 | return false; |
@@ -1425,16 +1425,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager) | |||
1425 | // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) | 1425 | // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) |
1426 | if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { | 1426 | if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { |
1427 | // Short timeout to keep the application responsive | 1427 | // Short timeout to keep the application responsive |
1428 | list($headers, $content) = get_http_response($url, 4); | 1428 | // The callback will fill $charset and $title with data from the downloaded page. |
1429 | if (strpos($headers[0], '200 OK') !== false) { | 1429 | get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title)); |
1430 | // Retrieve charset. | 1430 | if (! empty($title) && strtolower($charset) != 'utf-8') { |
1431 | $charset = get_charset($headers, $content); | 1431 | $title = mb_convert_encoding($title, 'utf-8', $charset); |
1432 | // Extract title. | ||
1433 | $title = html_extract_title($content); | ||
1434 | // Re-encode title in utf-8 if necessary. | ||
1435 | if (! empty($title) && strtolower($charset) != 'utf-8') { | ||
1436 | $title = mb_convert_encoding($title, 'utf-8', $charset); | ||
1437 | } | ||
1438 | } | 1432 | } |
1439 | } | 1433 | } |
1440 | 1434 | ||
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php index 99679320..7fbd59b0 100644 --- a/tests/LinkUtilsTest.php +++ b/tests/LinkUtilsTest.php | |||
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase | |||
29 | } | 29 | } |
30 | 30 | ||
31 | /** | 31 | /** |
32 | * Test get_charset() with all priorities. | ||
33 | */ | ||
34 | public function testGetCharset() | ||
35 | { | ||
36 | $headers = array('Content-Type' => 'text/html; charset=Headers'); | ||
37 | $html = '<html><meta>stuff</meta><meta charset="Html"/></html>'; | ||
38 | $default = 'default'; | ||
39 | $this->assertEquals('headers', get_charset($headers, $html, $default)); | ||
40 | $this->assertEquals('html', get_charset(array(), $html, $default)); | ||
41 | $this->assertEquals($default, get_charset(array(), '', $default)); | ||
42 | $this->assertEquals('utf-8', get_charset(array(), '')); | ||
43 | } | ||
44 | |||
45 | /** | ||
46 | * Test headers_extract_charset() when the charset is found. | 32 | * Test headers_extract_charset() when the charset is found. |
47 | */ | 33 | */ |
48 | public function testHeadersExtractExistentCharset() | 34 | public function testHeadersExtractExistentCharset() |
49 | { | 35 | { |
50 | $charset = 'x-MacCroatian'; | 36 | $charset = 'x-MacCroatian'; |
51 | $headers = array('Content-Type' => 'text/html; charset='. $charset); | 37 | $headers = 'text/html; charset='. $charset; |
52 | $this->assertEquals(strtolower($charset), headers_extract_charset($headers)); | 38 | $this->assertEquals(strtolower($charset), header_extract_charset($headers)); |
53 | } | 39 | } |
54 | 40 | ||
55 | /** | 41 | /** |
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase | |||
57 | */ | 43 | */ |
58 | public function testHeadersExtractNonExistentCharset() | 44 | public function testHeadersExtractNonExistentCharset() |
59 | { | 45 | { |
60 | $headers = array(); | 46 | $headers = ''; |
61 | $this->assertFalse(headers_extract_charset($headers)); | 47 | $this->assertFalse(header_extract_charset($headers)); |
62 | 48 | ||
63 | $headers = array('Content-Type' => 'text/html'); | 49 | $headers = 'text/html'; |
64 | $this->assertFalse(headers_extract_charset($headers)); | 50 | $this->assertFalse(header_extract_charset($headers)); |
65 | } | 51 | } |
66 | 52 | ||
67 | /** | 53 | /** |
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase | |||
86 | } | 72 | } |
87 | 73 | ||
88 | /** | 74 | /** |
75 | * Test the download callback with valid value | ||
76 | */ | ||
77 | public function testCurlDownloadCallbackOk() | ||
78 | { | ||
79 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok'); | ||
80 | $data = [ | ||
81 | 'HTTP/1.1 200 OK', | ||
82 | 'Server: GitHub.com', | ||
83 | 'Date: Sat, 28 Oct 2017 12:01:33 GMT', | ||
84 | 'Content-Type: text/html; charset=utf-8', | ||
85 | 'Status: 200 OK', | ||
86 | 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea', | ||
87 | '<title>ignored</title>', | ||
88 | ]; | ||
89 | foreach ($data as $key => $line) { | ||
90 | $ignore = null; | ||
91 | $expected = $key !== 'end' ? strlen($line) : false; | ||
92 | $this->assertEquals($expected, $callback($ignore, $line)); | ||
93 | if ($expected === false) { | ||
94 | break; | ||
95 | } | ||
96 | } | ||
97 | $this->assertEquals('utf-8', $charset); | ||
98 | $this->assertEquals('Refactoring · GitHub', $title); | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * Test the download callback with valid values and no charset | ||
103 | */ | ||
104 | public function testCurlDownloadCallbackOkNoCharset() | ||
105 | { | ||
106 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset'); | ||
107 | $data = [ | ||
108 | 'HTTP/1.1 200 OK', | ||
109 | 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea', | ||
110 | '<title>ignored</title>', | ||
111 | ]; | ||
112 | foreach ($data as $key => $line) { | ||
113 | $ignore = null; | ||
114 | $this->assertEquals(strlen($line), $callback($ignore, $line)); | ||
115 | } | ||
116 | $this->assertEmpty($charset); | ||
117 | $this->assertEquals('Refactoring · GitHub', $title); | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * Test the download callback with valid values and no charset | ||
122 | */ | ||
123 | public function testCurlDownloadCallbackOkHtmlCharset() | ||
124 | { | ||
125 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset'); | ||
126 | $data = [ | ||
127 | 'HTTP/1.1 200 OK', | ||
128 | '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />', | ||
129 | 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea', | ||
130 | '<title>ignored</title>', | ||
131 | ]; | ||
132 | foreach ($data as $key => $line) { | ||
133 | $ignore = null; | ||
134 | $expected = $key !== 'end' ? strlen($line) : false; | ||
135 | $this->assertEquals($expected, $callback($ignore, $line)); | ||
136 | if ($expected === false) { | ||
137 | break; | ||
138 | } | ||
139 | } | ||
140 | $this->assertEquals('utf-8', $charset); | ||
141 | $this->assertEquals('Refactoring · GitHub', $title); | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * Test the download callback with valid values and no title | ||
146 | */ | ||
147 | public function testCurlDownloadCallbackOkNoTitle() | ||
148 | { | ||
149 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok'); | ||
150 | $data = [ | ||
151 | 'HTTP/1.1 200 OK', | ||
152 | 'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea', | ||
153 | 'ignored', | ||
154 | ]; | ||
155 | foreach ($data as $key => $line) { | ||
156 | $ignore = null; | ||
157 | $this->assertEquals(strlen($line), $callback($ignore, $line)); | ||
158 | } | ||
159 | $this->assertEquals('utf-8', $charset); | ||
160 | $this->assertEmpty($title); | ||
161 | } | ||
162 | |||
163 | /** | ||
164 | * Test the download callback with an invalid content type. | ||
165 | */ | ||
166 | public function testCurlDownloadCallbackInvalidContentType() | ||
167 | { | ||
168 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko'); | ||
169 | $ignore = null; | ||
170 | $this->assertFalse($callback($ignore, '')); | ||
171 | $this->assertEmpty($charset); | ||
172 | $this->assertEmpty($title); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * Test the download callback with an invalid response code. | ||
177 | */ | ||
178 | public function testCurlDownloadCallbackInvalidResponseCode() | ||
179 | { | ||
180 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko'); | ||
181 | $ignore = null; | ||
182 | $this->assertFalse($callback($ignore, '')); | ||
183 | $this->assertEmpty($charset); | ||
184 | $this->assertEmpty($title); | ||
185 | } | ||
186 | |||
187 | /** | ||
188 | * Test the download callback with an invalid content type and response code. | ||
189 | */ | ||
190 | public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode() | ||
191 | { | ||
192 | $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko'); | ||
193 | $ignore = null; | ||
194 | $this->assertFalse($callback($ignore, '')); | ||
195 | $this->assertEmpty($charset); | ||
196 | $this->assertEmpty($title); | ||
197 | } | ||
198 | |||
199 | /** | ||
89 | * Test count_private. | 200 | * Test count_private. |
90 | */ | 201 | */ |
91 | public function testCountPrivateLinks() | 202 | public function testCountPrivateLinks() |
@@ -207,3 +318,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase | |||
207 | return str_replace('$1', $hashtag, $hashtagLink); | 318 | return str_replace('$1', $hashtag, $hashtagLink); |
208 | } | 319 | } |
209 | } | 320 | } |
321 | |||
322 | // old style mock: PHPUnit doesn't allow function mock | ||
323 | |||
324 | /** | ||
325 | * Returns code 200 or html content type. | ||
326 | * | ||
327 | * @param resource $ch cURL resource | ||
328 | * @param int $type cURL info type | ||
329 | * | ||
330 | * @return int|string 200 or 'text/html' | ||
331 | */ | ||
332 | function ut_curl_getinfo_ok($ch, $type) | ||
333 | { | ||
334 | switch ($type) { | ||
335 | case CURLINFO_RESPONSE_CODE: | ||
336 | return 200; | ||
337 | case CURLINFO_CONTENT_TYPE: | ||
338 | return 'text/html; charset=utf-8'; | ||
339 | } | ||
340 | } | ||
341 | |||
342 | /** | ||
343 | * Returns code 200 or html content type without charset. | ||
344 | * | ||
345 | * @param resource $ch cURL resource | ||
346 | * @param int $type cURL info type | ||
347 | * | ||
348 | * @return int|string 200 or 'text/html' | ||
349 | */ | ||
350 | function ut_curl_getinfo_no_charset($ch, $type) | ||
351 | { | ||
352 | switch ($type) { | ||
353 | case CURLINFO_RESPONSE_CODE: | ||
354 | return 200; | ||
355 | case CURLINFO_CONTENT_TYPE: | ||
356 | return 'text/html'; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | /** | ||
361 | * Invalid response code. | ||
362 | * | ||
363 | * @param resource $ch cURL resource | ||
364 | * @param int $type cURL info type | ||
365 | * | ||
366 | * @return int|string 404 or 'text/html' | ||
367 | */ | ||
368 | function ut_curl_getinfo_rc_ko($ch, $type) | ||
369 | { | ||
370 | switch ($type) { | ||
371 | case CURLINFO_RESPONSE_CODE: | ||
372 | return 404; | ||
373 | case CURLINFO_CONTENT_TYPE: | ||
374 | return 'text/html; charset=utf-8'; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | /** | ||
379 | * Invalid content type. | ||
380 | * | ||
381 | * @param resource $ch cURL resource | ||
382 | * @param int $type cURL info type | ||
383 | * | ||
384 | * @return int|string 200 or 'text/plain' | ||
385 | */ | ||
386 | function ut_curl_getinfo_ct_ko($ch, $type) | ||
387 | { | ||
388 | switch ($type) { | ||
389 | case CURLINFO_RESPONSE_CODE: | ||
390 | return 200; | ||
391 | case CURLINFO_CONTENT_TYPE: | ||
392 | return 'text/plain'; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | /** | ||
397 | * Invalid response code and content type. | ||
398 | * | ||
399 | * @param resource $ch cURL resource | ||
400 | * @param int $type cURL info type | ||
401 | * | ||
402 | * @return int|string 404 or 'text/plain' | ||
403 | */ | ||
404 | function ut_curl_getinfo_rs_ct_ko($ch, $type) | ||
405 | { | ||
406 | switch ($type) { | ||
407 | case CURLINFO_RESPONSE_CODE: | ||
408 | return 404; | ||
409 | case CURLINFO_CONTENT_TYPE: | ||
410 | return 'text/plain'; | ||
411 | } | ||
412 | } | ||
413 | |||