aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2018-01-23 18:41:38 +0100
committerGitHub <noreply@github.com>2018-01-23 18:41:38 +0100
commitd449f79a0d7ca808b891baf73b9e25ce7f7e48fe (patch)
tree66e40b38bfce1475b745d6a1227f478f8e99ab75
parent5f8c3f532ed16ad5b789f75e9ff745e5329271c3 (diff)
parentd65342e304f92643ba922200953cfebc51e1e482 (diff)
downloadShaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.gz
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.zst
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.zip
Merge pull request #977 from ArthurHoaro/feature/dl-filter
Extract the title/charset during page download, and check content type
-rw-r--r--application/HttpUtils.php14
-rw-r--r--application/LinkUtils.php89
-rw-r--r--index.php14
-rw-r--r--tests/LinkUtilsTest.php244
4 files changed, 293 insertions, 68 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index c9371b55..83a4c5e2 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
3 * GET an HTTP URL to retrieve its content 3 * GET an HTTP URL to retrieve its content
4 * Uses the cURL library or a fallback method 4 * Uses the cURL library or a fallback method
5 * 5 *
6 * @param string $url URL to get (http://...) 6 * @param string $url URL to get (http://...)
7 * @param int $timeout network timeout (in seconds) 7 * @param int $timeout network timeout (in seconds)
8 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) 8 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
9 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
10 * Can be used to add download conditions on the headers (response code, content type, etc.).
9 * 11 *
10 * @return array HTTP response headers, downloaded content 12 * @return array HTTP response headers, downloaded content
11 * 13 *
@@ -29,7 +31,7 @@
29 * @see http://stackoverflow.com/q/9183178 31 * @see http://stackoverflow.com/q/9183178
30 * @see http://stackoverflow.com/q/1462720 32 * @see http://stackoverflow.com/q/1462720
31 */ 33 */
32function get_http_response($url, $timeout = 30, $maxBytes = 4194304) 34function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
33{ 35{
34 $urlObj = new Url($url); 36 $urlObj = new Url($url);
35 $cleanUrl = $urlObj->idnToAscii(); 37 $cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
75 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); 77 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
76 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); 78 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
77 79
80 if (is_callable($curlWriteFunction)) {
81 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
82 }
83
78 // Max download size management 84 // Max download size management
79 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); 85 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
80 curl_setopt($ch, CURLOPT_NOPROGRESS, false); 86 curl_setopt($ch, CURLOPT_NOPROGRESS, false);
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index e3d95d08..3705f7e9 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1<?php 1<?php
2 2
3/** 3/**
4 * Extract title from an HTML document. 4 * Get cURL callback function for CURLOPT_WRITEFUNCTION
5 * 5 *
6 * @param string $html HTML content where to look for a title. 6 * @param string $charset to extract from the downloaded page (reference)
7 * @param string $title to extract from the downloaded page (reference)
8 * @param string $curlGetInfo Optionally overrides curl_getinfo function
7 * 9 *
8 * @return bool|string Extracted title if found, false otherwise. 10 * @return Closure
9 */ 11 */
10function html_extract_title($html) 12function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11{ 13{
12 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { 14 /**
13 return trim(str_replace("\n", '', $matches[1])); 15 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14 } 16 *
15 return false; 17 * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
18 * Then we extract the title and the charset and stop the download when it's done.
19 *
20 * @param resource $ch cURL resource
21 * @param string $data chunk of data being downloaded
22 *
23 * @return int|bool length of $data or false if we need to stop the download
24 */
25 return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
26 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
27 if (!empty($responseCode) && $responseCode != 200) {
28 return false;
29 }
30 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
31 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
32 return false;
33 }
34 if (empty($charset)) {
35 $charset = header_extract_charset($contentType);
36 }
37 if (empty($charset)) {
38 $charset = html_extract_charset($data);
39 }
40 if (empty($title)) {
41 $title = html_extract_title($data);
42 }
43 // We got everything we want, stop the download.
44 if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
45 return false;
46 }
47
48 return strlen($data);
49 };
16} 50}
17 51
18/** 52/**
19 * Determine charset from downloaded page. 53 * Extract title from an HTML document.
20 * Priority:
21 * 1. HTTP headers (Content type).
22 * 2. HTML content page (tag <meta charset>).
23 * 3. Use a default charset (default: UTF-8).
24 * 54 *
25 * @param array $headers HTTP headers array. 55 * @param string $html HTML content where to look for a title.
26 * @param string $htmlContent HTML content where to look for charset.
27 * @param string $defaultCharset Default charset to apply if other methods failed.
28 * 56 *
29 * @return string Determined charset. 57 * @return bool|string Extracted title if found, false otherwise.
30 */ 58 */
31function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') 59function html_extract_title($html)
32{ 60{
33 if ($charset = headers_extract_charset($headers)) { 61 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34 return $charset; 62 return trim(str_replace("\n", '', $matches[1]));
35 }
36
37 if ($charset = html_extract_charset($htmlContent)) {
38 return $charset;
39 } 63 }
40 64 return false;
41 return $defaultCharset;
42} 65}
43 66
44/** 67/**
45 * Extract charset from HTTP headers if it's defined. 68 * Extract charset from HTTP header if it's defined.
46 * 69 *
47 * @param array $headers HTTP headers array. 70 * @param string $header HTTP header Content-Type line.
48 * 71 *
49 * @return bool|string Charset string if found (lowercase), false otherwise. 72 * @return bool|string Charset string if found (lowercase), false otherwise.
50 */ 73 */
51function headers_extract_charset($headers) 74function header_extract_charset($header)
52{ 75{
53 if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { 76 preg_match('/charset="?([^; ]+)/i', $header, $match);
54 preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); 77 if (! empty($match[1])) {
55 if (! empty($match[1])) { 78 return strtolower(trim($match[1]));
56 return strtolower(trim($match[1]));
57 }
58 } 79 }
59 80
60 return false; 81 return false;
diff --git a/index.php b/index.php
index 27335a36..d57789e6 100644
--- a/index.php
+++ b/index.php
@@ -1425,16 +1425,10 @@ function renderPage($conf, $pluginManager, $LINKSDB, $history, $sessionManager)
1425 // If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.) 1425 // If this is an HTTP(S) link, we try to get the page to extract the title (otherwise we will go straight to the edit form.)
1426 if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) { 1426 if (empty($title) && strpos(get_url_scheme($url), 'http') !== false) {
1427 // Short timeout to keep the application responsive 1427 // Short timeout to keep the application responsive
1428 list($headers, $content) = get_http_response($url, 4); 1428 // The callback will fill $charset and $title with data from the downloaded page.
1429 if (strpos($headers[0], '200 OK') !== false) { 1429 get_http_response($url, 25, 4194304, get_curl_download_callback($charset, $title));
1430 // Retrieve charset. 1430 if (! empty($title) && strtolower($charset) != 'utf-8') {
1431 $charset = get_charset($headers, $content); 1431 $title = mb_convert_encoding($title, 'utf-8', $charset);
1432 // Extract title.
1433 $title = html_extract_title($content);
1434 // Re-encode title in utf-8 if necessary.
1435 if (! empty($title) && strtolower($charset) != 'utf-8') {
1436 $title = mb_convert_encoding($title, 'utf-8', $charset);
1437 }
1438 } 1432 }
1439 } 1433 }
1440 1434
diff --git a/tests/LinkUtilsTest.php b/tests/LinkUtilsTest.php
index 99679320..7fbd59b0 100644
--- a/tests/LinkUtilsTest.php
+++ b/tests/LinkUtilsTest.php
@@ -29,27 +29,13 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
29 } 29 }
30 30
31 /** 31 /**
32 * Test get_charset() with all priorities.
33 */
34 public function testGetCharset()
35 {
36 $headers = array('Content-Type' => 'text/html; charset=Headers');
37 $html = '<html><meta>stuff</meta><meta charset="Html"/></html>';
38 $default = 'default';
39 $this->assertEquals('headers', get_charset($headers, $html, $default));
40 $this->assertEquals('html', get_charset(array(), $html, $default));
41 $this->assertEquals($default, get_charset(array(), '', $default));
42 $this->assertEquals('utf-8', get_charset(array(), ''));
43 }
44
45 /**
46 * Test headers_extract_charset() when the charset is found. 32 * Test headers_extract_charset() when the charset is found.
47 */ 33 */
48 public function testHeadersExtractExistentCharset() 34 public function testHeadersExtractExistentCharset()
49 { 35 {
50 $charset = 'x-MacCroatian'; 36 $charset = 'x-MacCroatian';
51 $headers = array('Content-Type' => 'text/html; charset='. $charset); 37 $headers = 'text/html; charset='. $charset;
52 $this->assertEquals(strtolower($charset), headers_extract_charset($headers)); 38 $this->assertEquals(strtolower($charset), header_extract_charset($headers));
53 } 39 }
54 40
55 /** 41 /**
@@ -57,11 +43,11 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
57 */ 43 */
58 public function testHeadersExtractNonExistentCharset() 44 public function testHeadersExtractNonExistentCharset()
59 { 45 {
60 $headers = array(); 46 $headers = '';
61 $this->assertFalse(headers_extract_charset($headers)); 47 $this->assertFalse(header_extract_charset($headers));
62 48
63 $headers = array('Content-Type' => 'text/html'); 49 $headers = 'text/html';
64 $this->assertFalse(headers_extract_charset($headers)); 50 $this->assertFalse(header_extract_charset($headers));
65 } 51 }
66 52
67 /** 53 /**
@@ -86,6 +72,131 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
86 } 72 }
87 73
88 /** 74 /**
75 * Test the download callback with valid value
76 */
77 public function testCurlDownloadCallbackOk()
78 {
79 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
80 $data = [
81 'HTTP/1.1 200 OK',
82 'Server: GitHub.com',
83 'Date: Sat, 28 Oct 2017 12:01:33 GMT',
84 'Content-Type: text/html; charset=utf-8',
85 'Status: 200 OK',
86 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
87 '<title>ignored</title>',
88 ];
89 foreach ($data as $key => $line) {
90 $ignore = null;
91 $expected = $key !== 'end' ? strlen($line) : false;
92 $this->assertEquals($expected, $callback($ignore, $line));
93 if ($expected === false) {
94 break;
95 }
96 }
97 $this->assertEquals('utf-8', $charset);
98 $this->assertEquals('Refactoring · GitHub', $title);
99 }
100
101 /**
102 * Test the download callback with valid values and no charset
103 */
104 public function testCurlDownloadCallbackOkNoCharset()
105 {
106 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
107 $data = [
108 'HTTP/1.1 200 OK',
109 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
110 '<title>ignored</title>',
111 ];
112 foreach ($data as $key => $line) {
113 $ignore = null;
114 $this->assertEquals(strlen($line), $callback($ignore, $line));
115 }
116 $this->assertEmpty($charset);
117 $this->assertEquals('Refactoring · GitHub', $title);
118 }
119
120 /**
121 * Test the download callback with valid values and a charset declared in the HTML content
122 */
123 public function testCurlDownloadCallbackOkHtmlCharset()
124 {
125 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_no_charset');
126 $data = [
127 'HTTP/1.1 200 OK',
128 '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
129 'end' => 'th=device-width"><title>Refactoring · GitHub</title><link rel="search" type="application/opensea',
130 '<title>ignored</title>',
131 ];
132 foreach ($data as $key => $line) {
133 $ignore = null;
134 $expected = $key !== 'end' ? strlen($line) : false;
135 $this->assertEquals($expected, $callback($ignore, $line));
136 if ($expected === false) {
137 break;
138 }
139 }
140 $this->assertEquals('utf-8', $charset);
141 $this->assertEquals('Refactoring · GitHub', $title);
142 }
143
144 /**
145 * Test the download callback with valid values and no title
146 */
147 public function testCurlDownloadCallbackOkNoTitle()
148 {
149 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ok');
150 $data = [
151 'HTTP/1.1 200 OK',
152 'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
153 'ignored',
154 ];
155 foreach ($data as $key => $line) {
156 $ignore = null;
157 $this->assertEquals(strlen($line), $callback($ignore, $line));
158 }
159 $this->assertEquals('utf-8', $charset);
160 $this->assertEmpty($title);
161 }
162
163 /**
164 * Test the download callback with an invalid content type.
165 */
166 public function testCurlDownloadCallbackInvalidContentType()
167 {
168 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_ct_ko');
169 $ignore = null;
170 $this->assertFalse($callback($ignore, ''));
171 $this->assertEmpty($charset);
172 $this->assertEmpty($title);
173 }
174
175 /**
176 * Test the download callback with an invalid response code.
177 */
178 public function testCurlDownloadCallbackInvalidResponseCode()
179 {
180 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rc_ko');
181 $ignore = null;
182 $this->assertFalse($callback($ignore, ''));
183 $this->assertEmpty($charset);
184 $this->assertEmpty($title);
185 }
186
187 /**
188 * Test the download callback with an invalid content type and response code.
189 */
190 public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
191 {
192 $callback = get_curl_download_callback($charset, $title, 'ut_curl_getinfo_rs_ct_ko');
193 $ignore = null;
194 $this->assertFalse($callback($ignore, ''));
195 $this->assertEmpty($charset);
196 $this->assertEmpty($title);
197 }
198
199 /**
89 * Test count_private. 200 * Test count_private.
90 */ 201 */
91 public function testCountPrivateLinks() 202 public function testCountPrivateLinks()
@@ -207,3 +318,96 @@ class LinkUtilsTest extends PHPUnit_Framework_TestCase
207 return str_replace('$1', $hashtag, $hashtagLink); 318 return str_replace('$1', $hashtag, $hashtagLink);
208 } 319 }
209} 320}
321
322// old style mock: PHPUnit doesn't allow function mock
323
324/**
325 * Returns code 200 or html content type.
326 *
327 * @param resource $ch cURL resource
328 * @param int $type cURL info type
329 *
330 * @return int|string 200 or 'text/html'
331 */
332function ut_curl_getinfo_ok($ch, $type)
333{
334 switch ($type) {
335 case CURLINFO_RESPONSE_CODE:
336 return 200;
337 case CURLINFO_CONTENT_TYPE:
338 return 'text/html; charset=utf-8';
339 }
340}
341
342/**
343 * Returns code 200 or html content type without charset.
344 *
345 * @param resource $ch cURL resource
346 * @param int $type cURL info type
347 *
348 * @return int|string 200 or 'text/html'
349 */
350function ut_curl_getinfo_no_charset($ch, $type)
351{
352 switch ($type) {
353 case CURLINFO_RESPONSE_CODE:
354 return 200;
355 case CURLINFO_CONTENT_TYPE:
356 return 'text/html';
357 }
358}
359
360/**
361 * Invalid response code.
362 *
363 * @param resource $ch cURL resource
364 * @param int $type cURL info type
365 *
366 * @return int|string 404 or 'text/html'
367 */
368function ut_curl_getinfo_rc_ko($ch, $type)
369{
370 switch ($type) {
371 case CURLINFO_RESPONSE_CODE:
372 return 404;
373 case CURLINFO_CONTENT_TYPE:
374 return 'text/html; charset=utf-8';
375 }
376}
377
378/**
379 * Invalid content type.
380 *
381 * @param resource $ch cURL resource
382 * @param int $type cURL info type
383 *
384 * @return int|string 200 or 'text/plain'
385 */
386function ut_curl_getinfo_ct_ko($ch, $type)
387{
388 switch ($type) {
389 case CURLINFO_RESPONSE_CODE:
390 return 200;
391 case CURLINFO_CONTENT_TYPE:
392 return 'text/plain';
393 }
394}
395
396/**
397 * Invalid response code and content type.
398 *
399 * @param resource $ch cURL resource
400 * @param int $type cURL info type
401 *
402 * @return int|string 404 or 'text/plain'
403 */
404function ut_curl_getinfo_rs_ct_ko($ch, $type)
405{
406 switch ($type) {
407 case CURLINFO_RESPONSE_CODE:
408 return 404;
409 case CURLINFO_CONTENT_TYPE:
410 return 'text/plain';
411 }
412}
413