diff options
Diffstat (limited to 'application/http')
-rw-r--r-- | application/http/HttpAccess.php | 20 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 198 | ||||
-rw-r--r-- | application/http/MetadataRetriever.php | 69 | ||||
-rw-r--r-- | application/http/Url.php | 10 | ||||
-rw-r--r-- | application/http/UrlUtils.php | 11 |
5 files changed, 214 insertions, 94 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..e80e0c01 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php | |||
@@ -14,9 +14,14 @@ namespace Shaarli\Http; | |||
14 | */ | 14 | */ |
15 | class HttpAccess | 15 | class HttpAccess |
16 | { | 16 | { |
17 | public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 17 | public function getHttpResponse( |
18 | { | 18 | $url, |
19 | return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); | 19 | $timeout = 30, |
20 | $maxBytes = 4194304, | ||
21 | $curlHeaderFunction = null, | ||
22 | $curlWriteFunction = null | ||
23 | ) { | ||
24 | return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); | ||
20 | } | 25 | } |
21 | 26 | ||
22 | public function getCurlDownloadCallback( | 27 | public function getCurlDownloadCallback( |
@@ -25,7 +30,7 @@ class HttpAccess | |||
25 | &$description, | 30 | &$description, |
26 | &$keywords, | 31 | &$keywords, |
27 | $retrieveDescription, | 32 | $retrieveDescription, |
28 | $curlGetInfo = 'curl_getinfo' | 33 | $tagsSeparator |
29 | ) { | 34 | ) { |
30 | return get_curl_download_callback( | 35 | return get_curl_download_callback( |
31 | $charset, | 36 | $charset, |
@@ -33,7 +38,12 @@ class HttpAccess | |||
33 | $description, | 38 | $description, |
34 | $keywords, | 39 | $keywords, |
35 | $retrieveDescription, | 40 | $retrieveDescription, |
36 | $curlGetInfo | 41 | $tagsSeparator |
37 | ); | 42 | ); |
38 | } | 43 | } |
44 | |||
45 | public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') | ||
46 | { | ||
47 | return get_curl_header_callback($charset, $curlGetInfo); | ||
48 | } | ||
39 | } | 49 | } |
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..4bde1d5b 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php | |||
@@ -6,12 +6,14 @@ use Shaarli\Http\Url; | |||
6 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
7 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
8 | * | 8 | * |
9 | * @param string $url URL to get (http://...) | 9 | * @param string $url URL to get (http://...) |
10 | * @param int $timeout network timeout (in seconds) | 10 | * @param int $timeout network timeout (in seconds) |
11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) | 11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) |
12 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). | 12 | * @param callable|string $curlHeaderFunction Optional callback called during the download of headers |
13 | * Can be used to add download conditions on the | 13 | * (CURLOPT_HEADERFUNCTION) |
14 | * headers (response code, content type, etc.). | 14 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). |
15 | * Can be used to add download conditions on the | ||
16 | * headers (response code, content type, etc.). | ||
15 | * | 17 | * |
16 | * @return array HTTP response headers, downloaded content | 18 | * @return array HTTP response headers, downloaded content |
17 | * | 19 | * |
@@ -35,13 +37,18 @@ use Shaarli\Http\Url; | |||
35 | * @see http://stackoverflow.com/q/9183178 | 37 | * @see http://stackoverflow.com/q/9183178 |
36 | * @see http://stackoverflow.com/q/1462720 | 38 | * @see http://stackoverflow.com/q/1462720 |
37 | */ | 39 | */ |
38 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 40 | function get_http_response( |
39 | { | 41 | $url, |
42 | $timeout = 30, | ||
43 | $maxBytes = 4194304, | ||
44 | $curlHeaderFunction = null, | ||
45 | $curlWriteFunction = null | ||
46 | ) { | ||
40 | $urlObj = new Url($url); | 47 | $urlObj = new Url($url); |
41 | $cleanUrl = $urlObj->idnToAscii(); | 48 | $cleanUrl = $urlObj->idnToAscii(); |
42 | 49 | ||
43 | if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { | 50 | if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { |
44 | return array(array(0 => 'Invalid HTTP UrlUtils'), false); | 51 | return [[0 => 'Invalid HTTP UrlUtils'], false]; |
45 | } | 52 | } |
46 | 53 | ||
47 | $userAgent = | 54 | $userAgent = |
@@ -64,42 +71,39 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
64 | 71 | ||
65 | $ch = curl_init($cleanUrl); | 72 | $ch = curl_init($cleanUrl); |
66 | if ($ch === false) { | 73 | if ($ch === false) { |
67 | return array(array(0 => 'curl_init() error'), false); | 74 | return [[0 => 'curl_init() error'], false]; |
68 | } | 75 | } |
69 | 76 | ||
70 | // General cURL settings | 77 | // General cURL settings |
71 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); | 78 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); |
72 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); | 79 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); |
73 | curl_setopt($ch, CURLOPT_HEADER, true); | 80 | // Default header download if the $curlHeaderFunction is not defined |
81 | curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); | ||
74 | curl_setopt( | 82 | curl_setopt( |
75 | $ch, | 83 | $ch, |
76 | CURLOPT_HTTPHEADER, | 84 | CURLOPT_HTTPHEADER, |
77 | array('Accept-Language: ' . $acceptLanguage) | 85 | ['Accept-Language: ' . $acceptLanguage] |
78 | ); | 86 | ); |
79 | curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); | 87 | curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); |
80 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); | 88 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); |
81 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | 89 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); |
82 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | 90 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); |
83 | 91 | ||
92 | // Max download size management | ||
93 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16); | ||
94 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | ||
95 | if (is_callable($curlHeaderFunction)) { | ||
96 | curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); | ||
97 | } | ||
84 | if (is_callable($curlWriteFunction)) { | 98 | if (is_callable($curlWriteFunction)) { |
85 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | 99 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); |
86 | } | 100 | } |
87 | |||
88 | // Max download size management | ||
89 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); | ||
90 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | ||
91 | curl_setopt( | 101 | curl_setopt( |
92 | $ch, | 102 | $ch, |
93 | CURLOPT_PROGRESSFUNCTION, | 103 | CURLOPT_PROGRESSFUNCTION, |
94 | function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { | 104 | function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { |
95 | if (version_compare(phpversion(), '5.5', '<')) { | 105 | $downloaded = $arg2; |
96 | // PHP version lower than 5.5 | 106 | |
97 | // Callback has 4 arguments | ||
98 | $downloaded = $arg1; | ||
99 | } else { | ||
100 | // Callback has 5 arguments | ||
101 | $downloaded = $arg2; | ||
102 | } | ||
103 | // Non-zero return stops downloading | 107 | // Non-zero return stops downloading |
104 | return ($downloaded > $maxBytes) ? 1 : 0; | 108 | return ($downloaded > $maxBytes) ? 1 : 0; |
105 | } | 109 | } |
@@ -118,9 +122,9 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
118 | * Removing this would require updating | 122 | * Removing this would require updating |
119 | * GetHttpUrlTest::testGetInvalidRemoteUrl() | 123 | * GetHttpUrlTest::testGetInvalidRemoteUrl() |
120 | */ | 124 | */ |
121 | return array(false, false); | 125 | return [false, false]; |
122 | } | 126 | } |
123 | return array(array(0 => 'curl_exec() error: ' . $errorStr), false); | 127 | return [[0 => 'curl_exec() error: ' . $errorStr], false]; |
124 | } | 128 | } |
125 | 129 | ||
126 | // Formatting output like the fallback method | 130 | // Formatting output like the fallback method |
@@ -131,7 +135,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
131 | $rawHeadersLastRedir = end($rawHeadersArrayRedirs); | 135 | $rawHeadersLastRedir = end($rawHeadersArrayRedirs); |
132 | 136 | ||
133 | $content = substr($response, $headSize); | 137 | $content = substr($response, $headSize); |
134 | $headers = array(); | 138 | $headers = []; |
135 | foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { | 139 | foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { |
136 | if (empty($line) || ctype_space($line)) { | 140 | if (empty($line) || ctype_space($line)) { |
137 | continue; | 141 | continue; |
@@ -142,7 +146,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
142 | $value = $splitLine[1]; | 146 | $value = $splitLine[1]; |
143 | if (array_key_exists($key, $headers)) { | 147 | if (array_key_exists($key, $headers)) { |
144 | if (!is_array($headers[$key])) { | 148 | if (!is_array($headers[$key])) { |
145 | $headers[$key] = array(0 => $headers[$key]); | 149 | $headers[$key] = [0 => $headers[$key]]; |
146 | } | 150 | } |
147 | $headers[$key][] = $value; | 151 | $headers[$key][] = $value; |
148 | } else { | 152 | } else { |
@@ -153,7 +157,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
153 | } | 157 | } |
154 | } | 158 | } |
155 | 159 | ||
156 | return array($headers, $content); | 160 | return [$headers, $content]; |
157 | } | 161 | } |
158 | 162 | ||
159 | /** | 163 | /** |
@@ -184,15 +188,15 @@ function get_http_response_fallback( | |||
184 | $acceptLanguage, | 188 | $acceptLanguage, |
185 | $maxRedr | 189 | $maxRedr |
186 | ) { | 190 | ) { |
187 | $options = array( | 191 | $options = [ |
188 | 'http' => array( | 192 | 'http' => [ |
189 | 'method' => 'GET', | 193 | 'method' => 'GET', |
190 | 'timeout' => $timeout, | 194 | 'timeout' => $timeout, |
191 | 'user_agent' => $userAgent, | 195 | 'user_agent' => $userAgent, |
192 | 'header' => "Accept: */*\r\n" | 196 | 'header' => "Accept: */*\r\n" |
193 | . 'Accept-Language: ' . $acceptLanguage | 197 | . 'Accept-Language: ' . $acceptLanguage |
194 | ) | 198 | ] |
195 | ); | 199 | ]; |
196 | 200 | ||
197 | stream_context_set_default($options); | 201 | stream_context_set_default($options); |
198 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); | 202 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); |
@@ -203,7 +207,7 @@ function get_http_response_fallback( | |||
203 | } | 207 | } |
204 | 208 | ||
205 | if (! $headers) { | 209 | if (! $headers) { |
206 | return array($headers, false); | 210 | return [$headers, false]; |
207 | } | 211 | } |
208 | 212 | ||
209 | try { | 213 | try { |
@@ -211,10 +215,10 @@ function get_http_response_fallback( | |||
211 | $context = stream_context_create($options); | 215 | $context = stream_context_create($options); |
212 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); | 216 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); |
213 | } catch (Exception $exc) { | 217 | } catch (Exception $exc) { |
214 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); | 218 | return [[0 => 'HTTP Error'], $exc->getMessage()]; |
215 | } | 219 | } |
216 | 220 | ||
217 | return array($headers, $content); | 221 | return [$headers, $content]; |
218 | } | 222 | } |
219 | 223 | ||
220 | /** | 224 | /** |
@@ -233,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
233 | } | 237 | } |
234 | 238 | ||
235 | // Headers found, redirection found, and limit not reached. | 239 | // Headers found, redirection found, and limit not reached. |
236 | if ($redirectionLimit-- > 0 | 240 | if ( |
241 | $redirectionLimit-- > 0 | ||
237 | && !empty($headers) | 242 | && !empty($headers) |
238 | && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) | 243 | && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) |
239 | && !empty($headers['Location'])) { | 244 | && !empty($headers['Location']) |
245 | ) { | ||
240 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; | 246 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; |
241 | if ($redirection != $url) { | 247 | if ($redirection != $url) { |
242 | $redirection = getAbsoluteUrl($url, $redirection); | 248 | $redirection = getAbsoluteUrl($url, $redirection); |
@@ -244,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
244 | } | 250 | } |
245 | } | 251 | } |
246 | 252 | ||
247 | return array($headers, $url); | 253 | return [$headers, $url]; |
248 | } | 254 | } |
249 | 255 | ||
250 | /** | 256 | /** |
@@ -266,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl) | |||
266 | } | 272 | } |
267 | 273 | ||
268 | $parts = parse_url($originalUrl); | 274 | $parts = parse_url($originalUrl); |
269 | $final = $parts['scheme'] .'://'. $parts['host']; | 275 | $final = $parts['scheme'] . '://' . $parts['host']; |
270 | $final .= (!empty($parts['port'])) ? $parts['port'] : ''; | 276 | $final .= (!empty($parts['port'])) ? $parts['port'] : ''; |
271 | $final .= '/'; | 277 | $final .= '/'; |
272 | if ($newUrl[0] != '/') { | 278 | if ($newUrl[0] != '/') { |
@@ -319,7 +325,8 @@ function server_url($server) | |||
319 | $scheme = 'https'; | 325 | $scheme = 'https'; |
320 | } | 326 | } |
321 | 327 | ||
322 | if (($scheme == 'http' && $port != '80') | 328 | if ( |
329 | ($scheme == 'http' && $port != '80') | ||
323 | || ($scheme == 'https' && $port != '443') | 330 | || ($scheme == 'https' && $port != '443') |
324 | ) { | 331 | ) { |
325 | $port = ':' . $port; | 332 | $port = ':' . $port; |
@@ -340,22 +347,26 @@ function server_url($server) | |||
340 | $host = $server['SERVER_NAME']; | 347 | $host = $server['SERVER_NAME']; |
341 | } | 348 | } |
342 | 349 | ||
343 | return $scheme.'://'.$host.$port; | 350 | return $scheme . '://' . $host . $port; |
344 | } | 351 | } |
345 | 352 | ||
346 | // SSL detection | 353 | // SSL detection |
347 | if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') | 354 | if ( |
348 | || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) { | 355 | (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') |
356 | || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443') | ||
357 | ) { | ||
349 | $scheme = 'https'; | 358 | $scheme = 'https'; |
350 | } | 359 | } |
351 | 360 | ||
352 | // Do not append standard port values | 361 | // Do not append standard port values |
353 | if (($scheme == 'http' && $server['SERVER_PORT'] != '80') | 362 | if ( |
354 | || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) { | 363 | ($scheme == 'http' && $server['SERVER_PORT'] != '80') |
355 | $port = ':'.$server['SERVER_PORT']; | 364 | || ($scheme == 'https' && $server['SERVER_PORT'] != '443') |
365 | ) { | ||
366 | $port = ':' . $server['SERVER_PORT']; | ||
356 | } | 367 | } |
357 | 368 | ||
358 | return $scheme.'://'.$server['SERVER_NAME'].$port; | 369 | return $scheme . '://' . $server['SERVER_NAME'] . $port; |
359 | } | 370 | } |
360 | 371 | ||
361 | /** | 372 | /** |
@@ -493,6 +504,46 @@ function is_https($server) | |||
493 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | 504 | * Get cURL callback function for CURLOPT_WRITEFUNCTION |
494 | * | 505 | * |
495 | * @param string $charset to extract from the downloaded page (reference) | 506 | * @param string $charset to extract from the downloaded page (reference) |
507 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
508 | * | ||
509 | * @return Closure | ||
510 | */ | ||
511 | function get_curl_header_callback( | ||
512 | &$charset, | ||
513 | $curlGetInfo = 'curl_getinfo' | ||
514 | ) { | ||
515 | $isRedirected = false; | ||
516 | |||
517 | return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { | ||
518 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
519 | $chunkLength = strlen($data); | ||
520 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
521 | $isRedirected = true; | ||
522 | return $chunkLength; | ||
523 | } | ||
524 | if (!empty($responseCode) && $responseCode !== 200) { | ||
525 | return false; | ||
526 | } | ||
527 | // After a redirection, the content type will keep the previous request value | ||
528 | // until it finds the next content-type header. | ||
529 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
530 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
531 | } | ||
532 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
533 | return false; | ||
534 | } | ||
535 | if (!empty($contentType) && empty($charset)) { | ||
536 | $charset = header_extract_charset($contentType); | ||
537 | } | ||
538 | |||
539 | return $chunkLength; | ||
540 | }; | ||
541 | } | ||
542 | |||
543 | /** | ||
544 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | ||
545 | * | ||
546 | * @param string $charset to extract from the downloaded page (reference) | ||
496 | * @param string $title to extract from the downloaded page (reference) | 547 | * @param string $title to extract from the downloaded page (reference) |
497 | * @param string $description to extract from the downloaded page (reference) | 548 | * @param string $description to extract from the downloaded page (reference) |
498 | * @param string $keywords to extract from the downloaded page (reference) | 549 | * @param string $keywords to extract from the downloaded page (reference) |
@@ -507,9 +558,8 @@ function get_curl_download_callback( | |||
507 | &$description, | 558 | &$description, |
508 | &$keywords, | 559 | &$keywords, |
509 | $retrieveDescription, | 560 | $retrieveDescription, |
510 | $curlGetInfo = 'curl_getinfo' | 561 | $tagsSeparator |
511 | ) { | 562 | ) { |
512 | $isRedirected = false; | ||
513 | $currentChunk = 0; | 563 | $currentChunk = 0; |
514 | $foundChunk = null; | 564 | $foundChunk = null; |
515 | 565 | ||
@@ -524,37 +574,22 @@ function get_curl_download_callback( | |||
524 | * | 574 | * |
525 | * @return int|bool length of $data or false if we need to stop the download | 575 | * @return int|bool length of $data or false if we need to stop the download |
526 | */ | 576 | */ |
527 | return function (&$ch, $data) use ( | 577 | return function ( |
578 | $ch, | ||
579 | $data | ||
580 | ) use ( | ||
528 | $retrieveDescription, | 581 | $retrieveDescription, |
529 | $curlGetInfo, | 582 | $tagsSeparator, |
530 | &$charset, | 583 | &$charset, |
531 | &$title, | 584 | &$title, |
532 | &$description, | 585 | &$description, |
533 | &$keywords, | 586 | &$keywords, |
534 | &$isRedirected, | ||
535 | &$currentChunk, | 587 | &$currentChunk, |
536 | &$foundChunk | 588 | &$foundChunk |
537 | ) { | 589 | ) { |
590 | $chunkLength = strlen($data); | ||
538 | $currentChunk++; | 591 | $currentChunk++; |
539 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | 592 | |
540 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
541 | $isRedirected = true; | ||
542 | return strlen($data); | ||
543 | } | ||
544 | if (!empty($responseCode) && $responseCode !== 200) { | ||
545 | return false; | ||
546 | } | ||
547 | // After a redirection, the content type will keep the previous request value | ||
548 | // until it finds the next content-type header. | ||
549 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
550 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
551 | } | ||
552 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
553 | return false; | ||
554 | } | ||
555 | if (!empty($contentType) && empty($charset)) { | ||
556 | $charset = header_extract_charset($contentType); | ||
557 | } | ||
558 | if (empty($charset)) { | 593 | if (empty($charset)) { |
559 | $charset = html_extract_charset($data); | 594 | $charset = html_extract_charset($data); |
560 | } | 595 | } |
@@ -562,6 +597,10 @@ function get_curl_download_callback( | |||
562 | $title = html_extract_title($data); | 597 | $title = html_extract_title($data); |
563 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | 598 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; |
564 | } | 599 | } |
600 | if (empty($title)) { | ||
601 | $title = html_extract_tag('title', $data); | ||
602 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
603 | } | ||
565 | if ($retrieveDescription && empty($description)) { | 604 | if ($retrieveDescription && empty($description)) { |
566 | $description = html_extract_tag('description', $data); | 605 | $description = html_extract_tag('description', $data); |
567 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | 606 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; |
@@ -571,10 +610,10 @@ function get_curl_download_callback( | |||
571 | if (! empty($keywords)) { | 610 | if (! empty($keywords)) { |
572 | $foundChunk = $currentChunk; | 611 | $foundChunk = $currentChunk; |
573 | // Keywords use the format tag1, tag2 multiple words, tag | 612 | // Keywords use the format tag1, tag2 multiple words, tag |
574 | // So we format them to match Shaarli's separator and glue multiple words with '-' | 613 | // So we split the result with `,`, then if a tag contains the separator we replace it by `-`. |
575 | $keywords = implode(' ', array_map(function($keyword) { | 614 | $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string { |
576 | return implode('-', preg_split('/\s+/', trim($keyword))); | 615 | return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-'); |
577 | }, explode(',', $keywords))); | 616 | }, tags_str2array($keywords, ',')), $tagsSeparator); |
578 | } | 617 | } |
579 | } | 618 | } |
580 | 619 | ||
@@ -582,7 +621,8 @@ function get_curl_download_callback( | |||
582 | // If we already found either the title, description or keywords, | 621 | // If we already found either the title, description or keywords, |
583 | // it's highly unlikely that we'll found the other metas further than | 622 | // it's highly unlikely that we'll found the other metas further than |
584 | // in the same chunk of data or the next one. So we also stop the download after that. | 623 | // in the same chunk of data or the next one. So we also stop the download after that. |
585 | if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | 624 | if ( |
625 | (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | ||
586 | && (! $retrieveDescription | 626 | && (! $retrieveDescription |
587 | || $foundChunk < $currentChunk | 627 | || $foundChunk < $currentChunk |
588 | || (!empty($title) && !empty($description) && !empty($keywords)) | 628 | || (!empty($title) && !empty($description) && !empty($keywords)) |
@@ -591,6 +631,6 @@ function get_curl_download_callback( | |||
591 | return false; | 631 | return false; |
592 | } | 632 | } |
593 | 633 | ||
594 | return strlen($data); | 634 | return $chunkLength; |
595 | }; | 635 | }; |
596 | } | 636 | } |
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..2e1401ec --- /dev/null +++ b/application/http/MetadataRetriever.php | |||
@@ -0,0 +1,69 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Http; | ||
6 | |||
7 | use Shaarli\Config\ConfigManager; | ||
8 | |||
9 | /** | ||
10 | * HTTP Tool used to extract metadata from external URL (title, description, etc.). | ||
11 | */ | ||
12 | class MetadataRetriever | ||
13 | { | ||
14 | /** @var ConfigManager */ | ||
15 | protected $conf; | ||
16 | |||
17 | /** @var HttpAccess */ | ||
18 | protected $httpAccess; | ||
19 | |||
20 | public function __construct(ConfigManager $conf, HttpAccess $httpAccess) | ||
21 | { | ||
22 | $this->conf = $conf; | ||
23 | $this->httpAccess = $httpAccess; | ||
24 | } | ||
25 | |||
26 | /** | ||
27 | * Retrieve metadata for given URL. | ||
28 | * | ||
29 | * @return array [ | ||
30 | * 'title' => <remote title>, | ||
31 | * 'description' => <remote description>, | ||
32 | * 'tags' => <remote keywords>, | ||
33 | * ] | ||
34 | */ | ||
35 | public function retrieve(string $url): array | ||
36 | { | ||
37 | $charset = null; | ||
38 | $title = null; | ||
39 | $description = null; | ||
40 | $tags = null; | ||
41 | |||
42 | // Short timeout to keep the application responsive | ||
43 | // The callback will fill $charset and $title with data from the downloaded page. | ||
44 | $this->httpAccess->getHttpResponse( | ||
45 | $url, | ||
46 | $this->conf->get('general.download_timeout', 30), | ||
47 | $this->conf->get('general.download_max_size', 4194304), | ||
48 | $this->httpAccess->getCurlHeaderCallback($charset), | ||
49 | $this->httpAccess->getCurlDownloadCallback( | ||
50 | $charset, | ||
51 | $title, | ||
52 | $description, | ||
53 | $tags, | ||
54 | $this->conf->get('general.retrieve_description'), | ||
55 | $this->conf->get('general.tags_separator', ' ') | ||
56 | ) | ||
57 | ); | ||
58 | |||
59 | if (!empty($title) && strtolower($charset) !== 'utf-8') { | ||
60 | $title = mb_convert_encoding($title, 'utf-8', $charset); | ||
61 | } | ||
62 | |||
63 | return [ | ||
64 | 'title' => $title, | ||
65 | 'description' => $description, | ||
66 | 'tags' => $tags, | ||
67 | ]; | ||
68 | } | ||
69 | } | ||
diff --git a/application/http/Url.php b/application/http/Url.php index 90444a2f..fe87088f 100644 --- a/application/http/Url.php +++ b/application/http/Url.php | |||
@@ -17,7 +17,7 @@ namespace Shaarli\Http; | |||
17 | */ | 17 | */ |
18 | class Url | 18 | class Url |
19 | { | 19 | { |
20 | private static $annoyingQueryParams = array( | 20 | private static $annoyingQueryParams = [ |
21 | 21 | ||
22 | 'action_object_map=', | 22 | 'action_object_map=', |
23 | 'action_ref_map=', | 23 | 'action_ref_map=', |
@@ -37,15 +37,15 @@ class Url | |||
37 | 37 | ||
38 | // Other | 38 | // Other |
39 | 'campaign_' | 39 | 'campaign_' |
40 | ); | 40 | ]; |
41 | 41 | ||
42 | private static $annoyingFragments = array( | 42 | private static $annoyingFragments = [ |
43 | // ATInternet | 43 | // ATInternet |
44 | 'xtor=RSS-', | 44 | 'xtor=RSS-', |
45 | 45 | ||
46 | // Misc. | 46 | // Misc. |
47 | 'tk.rss_all' | 47 | 'tk.rss_all' |
48 | ); | 48 | ]; |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * URL parts represented as an array | 51 | * URL parts represented as an array |
@@ -120,7 +120,7 @@ class Url | |||
120 | foreach (self::$annoyingQueryParams as $annoying) { | 120 | foreach (self::$annoyingQueryParams as $annoying) { |
121 | foreach ($queryParams as $param) { | 121 | foreach ($queryParams as $param) { |
122 | if (startsWith($param, $annoying)) { | 122 | if (startsWith($param, $annoying)) { |
123 | $queryParams = array_diff($queryParams, array($param)); | 123 | $queryParams = array_diff($queryParams, [$param]); |
124 | continue; | 124 | continue; |
125 | } | 125 | } |
126 | } | 126 | } |
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php index e8d1a283..de5b7db1 100644 --- a/application/http/UrlUtils.php +++ b/application/http/UrlUtils.php | |||
@@ -1,4 +1,5 @@ | |||
1 | <?php | 1 | <?php |
2 | |||
2 | /** | 3 | /** |
3 | * Converts an array-represented URL to a string | 4 | * Converts an array-represented URL to a string |
4 | * | 5 | * |
@@ -12,15 +13,15 @@ | |||
12 | */ | 13 | */ |
13 | function unparse_url($parsedUrl) | 14 | function unparse_url($parsedUrl) |
14 | { | 15 | { |
15 | $scheme = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : ''; | 16 | $scheme = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : ''; |
16 | $host = isset($parsedUrl['host']) ? $parsedUrl['host'] : ''; | 17 | $host = isset($parsedUrl['host']) ? $parsedUrl['host'] : ''; |
17 | $port = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : ''; | 18 | $port = isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : ''; |
18 | $user = isset($parsedUrl['user']) ? $parsedUrl['user'] : ''; | 19 | $user = isset($parsedUrl['user']) ? $parsedUrl['user'] : ''; |
19 | $pass = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass'] : ''; | 20 | $pass = isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : ''; |
20 | $pass = ($user || $pass) ? "$pass@" : ''; | 21 | $pass = ($user || $pass) ? "$pass@" : ''; |
21 | $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : ''; | 22 | $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : ''; |
22 | $query = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : ''; | 23 | $query = isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : ''; |
23 | $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : ''; | 24 | $fragment = isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : ''; |
24 | 25 | ||
25 | return "$scheme$user$pass$host$port$path$query$fragment"; | 26 | return "$scheme$user$pass$host$port$path$query$fragment"; |
26 | } | 27 | } |