aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/http
diff options
context:
space:
mode:
authoryude <yudesleepy@gmail.com>2021-01-04 18:51:10 +0900
committerGitHub <noreply@github.com>2021-01-04 18:51:10 +0900
commite6754f2154a79abd8e5e64bd923f6984aa9ad44b (patch)
treef074119530bb59ef155938ea367f719f1e4b70f1 /application/http
parent5256b4287021342a9f8868967b2a77e481314331 (diff)
parented4ee8f0297941ac83300389b7de6a293312d20e (diff)
downloadShaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.gz
Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.zst
Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.zip
Merge pull request #2 from shaarli/master
Merge fork source
Diffstat (limited to 'application/http')
-rw-r--r--application/http/HttpAccess.php20
-rw-r--r--application/http/HttpUtils.php198
-rw-r--r--application/http/MetadataRetriever.php74
-rw-r--r--application/http/Url.php10
-rw-r--r--application/http/UrlUtils.php11
5 files changed, 219 insertions, 94 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
index 81d9e076..e80e0c01 100644
--- a/application/http/HttpAccess.php
+++ b/application/http/HttpAccess.php
@@ -14,9 +14,14 @@ namespace Shaarli\Http;
14 */ 14 */
15class HttpAccess 15class HttpAccess
16{ 16{
17 public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) 17 public function getHttpResponse(
18 { 18 $url,
19 return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); 19 $timeout = 30,
20 $maxBytes = 4194304,
21 $curlHeaderFunction = null,
22 $curlWriteFunction = null
23 ) {
24 return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
20 } 25 }
21 26
22 public function getCurlDownloadCallback( 27 public function getCurlDownloadCallback(
@@ -25,7 +30,7 @@ class HttpAccess
25 &$description, 30 &$description,
26 &$keywords, 31 &$keywords,
27 $retrieveDescription, 32 $retrieveDescription,
28 $curlGetInfo = 'curl_getinfo' 33 $tagsSeparator
29 ) { 34 ) {
30 return get_curl_download_callback( 35 return get_curl_download_callback(
31 $charset, 36 $charset,
@@ -33,7 +38,12 @@ class HttpAccess
33 $description, 38 $description,
34 $keywords, 39 $keywords,
35 $retrieveDescription, 40 $retrieveDescription,
36 $curlGetInfo 41 $tagsSeparator
37 ); 42 );
38 } 43 }
44
45 public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
46 {
47 return get_curl_header_callback($charset, $curlGetInfo);
48 }
39} 49}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 9f414073..4bde1d5b 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
6 * GET an HTTP URL to retrieve its content 6 * GET an HTTP URL to retrieve its content
7 * Uses the cURL library or a fallback method 7 * Uses the cURL library or a fallback method
8 * 8 *
9 * @param string $url URL to get (http://...) 9 * @param string $url URL to get (http://...)
10 * @param int $timeout network timeout (in seconds) 10 * @param int $timeout network timeout (in seconds)
11 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) 11 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
12 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). 12 * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
13 * Can be used to add download conditions on the 13 * (CURLOPT_HEADERFUNCTION)
14 * headers (response code, content type, etc.). 14 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
15 * Can be used to add download conditions on the
16 * headers (response code, content type, etc.).
15 * 17 *
16 * @return array HTTP response headers, downloaded content 18 * @return array HTTP response headers, downloaded content
17 * 19 *
@@ -35,13 +37,18 @@ use Shaarli\Http\Url;
35 * @see http://stackoverflow.com/q/9183178 37 * @see http://stackoverflow.com/q/9183178
36 * @see http://stackoverflow.com/q/1462720 38 * @see http://stackoverflow.com/q/1462720
37 */ 39 */
38function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) 40function get_http_response(
39{ 41 $url,
42 $timeout = 30,
43 $maxBytes = 4194304,
44 $curlHeaderFunction = null,
45 $curlWriteFunction = null
46) {
40 $urlObj = new Url($url); 47 $urlObj = new Url($url);
41 $cleanUrl = $urlObj->idnToAscii(); 48 $cleanUrl = $urlObj->idnToAscii();
42 49
43 if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { 50 if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
44 return array(array(0 => 'Invalid HTTP UrlUtils'), false); 51 return [[0 => 'Invalid HTTP UrlUtils'], false];
45 } 52 }
46 53
47 $userAgent = 54 $userAgent =
@@ -64,42 +71,39 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
64 71
65 $ch = curl_init($cleanUrl); 72 $ch = curl_init($cleanUrl);
66 if ($ch === false) { 73 if ($ch === false) {
67 return array(array(0 => 'curl_init() error'), false); 74 return [[0 => 'curl_init() error'], false];
68 } 75 }
69 76
70 // General cURL settings 77 // General cURL settings
71 curl_setopt($ch, CURLOPT_AUTOREFERER, true); 78 curl_setopt($ch, CURLOPT_AUTOREFERER, true);
72 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 79 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
73 curl_setopt($ch, CURLOPT_HEADER, true); 80 // Default header download if the $curlHeaderFunction is not defined
81 curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
74 curl_setopt( 82 curl_setopt(
75 $ch, 83 $ch,
76 CURLOPT_HTTPHEADER, 84 CURLOPT_HTTPHEADER,
77 array('Accept-Language: ' . $acceptLanguage) 85 ['Accept-Language: ' . $acceptLanguage]
78 ); 86 );
79 curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); 87 curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
80 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); 88 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
81 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); 89 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
82 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); 90 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
83 91
92 // Max download size management
93 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
94 curl_setopt($ch, CURLOPT_NOPROGRESS, false);
95 if (is_callable($curlHeaderFunction)) {
96 curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
97 }
84 if (is_callable($curlWriteFunction)) { 98 if (is_callable($curlWriteFunction)) {
85 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); 99 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
86 } 100 }
87
88 // Max download size management
89 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
90 curl_setopt($ch, CURLOPT_NOPROGRESS, false);
91 curl_setopt( 101 curl_setopt(
92 $ch, 102 $ch,
93 CURLOPT_PROGRESSFUNCTION, 103 CURLOPT_PROGRESSFUNCTION,
94 function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { 104 function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
95 if (version_compare(phpversion(), '5.5', '<')) { 105 $downloaded = $arg2;
96 // PHP version lower than 5.5 106
97 // Callback has 4 arguments
98 $downloaded = $arg1;
99 } else {
100 // Callback has 5 arguments
101 $downloaded = $arg2;
102 }
103 // Non-zero return stops downloading 107 // Non-zero return stops downloading
104 return ($downloaded > $maxBytes) ? 1 : 0; 108 return ($downloaded > $maxBytes) ? 1 : 0;
105 } 109 }
@@ -118,9 +122,9 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
118 * Removing this would require updating 122 * Removing this would require updating
119 * GetHttpUrlTest::testGetInvalidRemoteUrl() 123 * GetHttpUrlTest::testGetInvalidRemoteUrl()
120 */ 124 */
121 return array(false, false); 125 return [false, false];
122 } 126 }
123 return array(array(0 => 'curl_exec() error: ' . $errorStr), false); 127 return [[0 => 'curl_exec() error: ' . $errorStr], false];
124 } 128 }
125 129
126 // Formatting output like the fallback method 130 // Formatting output like the fallback method
@@ -131,7 +135,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
131 $rawHeadersLastRedir = end($rawHeadersArrayRedirs); 135 $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
132 136
133 $content = substr($response, $headSize); 137 $content = substr($response, $headSize);
134 $headers = array(); 138 $headers = [];
135 foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { 139 foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
136 if (empty($line) || ctype_space($line)) { 140 if (empty($line) || ctype_space($line)) {
137 continue; 141 continue;
@@ -142,7 +146,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
142 $value = $splitLine[1]; 146 $value = $splitLine[1];
143 if (array_key_exists($key, $headers)) { 147 if (array_key_exists($key, $headers)) {
144 if (!is_array($headers[$key])) { 148 if (!is_array($headers[$key])) {
145 $headers[$key] = array(0 => $headers[$key]); 149 $headers[$key] = [0 => $headers[$key]];
146 } 150 }
147 $headers[$key][] = $value; 151 $headers[$key][] = $value;
148 } else { 152 } else {
@@ -153,7 +157,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
153 } 157 }
154 } 158 }
155 159
156 return array($headers, $content); 160 return [$headers, $content];
157} 161}
158 162
159/** 163/**
@@ -184,15 +188,15 @@ function get_http_response_fallback(
184 $acceptLanguage, 188 $acceptLanguage,
185 $maxRedr 189 $maxRedr
186) { 190) {
187 $options = array( 191 $options = [
188 'http' => array( 192 'http' => [
189 'method' => 'GET', 193 'method' => 'GET',
190 'timeout' => $timeout, 194 'timeout' => $timeout,
191 'user_agent' => $userAgent, 195 'user_agent' => $userAgent,
192 'header' => "Accept: */*\r\n" 196 'header' => "Accept: */*\r\n"
193 . 'Accept-Language: ' . $acceptLanguage 197 . 'Accept-Language: ' . $acceptLanguage
194 ) 198 ]
195 ); 199 ];
196 200
197 stream_context_set_default($options); 201 stream_context_set_default($options);
198 list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); 202 list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
@@ -203,7 +207,7 @@ function get_http_response_fallback(
203 } 207 }
204 208
205 if (! $headers) { 209 if (! $headers) {
206 return array($headers, false); 210 return [$headers, false];
207 } 211 }
208 212
209 try { 213 try {
@@ -211,10 +215,10 @@ function get_http_response_fallback(
211 $context = stream_context_create($options); 215 $context = stream_context_create($options);
212 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); 216 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
213 } catch (Exception $exc) { 217 } catch (Exception $exc) {
214 return array(array(0 => 'HTTP Error'), $exc->getMessage()); 218 return [[0 => 'HTTP Error'], $exc->getMessage()];
215 } 219 }
216 220
217 return array($headers, $content); 221 return [$headers, $content];
218} 222}
219 223
220/** 224/**
@@ -233,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3)
233 } 237 }
234 238
235 // Headers found, redirection found, and limit not reached. 239 // Headers found, redirection found, and limit not reached.
236 if ($redirectionLimit-- > 0 240 if (
241 $redirectionLimit-- > 0
237 && !empty($headers) 242 && !empty($headers)
238 && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) 243 && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
239 && !empty($headers['Location'])) { 244 && !empty($headers['Location'])
245 ) {
240 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; 246 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
241 if ($redirection != $url) { 247 if ($redirection != $url) {
242 $redirection = getAbsoluteUrl($url, $redirection); 248 $redirection = getAbsoluteUrl($url, $redirection);
@@ -244,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3)
244 } 250 }
245 } 251 }
246 252
247 return array($headers, $url); 253 return [$headers, $url];
248} 254}
249 255
250/** 256/**
@@ -266,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl)
266 } 272 }
267 273
268 $parts = parse_url($originalUrl); 274 $parts = parse_url($originalUrl);
269 $final = $parts['scheme'] .'://'. $parts['host']; 275 $final = $parts['scheme'] . '://' . $parts['host'];
270 $final .= (!empty($parts['port'])) ? $parts['port'] : ''; 276 $final .= (!empty($parts['port'])) ? $parts['port'] : '';
271 $final .= '/'; 277 $final .= '/';
272 if ($newUrl[0] != '/') { 278 if ($newUrl[0] != '/') {
@@ -319,7 +325,8 @@ function server_url($server)
319 $scheme = 'https'; 325 $scheme = 'https';
320 } 326 }
321 327
322 if (($scheme == 'http' && $port != '80') 328 if (
329 ($scheme == 'http' && $port != '80')
323 || ($scheme == 'https' && $port != '443') 330 || ($scheme == 'https' && $port != '443')
324 ) { 331 ) {
325 $port = ':' . $port; 332 $port = ':' . $port;
@@ -340,22 +347,26 @@ function server_url($server)
340 $host = $server['SERVER_NAME']; 347 $host = $server['SERVER_NAME'];
341 } 348 }
342 349
343 return $scheme.'://'.$host.$port; 350 return $scheme . '://' . $host . $port;
344 } 351 }
345 352
346 // SSL detection 353 // SSL detection
347 if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') 354 if (
348 || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) { 355 (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
356 || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')
357 ) {
349 $scheme = 'https'; 358 $scheme = 'https';
350 } 359 }
351 360
352 // Do not append standard port values 361 // Do not append standard port values
353 if (($scheme == 'http' && $server['SERVER_PORT'] != '80') 362 if (
354 || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) { 363 ($scheme == 'http' && $server['SERVER_PORT'] != '80')
355 $port = ':'.$server['SERVER_PORT']; 364 || ($scheme == 'https' && $server['SERVER_PORT'] != '443')
365 ) {
366 $port = ':' . $server['SERVER_PORT'];
356 } 367 }
357 368
358 return $scheme.'://'.$server['SERVER_NAME'].$port; 369 return $scheme . '://' . $server['SERVER_NAME'] . $port;
359} 370}
360 371
361/** 372/**
@@ -493,6 +504,46 @@ function is_https($server)
493 * Get cURL callback function for CURLOPT_WRITEFUNCTION 504 * Get cURL callback function for CURLOPT_WRITEFUNCTION
494 * 505 *
495 * @param string $charset to extract from the downloaded page (reference) 506 * @param string $charset to extract from the downloaded page (reference)
507 * @param string $curlGetInfo Optionally overrides curl_getinfo function
508 *
509 * @return Closure
510 */
511function get_curl_header_callback(
512 &$charset,
513 $curlGetInfo = 'curl_getinfo'
514) {
515 $isRedirected = false;
516
517 return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
518 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
519 $chunkLength = strlen($data);
520 if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
521 $isRedirected = true;
522 return $chunkLength;
523 }
524 if (!empty($responseCode) && $responseCode !== 200) {
525 return false;
526 }
527 // After a redirection, the content type will keep the previous request value
528 // until it finds the next content-type header.
529 if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
530 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
531 }
532 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
533 return false;
534 }
535 if (!empty($contentType) && empty($charset)) {
536 $charset = header_extract_charset($contentType);
537 }
538
539 return $chunkLength;
540 };
541}
542
543/**
544 * Get cURL callback function for CURLOPT_WRITEFUNCTION
545 *
546 * @param string $charset to extract from the downloaded page (reference)
496 * @param string $title to extract from the downloaded page (reference) 547 * @param string $title to extract from the downloaded page (reference)
497 * @param string $description to extract from the downloaded page (reference) 548 * @param string $description to extract from the downloaded page (reference)
498 * @param string $keywords to extract from the downloaded page (reference) 549 * @param string $keywords to extract from the downloaded page (reference)
@@ -507,9 +558,8 @@ function get_curl_download_callback(
507 &$description, 558 &$description,
508 &$keywords, 559 &$keywords,
509 $retrieveDescription, 560 $retrieveDescription,
510 $curlGetInfo = 'curl_getinfo' 561 $tagsSeparator
511) { 562) {
512 $isRedirected = false;
513 $currentChunk = 0; 563 $currentChunk = 0;
514 $foundChunk = null; 564 $foundChunk = null;
515 565
@@ -524,37 +574,22 @@ function get_curl_download_callback(
524 * 574 *
525 * @return int|bool length of $data or false if we need to stop the download 575 * @return int|bool length of $data or false if we need to stop the download
526 */ 576 */
527 return function (&$ch, $data) use ( 577 return function (
578 $ch,
579 $data
580 ) use (
528 $retrieveDescription, 581 $retrieveDescription,
529 $curlGetInfo, 582 $tagsSeparator,
530 &$charset, 583 &$charset,
531 &$title, 584 &$title,
532 &$description, 585 &$description,
533 &$keywords, 586 &$keywords,
534 &$isRedirected,
535 &$currentChunk, 587 &$currentChunk,
536 &$foundChunk 588 &$foundChunk
537 ) { 589 ) {
590 $chunkLength = strlen($data);
538 $currentChunk++; 591 $currentChunk++;
539 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); 592
540 if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
541 $isRedirected = true;
542 return strlen($data);
543 }
544 if (!empty($responseCode) && $responseCode !== 200) {
545 return false;
546 }
547 // After a redirection, the content type will keep the previous request value
548 // until it finds the next content-type header.
549 if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
550 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
551 }
552 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
553 return false;
554 }
555 if (!empty($contentType) && empty($charset)) {
556 $charset = header_extract_charset($contentType);
557 }
558 if (empty($charset)) { 593 if (empty($charset)) {
559 $charset = html_extract_charset($data); 594 $charset = html_extract_charset($data);
560 } 595 }
@@ -562,6 +597,10 @@ function get_curl_download_callback(
562 $title = html_extract_title($data); 597 $title = html_extract_title($data);
563 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; 598 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
564 } 599 }
600 if (empty($title)) {
601 $title = html_extract_tag('title', $data);
602 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
603 }
565 if ($retrieveDescription && empty($description)) { 604 if ($retrieveDescription && empty($description)) {
566 $description = html_extract_tag('description', $data); 605 $description = html_extract_tag('description', $data);
567 $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; 606 $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
@@ -571,10 +610,10 @@ function get_curl_download_callback(
571 if (! empty($keywords)) { 610 if (! empty($keywords)) {
572 $foundChunk = $currentChunk; 611 $foundChunk = $currentChunk;
573 // Keywords use the format tag1, tag2 multiple words, tag 612 // Keywords use the format tag1, tag2 multiple words, tag
574 // So we format them to match Shaarli's separator and glue multiple words with '-' 613 // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
575 $keywords = implode(' ', array_map(function($keyword) { 614 $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
576 return implode('-', preg_split('/\s+/', trim($keyword))); 615 return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
577 }, explode(',', $keywords))); 616 }, tags_str2array($keywords, ',')), $tagsSeparator);
578 } 617 }
579 } 618 }
580 619
@@ -582,7 +621,8 @@ function get_curl_download_callback(
582 // If we already found either the title, description or keywords, 621 // If we already found either the title, description or keywords,
583 // it's highly unlikely that we'll found the other metas further than 622 // it's highly unlikely that we'll found the other metas further than
584 // in the same chunk of data or the next one. So we also stop the download after that. 623 // in the same chunk of data or the next one. So we also stop the download after that.
585 if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null 624 if (
625 (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
586 && (! $retrieveDescription 626 && (! $retrieveDescription
587 || $foundChunk < $currentChunk 627 || $foundChunk < $currentChunk
588 || (!empty($title) && !empty($description) && !empty($keywords)) 628 || (!empty($title) && !empty($description) && !empty($keywords))
@@ -591,6 +631,6 @@ function get_curl_download_callback(
591 return false; 631 return false;
592 } 632 }
593 633
594 return strlen($data); 634 return $chunkLength;
595 }; 635 };
596} 636}
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php
new file mode 100644
index 00000000..cfc72583
--- /dev/null
+++ b/application/http/MetadataRetriever.php
@@ -0,0 +1,74 @@
1<?php
2
3declare(strict_types=1);
4
5namespace Shaarli\Http;
6
7use Shaarli\Config\ConfigManager;
8
9/**
10 * HTTP Tool used to extract metadata from external URL (title, description, etc.).
11 */
12class MetadataRetriever
13{
14 /** @var ConfigManager */
15 protected $conf;
16
17 /** @var HttpAccess */
18 protected $httpAccess;
19
20 public function __construct(ConfigManager $conf, HttpAccess $httpAccess)
21 {
22 $this->conf = $conf;
23 $this->httpAccess = $httpAccess;
24 }
25
26 /**
27 * Retrieve metadata for given URL.
28 *
29 * @return array [
30 * 'title' => <remote title>,
31 * 'description' => <remote description>,
32 * 'tags' => <remote keywords>,
33 * ]
34 */
35 public function retrieve(string $url): array
36 {
37 $charset = null;
38 $title = null;
39 $description = null;
40 $tags = null;
41
42 // Short timeout to keep the application responsive
43 // The callback will fill $charset and $title with data from the downloaded page.
44 $this->httpAccess->getHttpResponse(
45 $url,
46 $this->conf->get('general.download_timeout', 30),
47 $this->conf->get('general.download_max_size', 4194304),
48 $this->httpAccess->getCurlHeaderCallback($charset),
49 $this->httpAccess->getCurlDownloadCallback(
50 $charset,
51 $title,
52 $description,
53 $tags,
54 $this->conf->get('general.retrieve_description'),
55 $this->conf->get('general.tags_separator', ' ')
56 )
57 );
58
59 if (!empty($title) && strtolower($charset) !== 'utf-8') {
60 $title = mb_convert_encoding($title, 'utf-8', $charset);
61 }
62
63 return array_map([$this, 'cleanMetadata'], [
64 'title' => $title,
65 'description' => $description,
66 'tags' => $tags,
67 ]);
68 }
69
70 protected function cleanMetadata($data): ?string
71 {
72 return !is_string($data) || empty(trim($data)) ? null : trim($data);
73 }
74}
diff --git a/application/http/Url.php b/application/http/Url.php
index 90444a2f..fe87088f 100644
--- a/application/http/Url.php
+++ b/application/http/Url.php
@@ -17,7 +17,7 @@ namespace Shaarli\Http;
17 */ 17 */
18class Url 18class Url
19{ 19{
20 private static $annoyingQueryParams = array( 20 private static $annoyingQueryParams = [
21 // Facebook 21 // Facebook
22 'action_object_map=', 22 'action_object_map=',
23 'action_ref_map=', 23 'action_ref_map=',
@@ -37,15 +37,15 @@ class Url
37 37
38 // Other 38 // Other
39 'campaign_' 39 'campaign_'
40 ); 40 ];
41 41
42 private static $annoyingFragments = array( 42 private static $annoyingFragments = [
43 // ATInternet 43 // ATInternet
44 'xtor=RSS-', 44 'xtor=RSS-',
45 45
46 // Misc. 46 // Misc.
47 'tk.rss_all' 47 'tk.rss_all'
48 ); 48 ];
49 49
50 /* 50 /*
51 * URL parts represented as an array 51 * URL parts represented as an array
@@ -120,7 +120,7 @@ class Url
120 foreach (self::$annoyingQueryParams as $annoying) { 120 foreach (self::$annoyingQueryParams as $annoying) {
121 foreach ($queryParams as $param) { 121 foreach ($queryParams as $param) {
122 if (startsWith($param, $annoying)) { 122 if (startsWith($param, $annoying)) {
123 $queryParams = array_diff($queryParams, array($param)); 123 $queryParams = array_diff($queryParams, [$param]);
124 continue; 124 continue;
125 } 125 }
126 } 126 }
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php
index e8d1a283..de5b7db1 100644
--- a/application/http/UrlUtils.php
+++ b/application/http/UrlUtils.php
@@ -1,4 +1,5 @@
1<?php 1<?php
2
2/** 3/**
3 * Converts an array-represented URL to a string 4 * Converts an array-represented URL to a string
4 * 5 *
@@ -12,15 +13,15 @@
12 */ 13 */
13function unparse_url($parsedUrl) 14function unparse_url($parsedUrl)
14{ 15{
15 $scheme = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : ''; 16 $scheme = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '';
16 $host = isset($parsedUrl['host']) ? $parsedUrl['host'] : ''; 17 $host = isset($parsedUrl['host']) ? $parsedUrl['host'] : '';
17 $port = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : ''; 18 $port = isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '';
18 $user = isset($parsedUrl['user']) ? $parsedUrl['user'] : ''; 19 $user = isset($parsedUrl['user']) ? $parsedUrl['user'] : '';
19 $pass = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass'] : ''; 20 $pass = isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '';
20 $pass = ($user || $pass) ? "$pass@" : ''; 21 $pass = ($user || $pass) ? "$pass@" : '';
21 $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : ''; 22 $path = isset($parsedUrl['path']) ? $parsedUrl['path'] : '';
22 $query = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : ''; 23 $query = isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '';
23 $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : ''; 24 $fragment = isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '';
24 25
25 return "$scheme$user$pass$host$port$path$query$fragment"; 26 return "$scheme$user$pass$host$port$path$query$fragment";
26} 27}