diff options
Diffstat (limited to 'application/http')
-rw-r--r-- | application/http/HttpAccess.php | 47 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 196 | ||||
-rw-r--r-- | application/http/MetadataRetriever.php | 69 |
3 files changed, 286 insertions, 26 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php new file mode 100644 index 00000000..646a5264 --- /dev/null +++ b/application/http/HttpAccess.php | |||
@@ -0,0 +1,47 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Http; | ||
6 | |||
7 | /** | ||
8 | * Class HttpAccess | ||
9 | * | ||
10 | * This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`. | ||
11 | * It is injected as a dependency through Shaarli's container. | |
12 | * | ||
13 | * @package Shaarli\Http | ||
14 | */ | ||
15 | class HttpAccess | ||
16 | { | ||
17 | public function getHttpResponse( | ||
18 | $url, | ||
19 | $timeout = 30, | ||
20 | $maxBytes = 4194304, | ||
21 | $curlHeaderFunction = null, | ||
22 | $curlWriteFunction = null | ||
23 | ) { | ||
24 | return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); | ||
25 | } | ||
26 | |||
27 | public function getCurlDownloadCallback( | ||
28 | &$charset, | ||
29 | &$title, | ||
30 | &$description, | ||
31 | &$keywords, | ||
32 | $retrieveDescription | ||
33 | ) { | ||
34 | return get_curl_download_callback( | ||
35 | $charset, | ||
36 | $title, | ||
37 | $description, | ||
38 | $keywords, | ||
39 | $retrieveDescription | ||
40 | ); | ||
41 | } | ||
42 | |||
43 | public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') | ||
44 | { | ||
45 | return get_curl_header_callback($charset, $curlGetInfo); | ||
46 | } | ||
47 | } | ||
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 2ea9195d..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php | |||
@@ -6,12 +6,14 @@ use Shaarli\Http\Url; | |||
6 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
7 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
8 | * | 8 | * |
9 | * @param string $url URL to get (http://...) | 9 | * @param string $url URL to get (http://...) |
10 | * @param int $timeout network timeout (in seconds) | 10 | * @param int $timeout network timeout (in seconds) |
11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) | 11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) |
12 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). | 12 | * @param callable|string $curlHeaderFunction Optional callback called during the download of headers |
13 | * Can be used to add download conditions on the | 13 | * (CURLOPT_HEADERFUNCTION) |
14 | * headers (response code, content type, etc.). | 14 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). |
15 | * Can be used to add download conditions on the | ||
16 | * headers (response code, content type, etc.). | ||
15 | * | 17 | * |
16 | * @return array HTTP response headers, downloaded content | 18 | * @return array HTTP response headers, downloaded content |
17 | * | 19 | * |
@@ -35,8 +37,13 @@ use Shaarli\Http\Url; | |||
35 | * @see http://stackoverflow.com/q/9183178 | 37 | * @see http://stackoverflow.com/q/9183178 |
36 | * @see http://stackoverflow.com/q/1462720 | 38 | * @see http://stackoverflow.com/q/1462720 |
37 | */ | 39 | */ |
38 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 40 | function get_http_response( |
39 | { | 41 | $url, |
42 | $timeout = 30, | ||
43 | $maxBytes = 4194304, | ||
44 | $curlHeaderFunction = null, | ||
45 | $curlWriteFunction = null | ||
46 | ) { | ||
40 | $urlObj = new Url($url); | 47 | $urlObj = new Url($url); |
41 | $cleanUrl = $urlObj->idnToAscii(); | 48 | $cleanUrl = $urlObj->idnToAscii(); |
42 | 49 | ||
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
70 | // General cURL settings | 77 | // General cURL settings |
71 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); | 78 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); |
72 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); | 79 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); |
73 | curl_setopt($ch, CURLOPT_HEADER, true); | 80 | // Default header download if the $curlHeaderFunction is not defined |
81 | curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); | ||
74 | curl_setopt( | 82 | curl_setopt( |
75 | $ch, | 83 | $ch, |
76 | CURLOPT_HTTPHEADER, | 84 | CURLOPT_HTTPHEADER, |
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
81 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | 89 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); |
82 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | 90 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); |
83 | 91 | ||
84 | if (is_callable($curlWriteFunction)) { | ||
85 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
86 | } | ||
87 | |||
88 | // Max download size management | 92 | // Max download size management |
89 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); | 93 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); |
90 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | 94 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); |
95 | if (is_callable($curlHeaderFunction)) { | ||
96 | curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); | ||
97 | } | ||
98 | if (is_callable($curlWriteFunction)) { | ||
99 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
100 | } | ||
91 | curl_setopt( | 101 | curl_setopt( |
92 | $ch, | 102 | $ch, |
93 | CURLOPT_PROGRESSFUNCTION, | 103 | CURLOPT_PROGRESSFUNCTION, |
94 | function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { | 104 | function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { |
95 | if (version_compare(phpversion(), '5.5', '<')) { | 105 | $downloaded = $arg2; |
96 | // PHP version lower than 5.5 | 106 | |
97 | // Callback has 4 arguments | ||
98 | $downloaded = $arg1; | ||
99 | } else { | ||
100 | // Callback has 5 arguments | ||
101 | $downloaded = $arg2; | ||
102 | } | ||
103 | // Non-zero return stops downloading | 107 | // Non-zero return stops downloading |
104 | return ($downloaded > $maxBytes) ? 1 : 0; | 108 | return ($downloaded > $maxBytes) ? 1 : 0; |
105 | } | 109 | } |
@@ -369,7 +373,11 @@ function server_url($server) | |||
369 | */ | 373 | */ |
370 | function index_url($server) | 374 | function index_url($server) |
371 | { | 375 | { |
372 | $scriptname = $server['SCRIPT_NAME']; | 376 | if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) { |
377 | return rtrim(SHAARLI_ROOT_URL, '/') . '/'; | ||
378 | } | ||
379 | |||
380 | $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/'; | ||
373 | if (endsWith($scriptname, 'index.php')) { | 381 | if (endsWith($scriptname, 'index.php')) { |
374 | $scriptname = substr($scriptname, 0, -9); | 382 | $scriptname = substr($scriptname, 0, -9); |
375 | } | 383 | } |
@@ -377,7 +385,7 @@ function index_url($server) | |||
377 | } | 385 | } |
378 | 386 | ||
379 | /** | 387 | /** |
380 | * Returns the absolute URL of the current script, with the query | 388 | * Returns the absolute URL of the current script, with current route and query |
381 | * | 389 | * |
382 | * If the resource is "index.php", then it is removed (for better-looking URLs) | 390 | * If the resource is "index.php", then it is removed (for better-looking URLs) |
383 | * | 391 | * |
@@ -387,10 +395,17 @@ function index_url($server) | |||
387 | */ | 395 | */ |
388 | function page_url($server) | 396 | function page_url($server) |
389 | { | 397 | { |
398 | $scriptname = $server['SCRIPT_NAME'] ?? ''; | ||
399 | if (endsWith($scriptname, 'index.php')) { | ||
400 | $scriptname = substr($scriptname, 0, -9); | ||
401 | } | ||
402 | |||
403 | $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? ''); | ||
390 | if (! empty($server['QUERY_STRING'])) { | 404 | if (! empty($server['QUERY_STRING'])) { |
391 | return index_url($server).'?'.$server['QUERY_STRING']; | 405 | return index_url($server) . $route . '?' . $server['QUERY_STRING']; |
392 | } | 406 | } |
393 | return index_url($server); | 407 | |
408 | return index_url($server) . $route; | ||
394 | } | 409 | } |
395 | 410 | ||
396 | /** | 411 | /** |
@@ -477,3 +492,132 @@ function is_https($server) | |||
477 | 492 | ||
478 | return ! empty($server['HTTPS']); | 493 | return ! empty($server['HTTPS']); |
479 | } | 494 | } |
495 | |||
496 | /** | ||
497 | * Get cURL callback function for CURLOPT_HEADERFUNCTION | |
498 | * | ||
499 | * @param string $charset to extract from the downloaded page (reference) | ||
500 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
501 | * | ||
502 | * @return Closure | ||
503 | */ | ||
504 | function get_curl_header_callback( | ||
505 | &$charset, | ||
506 | $curlGetInfo = 'curl_getinfo' | ||
507 | ) { | ||
508 | $isRedirected = false; | ||
509 | |||
510 | return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { | ||
511 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
512 | $chunkLength = strlen($data); | ||
513 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
514 | $isRedirected = true; | ||
515 | return $chunkLength; | ||
516 | } | ||
517 | if (!empty($responseCode) && $responseCode !== 200) { | ||
518 | return false; | ||
519 | } | ||
520 | // After a redirection, the content type will keep the previous request value | ||
521 | // until it finds the next content-type header. | ||
522 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
523 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
524 | } | ||
525 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
526 | return false; | ||
527 | } | ||
528 | if (!empty($contentType) && empty($charset)) { | ||
529 | $charset = header_extract_charset($contentType); | ||
530 | } | ||
531 | |||
532 | return $chunkLength; | ||
533 | }; | ||
534 | } | ||
535 | |||
536 | /** | ||
537 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | ||
538 | * | ||
539 | * @param string $charset to extract from the downloaded page (reference) | ||
540 | * @param string $title to extract from the downloaded page (reference) | ||
541 | * @param string $description to extract from the downloaded page (reference) | ||
542 | * @param string $keywords to extract from the downloaded page (reference) | ||
543 | * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content | ||
544 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
545 | * | ||
546 | * @return Closure | ||
547 | */ | ||
548 | function get_curl_download_callback( | ||
549 | &$charset, | ||
550 | &$title, | ||
551 | &$description, | ||
552 | &$keywords, | ||
553 | $retrieveDescription | ||
554 | ) { | ||
555 | $currentChunk = 0; | ||
556 | $foundChunk = null; | ||
557 | |||
558 | /** | ||
559 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). | ||
560 | * | ||
561 | * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html' | |
562 | * Then we extract the title and the charset and stop the download when it's done. | ||
563 | * | ||
564 | * @param resource $ch cURL resource | ||
565 | * @param string $data chunk of data being downloaded | ||
566 | * | ||
567 | * @return int|bool length of $data or false if we need to stop the download | ||
568 | */ | ||
569 | return function ($ch, $data) use ( | ||
570 | $retrieveDescription, | ||
571 | &$charset, | ||
572 | &$title, | ||
573 | &$description, | ||
574 | &$keywords, | ||
575 | &$currentChunk, | ||
576 | &$foundChunk | ||
577 | ) { | ||
578 | $chunkLength = strlen($data); | ||
579 | $currentChunk++; | ||
580 | |||
581 | if (empty($charset)) { | ||
582 | $charset = html_extract_charset($data); | ||
583 | } | ||
584 | if (empty($title)) { | ||
585 | $title = html_extract_title($data); | ||
586 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
587 | } | ||
588 | if (empty($title)) { | ||
589 | $title = html_extract_tag('title', $data); | ||
590 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
591 | } | ||
592 | if ($retrieveDescription && empty($description)) { | ||
593 | $description = html_extract_tag('description', $data); | ||
594 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | ||
595 | } | ||
596 | if ($retrieveDescription && empty($keywords)) { | ||
597 | $keywords = html_extract_tag('keywords', $data); | ||
598 | if (! empty($keywords)) { | ||
599 | $foundChunk = $currentChunk; | ||
600 | // Keywords use the format tag1, tag2 multiple words, tag | ||
601 | // So we format them to match Shaarli's separator and glue multiple words with '-' | ||
602 | $keywords = implode(' ', array_map(function($keyword) { | ||
603 | return implode('-', preg_split('/\s+/', trim($keyword))); | ||
604 | }, explode(',', $keywords))); | ||
605 | } | ||
606 | } | ||
607 | |||
608 | // We got everything we want, stop the download. | ||
609 | // If we already found either the title, description or keywords, | ||
610 | // it's highly unlikely that we'll find the other metas further than | |
611 | // in the same chunk of data or the next one. So we also stop the download after that. | ||
612 | if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | ||
613 | && (! $retrieveDescription | ||
614 | || $foundChunk < $currentChunk | ||
615 | || (!empty($title) && !empty($description) && !empty($keywords)) | ||
616 | ) | ||
617 | ) { | ||
618 | return false; | ||
619 | } | ||
620 | |||
621 | return $chunkLength; | ||
622 | }; | ||
623 | } | ||
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..ba9bd40c --- /dev/null +++ b/application/http/MetadataRetriever.php | |||
@@ -0,0 +1,69 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Http; | ||
6 | |||
7 | use Shaarli\Config\ConfigManager; | ||
8 | |||
9 | /** | ||
10 | * HTTP Tool used to extract metadata from external URL (title, description, etc.). | ||
11 | */ | ||
12 | class MetadataRetriever | ||
13 | { | ||
14 | /** @var ConfigManager */ | ||
15 | protected $conf; | ||
16 | |||
17 | /** @var HttpAccess */ | ||
18 | protected $httpAccess; | ||
19 | |||
20 | public function __construct(ConfigManager $conf, HttpAccess $httpAccess) | ||
21 | { | ||
22 | $this->conf = $conf; | ||
23 | $this->httpAccess = $httpAccess; | ||
24 | } | ||
25 | |||
26 | /** | ||
27 | * Retrieve metadata for given URL. | ||
28 | * | ||
29 | * @return array [ | ||
30 | * 'title' => <remote title>, | ||
31 | * 'description' => <remote description>, | ||
32 | * 'tags' => <remote keywords>, | ||
33 | * ] | ||
34 | */ | ||
35 | public function retrieve(string $url): array | ||
36 | { | ||
37 | $charset = null; | ||
38 | $title = null; | ||
39 | $description = null; | ||
40 | $tags = null; | ||
41 | $retrieveDescription = $this->conf->get('general.retrieve_description'); | ||
42 | |||
43 | // Short timeout to keep the application responsive | ||
44 | // The callbacks will fill $charset, $title, $description and $tags with data from the downloaded page. | |
45 | $this->httpAccess->getHttpResponse( | ||
46 | $url, | ||
47 | $this->conf->get('general.download_timeout', 30), | ||
48 | $this->conf->get('general.download_max_size', 4194304), | ||
49 | $this->httpAccess->getCurlHeaderCallback($charset), | ||
50 | $this->httpAccess->getCurlDownloadCallback( | ||
51 | $charset, | ||
52 | $title, | ||
53 | $description, | ||
54 | $tags, | ||
55 | $retrieveDescription | ||
56 | ) | ||
57 | ); | ||
58 | |||
59 | if (!empty($title) && strtolower($charset) !== 'utf-8') { | ||
60 | $title = mb_convert_encoding($title, 'utf-8', $charset); | ||
61 | } | ||
62 | |||
63 | return [ | ||
64 | 'title' => $title, | ||
65 | 'description' => $description, | ||
66 | 'tags' => $tags, | ||
67 | ]; | ||
68 | } | ||
69 | } | ||