diff options
author | ArthurHoaro <arthur@hoa.ro> | 2020-10-15 11:20:33 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2020-10-15 11:36:56 +0200 |
commit | 5334090be04e66da5cb5c3ad487604b3733c5cac (patch) | |
tree | 419217a587c2a15bc97b943acee11fcf7b559937 /application | |
parent | 4cf3564d28dc8e4d08a3e64f09ad045ffbde97ae (diff) | |
download | Shaarli-5334090be04e66da5cb5c3ad487604b3733c5cac.tar.gz Shaarli-5334090be04e66da5cb5c3ad487604b3733c5cac.tar.zst Shaarli-5334090be04e66da5cb5c3ad487604b3733c5cac.zip |
Improve metadata retrieval (performance and accuracy)
- Use a dedicated function to download headers, to avoid applying multiple regexps to headers
- Also try to extract title from meta tags
Diffstat (limited to 'application')
-rw-r--r-- | application/http/HttpAccess.php | 22 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 123 | ||||
-rw-r--r-- | application/http/MetadataRetriever.php | 1 |
3 files changed, 91 insertions, 55 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..646a5264 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php | |||
@@ -14,9 +14,14 @@ namespace Shaarli\Http; | |||
14 | */ | 14 | */ |
15 | class HttpAccess | 15 | class HttpAccess |
16 | { | 16 | { |
17 | public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 17 | public function getHttpResponse( |
18 | { | 18 | $url, |
19 | return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); | 19 | $timeout = 30, |
20 | $maxBytes = 4194304, | ||
21 | $curlHeaderFunction = null, | ||
22 | $curlWriteFunction = null | ||
23 | ) { | ||
24 | return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); | ||
20 | } | 25 | } |
21 | 26 | ||
22 | public function getCurlDownloadCallback( | 27 | public function getCurlDownloadCallback( |
@@ -24,16 +29,19 @@ class HttpAccess | |||
24 | &$title, | 29 | &$title, |
25 | &$description, | 30 | &$description, |
26 | &$keywords, | 31 | &$keywords, |
27 | $retrieveDescription, | 32 | $retrieveDescription |
28 | $curlGetInfo = 'curl_getinfo' | ||
29 | ) { | 33 | ) { |
30 | return get_curl_download_callback( | 34 | return get_curl_download_callback( |
31 | $charset, | 35 | $charset, |
32 | $title, | 36 | $title, |
33 | $description, | 37 | $description, |
34 | $keywords, | 38 | $keywords, |
35 | $retrieveDescription, | 39 | $retrieveDescription |
36 | $curlGetInfo | ||
37 | ); | 40 | ); |
38 | } | 41 | } |
42 | |||
43 | public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') | ||
44 | { | ||
45 | return get_curl_header_callback($charset, $curlGetInfo); | ||
46 | } | ||
39 | } | 47 | } |
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php | |||
@@ -6,12 +6,14 @@ use Shaarli\Http\Url; | |||
6 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
7 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
8 | * | 8 | * |
9 | * @param string $url URL to get (http://...) | 9 | * @param string $url URL to get (http://...) |
10 | * @param int $timeout network timeout (in seconds) | 10 | * @param int $timeout network timeout (in seconds) |
11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) | 11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) |
12 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). | 12 | * @param callable|string $curlHeaderFunction Optional callback called during the download of headers |
13 | * Can be used to add download conditions on the | 13 | * (CURLOPT_HEADERFUNCTION) |
14 | * headers (response code, content type, etc.). | 14 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). |
15 | * Can be used to add download conditions on the | ||
16 | * headers (response code, content type, etc.). | ||
15 | * | 17 | * |
16 | * @return array HTTP response headers, downloaded content | 18 | * @return array HTTP response headers, downloaded content |
17 | * | 19 | * |
@@ -35,8 +37,13 @@ use Shaarli\Http\Url; | |||
35 | * @see http://stackoverflow.com/q/9183178 | 37 | * @see http://stackoverflow.com/q/9183178 |
36 | * @see http://stackoverflow.com/q/1462720 | 38 | * @see http://stackoverflow.com/q/1462720 |
37 | */ | 39 | */ |
38 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 40 | function get_http_response( |
39 | { | 41 | $url, |
42 | $timeout = 30, | ||
43 | $maxBytes = 4194304, | ||
44 | $curlHeaderFunction = null, | ||
45 | $curlWriteFunction = null | ||
46 | ) { | ||
40 | $urlObj = new Url($url); | 47 | $urlObj = new Url($url); |
41 | $cleanUrl = $urlObj->idnToAscii(); | 48 | $cleanUrl = $urlObj->idnToAscii(); |
42 | 49 | ||
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
70 | // General cURL settings | 77 | // General cURL settings |
71 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); | 78 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); |
72 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); | 79 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); |
73 | curl_setopt($ch, CURLOPT_HEADER, true); | 80 | // Default header download if the $curlHeaderFunction is not defined |
81 | curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); | ||
74 | curl_setopt( | 82 | curl_setopt( |
75 | $ch, | 83 | $ch, |
76 | CURLOPT_HTTPHEADER, | 84 | CURLOPT_HTTPHEADER, |
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
81 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | 89 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); |
82 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | 90 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); |
83 | 91 | ||
84 | if (is_callable($curlWriteFunction)) { | ||
85 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
86 | } | ||
87 | |||
88 | // Max download size management | 92 | // Max download size management |
89 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); | 93 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); |
90 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | 94 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); |
95 | if (is_callable($curlHeaderFunction)) { | ||
96 | curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); | ||
97 | } | ||
98 | if (is_callable($curlWriteFunction)) { | ||
99 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
100 | } | ||
91 | curl_setopt( | 101 | curl_setopt( |
92 | $ch, | 102 | $ch, |
93 | CURLOPT_PROGRESSFUNCTION, | 103 | CURLOPT_PROGRESSFUNCTION, |
94 | function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { | 104 | function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { |
95 | if (version_compare(phpversion(), '5.5', '<')) { | 105 | $downloaded = $arg2; |
96 | // PHP version lower than 5.5 | 106 | |
97 | // Callback has 4 arguments | ||
98 | $downloaded = $arg1; | ||
99 | } else { | ||
100 | // Callback has 5 arguments | ||
101 | $downloaded = $arg2; | ||
102 | } | ||
103 | // Non-zero return stops downloading | 107 | // Non-zero return stops downloading |
104 | return ($downloaded > $maxBytes) ? 1 : 0; | 108 | return ($downloaded > $maxBytes) ? 1 : 0; |
105 | } | 109 | } |
@@ -493,6 +497,46 @@ function is_https($server) | |||
493 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | 497 | * Get cURL callback function for CURLOPT_WRITEFUNCTION |
494 | * | 498 | * |
495 | * @param string $charset to extract from the downloaded page (reference) | 499 | * @param string $charset to extract from the downloaded page (reference) |
500 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
501 | * | ||
502 | * @return Closure | ||
503 | */ | ||
504 | function get_curl_header_callback( | ||
505 | &$charset, | ||
506 | $curlGetInfo = 'curl_getinfo' | ||
507 | ) { | ||
508 | $isRedirected = false; | ||
509 | |||
510 | return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { | ||
511 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
512 | $chunkLength = strlen($data); | ||
513 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
514 | $isRedirected = true; | ||
515 | return $chunkLength; | ||
516 | } | ||
517 | if (!empty($responseCode) && $responseCode !== 200) { | ||
518 | return false; | ||
519 | } | ||
520 | // After a redirection, the content type will keep the previous request value | ||
521 | // until it finds the next content-type header. | ||
522 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
523 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
524 | } | ||
525 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
526 | return false; | ||
527 | } | ||
528 | if (!empty($contentType) && empty($charset)) { | ||
529 | $charset = header_extract_charset($contentType); | ||
530 | } | ||
531 | |||
532 | return $chunkLength; | ||
533 | }; | ||
534 | } | ||
535 | |||
536 | /** | ||
537 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | ||
538 | * | ||
539 | * @param string $charset to extract from the downloaded page (reference) | ||
496 | * @param string $title to extract from the downloaded page (reference) | 540 | * @param string $title to extract from the downloaded page (reference) |
497 | * @param string $description to extract from the downloaded page (reference) | 541 | * @param string $description to extract from the downloaded page (reference) |
498 | * @param string $keywords to extract from the downloaded page (reference) | 542 | * @param string $keywords to extract from the downloaded page (reference) |
@@ -506,10 +550,8 @@ function get_curl_download_callback( | |||
506 | &$title, | 550 | &$title, |
507 | &$description, | 551 | &$description, |
508 | &$keywords, | 552 | &$keywords, |
509 | $retrieveDescription, | 553 | $retrieveDescription |
510 | $curlGetInfo = 'curl_getinfo' | ||
511 | ) { | 554 | ) { |
512 | $isRedirected = false; | ||
513 | $currentChunk = 0; | 555 | $currentChunk = 0; |
514 | $foundChunk = null; | 556 | $foundChunk = null; |
515 | 557 | ||
@@ -524,37 +566,18 @@ function get_curl_download_callback( | |||
524 | * | 566 | * |
525 | * @return int|bool length of $data or false if we need to stop the download | 567 | * @return int|bool length of $data or false if we need to stop the download |
526 | */ | 568 | */ |
527 | return function (&$ch, $data) use ( | 569 | return function ($ch, $data) use ( |
528 | $retrieveDescription, | 570 | $retrieveDescription, |
529 | $curlGetInfo, | ||
530 | &$charset, | 571 | &$charset, |
531 | &$title, | 572 | &$title, |
532 | &$description, | 573 | &$description, |
533 | &$keywords, | 574 | &$keywords, |
534 | &$isRedirected, | ||
535 | &$currentChunk, | 575 | &$currentChunk, |
536 | &$foundChunk | 576 | &$foundChunk |
537 | ) { | 577 | ) { |
578 | $chunkLength = strlen($data); | ||
538 | $currentChunk++; | 579 | $currentChunk++; |
539 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | 580 | |
540 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
541 | $isRedirected = true; | ||
542 | return strlen($data); | ||
543 | } | ||
544 | if (!empty($responseCode) && $responseCode !== 200) { | ||
545 | return false; | ||
546 | } | ||
547 | // After a redirection, the content type will keep the previous request value | ||
548 | // until it finds the next content-type header. | ||
549 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
550 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
551 | } | ||
552 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
553 | return false; | ||
554 | } | ||
555 | if (!empty($contentType) && empty($charset)) { | ||
556 | $charset = header_extract_charset($contentType); | ||
557 | } | ||
558 | if (empty($charset)) { | 581 | if (empty($charset)) { |
559 | $charset = html_extract_charset($data); | 582 | $charset = html_extract_charset($data); |
560 | } | 583 | } |
@@ -562,6 +585,10 @@ function get_curl_download_callback( | |||
562 | $title = html_extract_title($data); | 585 | $title = html_extract_title($data); |
563 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | 586 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; |
564 | } | 587 | } |
588 | if (empty($title)) { | ||
589 | $title = html_extract_tag('title', $data); | ||
590 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
591 | } | ||
565 | if ($retrieveDescription && empty($description)) { | 592 | if ($retrieveDescription && empty($description)) { |
566 | $description = html_extract_tag('description', $data); | 593 | $description = html_extract_tag('description', $data); |
567 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | 594 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; |
@@ -591,6 +618,6 @@ function get_curl_download_callback( | |||
591 | return false; | 618 | return false; |
592 | } | 619 | } |
593 | 620 | ||
594 | return strlen($data); | 621 | return $chunkLength; |
595 | }; | 622 | }; |
596 | } | 623 | } |
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php index 2ca982e2..ba9bd40c 100644 --- a/application/http/MetadataRetriever.php +++ b/application/http/MetadataRetriever.php | |||
@@ -46,6 +46,7 @@ class MetadataRetriever | |||
46 | $url, | 46 | $url, |
47 | $this->conf->get('general.download_timeout', 30), | 47 | $this->conf->get('general.download_timeout', 30), |
48 | $this->conf->get('general.download_max_size', 4194304), | 48 | $this->conf->get('general.download_max_size', 4194304), |
49 | $this->httpAccess->getCurlHeaderCallback($charset), | ||
49 | $this->httpAccess->getCurlDownloadCallback( | 50 | $this->httpAccess->getCurlDownloadCallback( |
50 | $charset, | 51 | $charset, |
51 | $title, | 52 | $title, |