diff options
author | ArthurHoaro <arthur@hoa.ro> | 2020-10-20 10:14:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-20 10:14:28 +0200 |
commit | 9b3c1270bcbe4f8e30e0160da8badd43dd94871a (patch) | |
tree | f1d87ed084970cf0c3ef99fef8e4ad6dcea423ce /application | |
parent | 552c3b942afe565b780785eab5b2e29c1e800c2e (diff) | |
parent | 5334090be04e66da5cb5c3ad487604b3733c5cac (diff) | |
download | Shaarli-9b3c1270bcbe4f8e30e0160da8badd43dd94871a.tar.gz Shaarli-9b3c1270bcbe4f8e30e0160da8badd43dd94871a.tar.zst Shaarli-9b3c1270bcbe4f8e30e0160da8badd43dd94871a.zip |
Merge pull request #1567 from ArthurHoaro/feature/async-title-retrieval
Diffstat (limited to 'application')
-rw-r--r-- | application/config/ConfigManager.php | 3 | ||||
-rw-r--r-- | application/container/ContainerBuilder.php | 5 | ||||
-rw-r--r-- | application/container/ShaarliContainer.php | 2 | ||||
-rw-r--r-- | application/front/controller/admin/ManageShaareController.php | 36 | ||||
-rw-r--r-- | application/front/controller/admin/MetadataController.php | 29 | ||||
-rw-r--r-- | application/http/HttpAccess.php | 22 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 123 | ||||
-rw-r--r-- | application/http/MetadataRetriever.php | 69 |
8 files changed, 209 insertions, 80 deletions
diff --git a/application/config/ConfigManager.php b/application/config/ConfigManager.php index 4c98be30..fb085023 100644 --- a/application/config/ConfigManager.php +++ b/application/config/ConfigManager.php | |||
@@ -366,7 +366,8 @@ class ConfigManager | |||
366 | $this->setEmpty('general.links_per_page', 20); | 366 | $this->setEmpty('general.links_per_page', 20); |
367 | $this->setEmpty('general.enabled_plugins', self::$DEFAULT_PLUGINS); | 367 | $this->setEmpty('general.enabled_plugins', self::$DEFAULT_PLUGINS); |
368 | $this->setEmpty('general.default_note_title', 'Note: '); | 368 | $this->setEmpty('general.default_note_title', 'Note: '); |
369 | $this->setEmpty('general.retrieve_description', false); | 369 | $this->setEmpty('general.retrieve_description', true); |
370 | $this->setEmpty('general.enable_async_metadata', true); | ||
370 | 371 | ||
371 | $this->setEmpty('updates.check_updates', false); | 372 | $this->setEmpty('updates.check_updates', false); |
372 | $this->setEmpty('updates.check_updates_branch', 'stable'); | 373 | $this->setEmpty('updates.check_updates_branch', 'stable'); |
diff --git a/application/container/ContainerBuilder.php b/application/container/ContainerBuilder.php index c21d58dd..fd94a1c3 100644 --- a/application/container/ContainerBuilder.php +++ b/application/container/ContainerBuilder.php | |||
@@ -14,6 +14,7 @@ use Shaarli\Front\Controller\Visitor\ErrorController; | |||
14 | use Shaarli\Front\Controller\Visitor\ErrorNotFoundController; | 14 | use Shaarli\Front\Controller\Visitor\ErrorNotFoundController; |
15 | use Shaarli\History; | 15 | use Shaarli\History; |
16 | use Shaarli\Http\HttpAccess; | 16 | use Shaarli\Http\HttpAccess; |
17 | use Shaarli\Http\MetadataRetriever; | ||
17 | use Shaarli\Netscape\NetscapeBookmarkUtils; | 18 | use Shaarli\Netscape\NetscapeBookmarkUtils; |
18 | use Shaarli\Plugin\PluginManager; | 19 | use Shaarli\Plugin\PluginManager; |
19 | use Shaarli\Render\PageBuilder; | 20 | use Shaarli\Render\PageBuilder; |
@@ -90,6 +91,10 @@ class ContainerBuilder | |||
90 | ); | 91 | ); |
91 | }; | 92 | }; |
92 | 93 | ||
94 | $container['metadataRetriever'] = function (ShaarliContainer $container): MetadataRetriever { | ||
95 | return new MetadataRetriever($container->conf, $container->httpAccess); | ||
96 | }; | ||
97 | |||
93 | $container['pageBuilder'] = function (ShaarliContainer $container): PageBuilder { | 98 | $container['pageBuilder'] = function (ShaarliContainer $container): PageBuilder { |
94 | return new PageBuilder( | 99 | return new PageBuilder( |
95 | $container->conf, | 100 | $container->conf, |
diff --git a/application/container/ShaarliContainer.php b/application/container/ShaarliContainer.php index 66e669aa..3a7c238f 100644 --- a/application/container/ShaarliContainer.php +++ b/application/container/ShaarliContainer.php | |||
@@ -10,6 +10,7 @@ use Shaarli\Feed\FeedBuilder; | |||
10 | use Shaarli\Formatter\FormatterFactory; | 10 | use Shaarli\Formatter\FormatterFactory; |
11 | use Shaarli\History; | 11 | use Shaarli\History; |
12 | use Shaarli\Http\HttpAccess; | 12 | use Shaarli\Http\HttpAccess; |
13 | use Shaarli\Http\MetadataRetriever; | ||
13 | use Shaarli\Netscape\NetscapeBookmarkUtils; | 14 | use Shaarli\Netscape\NetscapeBookmarkUtils; |
14 | use Shaarli\Plugin\PluginManager; | 15 | use Shaarli\Plugin\PluginManager; |
15 | use Shaarli\Render\PageBuilder; | 16 | use Shaarli\Render\PageBuilder; |
@@ -35,6 +36,7 @@ use Slim\Container; | |||
35 | * @property History $history | 36 | * @property History $history |
36 | * @property HttpAccess $httpAccess | 37 | * @property HttpAccess $httpAccess |
37 | * @property LoginManager $loginManager | 38 | * @property LoginManager $loginManager |
39 | * @property MetadataRetriever $metadataRetriever | ||
38 | * @property NetscapeBookmarkUtils $netscapeBookmarkUtils | 40 | * @property NetscapeBookmarkUtils $netscapeBookmarkUtils |
39 | * @property callable $notFoundHandler Overrides default Slim exception display | 41 | * @property callable $notFoundHandler Overrides default Slim exception display |
40 | * @property PageBuilder $pageBuilder | 42 | * @property PageBuilder $pageBuilder |
diff --git a/application/front/controller/admin/ManageShaareController.php b/application/front/controller/admin/ManageShaareController.php index bb083486..df2f1631 100644 --- a/application/front/controller/admin/ManageShaareController.php +++ b/application/front/controller/admin/ManageShaareController.php | |||
@@ -53,36 +53,22 @@ class ManageShaareController extends ShaarliAdminController | |||
53 | 53 | ||
54 | // If this is an HTTP(S) link, we try go get the page to extract | 54 | // If this is an HTTP(S) link, we try go get the page to extract |
55 | // the title (otherwise we will to straight to the edit form.) | 55 | // the title (otherwise we will to straight to the edit form.) |
56 | if (empty($title) && strpos(get_url_scheme($url) ?: '', 'http') !== false) { | 56 | if (true !== $this->container->conf->get('general.enable_async_metadata', true) |
57 | $retrieveDescription = $this->container->conf->get('general.retrieve_description'); | 57 | && empty($title) |
58 | // Short timeout to keep the application responsive | 58 | && strpos(get_url_scheme($url) ?: '', 'http') !== false |
59 | // The callback will fill $charset and $title with data from the downloaded page. | 59 | ) { |
60 | $this->container->httpAccess->getHttpResponse( | 60 | $metadata = $this->container->metadataRetriever->retrieve($url); |
61 | $url, | ||
62 | $this->container->conf->get('general.download_timeout', 30), | ||
63 | $this->container->conf->get('general.download_max_size', 4194304), | ||
64 | $this->container->httpAccess->getCurlDownloadCallback( | ||
65 | $charset, | ||
66 | $title, | ||
67 | $description, | ||
68 | $tags, | ||
69 | $retrieveDescription | ||
70 | ) | ||
71 | ); | ||
72 | if (! empty($title) && strtolower($charset) !== 'utf-8' && mb_check_encoding($charset)) { | ||
73 | $title = mb_convert_encoding($title, 'utf-8', $charset); | ||
74 | } | ||
75 | } | 61 | } |
76 | 62 | ||
77 | if (empty($url) && empty($title)) { | 63 | if (empty($url)) { |
78 | $title = $this->container->conf->get('general.default_note_title', t('Note: ')); | 64 | $metadata['title'] = $this->container->conf->get('general.default_note_title', t('Note: ')); |
79 | } | 65 | } |
80 | 66 | ||
81 | $link = [ | 67 | $link = [ |
82 | 'title' => $title, | 68 | 'title' => $title ?? $metadata['title'] ?? '', |
83 | 'url' => $url ?? '', | 69 | 'url' => $url ?? '', |
84 | 'description' => $description ?? '', | 70 | 'description' => $description ?? $metadata['description'] ?? '', |
85 | 'tags' => $tags ?? '', | 71 | 'tags' => $tags ?? $metadata['tags'] ?? '', |
86 | 'private' => $private, | 72 | 'private' => $private, |
87 | ]; | 73 | ]; |
88 | } else { | 74 | } else { |
@@ -352,6 +338,8 @@ class ManageShaareController extends ShaarliAdminController | |||
352 | 'source' => $request->getParam('source') ?? '', | 338 | 'source' => $request->getParam('source') ?? '', |
353 | 'tags' => $tags, | 339 | 'tags' => $tags, |
354 | 'default_private_links' => $this->container->conf->get('privacy.default_private_links', false), | 340 | 'default_private_links' => $this->container->conf->get('privacy.default_private_links', false), |
341 | 'async_metadata' => $this->container->conf->get('general.enable_async_metadata', true), | ||
342 | 'retrieve_description' => $this->container->conf->get('general.retrieve_description', false), | ||
355 | ]); | 343 | ]); |
356 | 344 | ||
357 | $this->executePageHooks('render_editlink', $data, TemplatePage::EDIT_LINK); | 345 | $this->executePageHooks('render_editlink', $data, TemplatePage::EDIT_LINK); |
diff --git a/application/front/controller/admin/MetadataController.php b/application/front/controller/admin/MetadataController.php new file mode 100644 index 00000000..ff845944 --- /dev/null +++ b/application/front/controller/admin/MetadataController.php | |||
@@ -0,0 +1,29 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Front\Controller\Admin; | ||
6 | |||
7 | use Slim\Http\Request; | ||
8 | use Slim\Http\Response; | ||
9 | |||
10 | /** | ||
11 | * Controller used to retrieve/update bookmark's metadata. | ||
12 | */ | ||
13 | class MetadataController extends ShaarliAdminController | ||
14 | { | ||
15 | /** | ||
16 | * GET /admin/metadata/{url} - Attempt to retrieve the bookmark title from provided URL. | ||
17 | */ | ||
18 | public function ajaxRetrieveTitle(Request $request, Response $response): Response | ||
19 | { | ||
20 | $url = $request->getParam('url'); | ||
21 | |||
22 | // Only try to extract metadata from URL with HTTP(s) scheme | ||
23 | if (!empty($url) && strpos(get_url_scheme($url) ?: '', 'http') !== false) { | ||
24 | return $response->withJson($this->container->metadataRetriever->retrieve($url)); | ||
25 | } | ||
26 | |||
27 | return $response->withJson([]); | ||
28 | } | ||
29 | } | ||
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..646a5264 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php | |||
@@ -14,9 +14,14 @@ namespace Shaarli\Http; | |||
14 | */ | 14 | */ |
15 | class HttpAccess | 15 | class HttpAccess |
16 | { | 16 | { |
17 | public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 17 | public function getHttpResponse( |
18 | { | 18 | $url, |
19 | return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); | 19 | $timeout = 30, |
20 | $maxBytes = 4194304, | ||
21 | $curlHeaderFunction = null, | ||
22 | $curlWriteFunction = null | ||
23 | ) { | ||
24 | return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); | ||
20 | } | 25 | } |
21 | 26 | ||
22 | public function getCurlDownloadCallback( | 27 | public function getCurlDownloadCallback( |
@@ -24,16 +29,19 @@ class HttpAccess | |||
24 | &$title, | 29 | &$title, |
25 | &$description, | 30 | &$description, |
26 | &$keywords, | 31 | &$keywords, |
27 | $retrieveDescription, | 32 | $retrieveDescription |
28 | $curlGetInfo = 'curl_getinfo' | ||
29 | ) { | 33 | ) { |
30 | return get_curl_download_callback( | 34 | return get_curl_download_callback( |
31 | $charset, | 35 | $charset, |
32 | $title, | 36 | $title, |
33 | $description, | 37 | $description, |
34 | $keywords, | 38 | $keywords, |
35 | $retrieveDescription, | 39 | $retrieveDescription |
36 | $curlGetInfo | ||
37 | ); | 40 | ); |
38 | } | 41 | } |
42 | |||
43 | public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') | ||
44 | { | ||
45 | return get_curl_header_callback($charset, $curlGetInfo); | ||
46 | } | ||
39 | } | 47 | } |
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php | |||
@@ -6,12 +6,14 @@ use Shaarli\Http\Url; | |||
6 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
7 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
8 | * | 8 | * |
9 | * @param string $url URL to get (http://...) | 9 | * @param string $url URL to get (http://...) |
10 | * @param int $timeout network timeout (in seconds) | 10 | * @param int $timeout network timeout (in seconds) |
11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) | 11 | * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) |
12 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). | 12 | * @param callable|string $curlHeaderFunction Optional callback called during the download of headers |
13 | * Can be used to add download conditions on the | 13 | * (CURLOPT_HEADERFUNCTION) |
14 | * headers (response code, content type, etc.). | 14 | * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). |
15 | * Can be used to add download conditions on the | ||
16 | * headers (response code, content type, etc.). | ||
15 | * | 17 | * |
16 | * @return array HTTP response headers, downloaded content | 18 | * @return array HTTP response headers, downloaded content |
17 | * | 19 | * |
@@ -35,8 +37,13 @@ use Shaarli\Http\Url; | |||
35 | * @see http://stackoverflow.com/q/9183178 | 37 | * @see http://stackoverflow.com/q/9183178 |
36 | * @see http://stackoverflow.com/q/1462720 | 38 | * @see http://stackoverflow.com/q/1462720 |
37 | */ | 39 | */ |
38 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | 40 | function get_http_response( |
39 | { | 41 | $url, |
42 | $timeout = 30, | ||
43 | $maxBytes = 4194304, | ||
44 | $curlHeaderFunction = null, | ||
45 | $curlWriteFunction = null | ||
46 | ) { | ||
40 | $urlObj = new Url($url); | 47 | $urlObj = new Url($url); |
41 | $cleanUrl = $urlObj->idnToAscii(); | 48 | $cleanUrl = $urlObj->idnToAscii(); |
42 | 49 | ||
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
70 | // General cURL settings | 77 | // General cURL settings |
71 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); | 78 | curl_setopt($ch, CURLOPT_AUTOREFERER, true); |
72 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); | 79 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); |
73 | curl_setopt($ch, CURLOPT_HEADER, true); | 80 | // Default header download if the $curlHeaderFunction is not defined |
81 | curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); | ||
74 | curl_setopt( | 82 | curl_setopt( |
75 | $ch, | 83 | $ch, |
76 | CURLOPT_HTTPHEADER, | 84 | CURLOPT_HTTPHEADER, |
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF | |||
81 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); | 89 | curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); |
82 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); | 90 | curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); |
83 | 91 | ||
84 | if (is_callable($curlWriteFunction)) { | ||
85 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
86 | } | ||
87 | |||
88 | // Max download size management | 92 | // Max download size management |
89 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); | 93 | curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); |
90 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); | 94 | curl_setopt($ch, CURLOPT_NOPROGRESS, false); |
95 | if (is_callable($curlHeaderFunction)) { | ||
96 | curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); | ||
97 | } | ||
98 | if (is_callable($curlWriteFunction)) { | ||
99 | curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); | ||
100 | } | ||
91 | curl_setopt( | 101 | curl_setopt( |
92 | $ch, | 102 | $ch, |
93 | CURLOPT_PROGRESSFUNCTION, | 103 | CURLOPT_PROGRESSFUNCTION, |
94 | function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { | 104 | function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { |
95 | if (version_compare(phpversion(), '5.5', '<')) { | 105 | $downloaded = $arg2; |
96 | // PHP version lower than 5.5 | 106 | |
97 | // Callback has 4 arguments | ||
98 | $downloaded = $arg1; | ||
99 | } else { | ||
100 | // Callback has 5 arguments | ||
101 | $downloaded = $arg2; | ||
102 | } | ||
103 | // Non-zero return stops downloading | 107 | // Non-zero return stops downloading |
104 | return ($downloaded > $maxBytes) ? 1 : 0; | 108 | return ($downloaded > $maxBytes) ? 1 : 0; |
105 | } | 109 | } |
@@ -493,6 +497,46 @@ function is_https($server) | |||
493 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | 497 | * Get cURL callback function for CURLOPT_WRITEFUNCTION |
494 | * | 498 | * |
495 | * @param string $charset to extract from the downloaded page (reference) | 499 | * @param string $charset to extract from the downloaded page (reference) |
500 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
501 | * | ||
502 | * @return Closure | ||
503 | */ | ||
504 | function get_curl_header_callback( | ||
505 | &$charset, | ||
506 | $curlGetInfo = 'curl_getinfo' | ||
507 | ) { | ||
508 | $isRedirected = false; | ||
509 | |||
510 | return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { | ||
511 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
512 | $chunkLength = strlen($data); | ||
513 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
514 | $isRedirected = true; | ||
515 | return $chunkLength; | ||
516 | } | ||
517 | if (!empty($responseCode) && $responseCode !== 200) { | ||
518 | return false; | ||
519 | } | ||
520 | // After a redirection, the content type will keep the previous request value | ||
521 | // until it finds the next content-type header. | ||
522 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
523 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
524 | } | ||
525 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
526 | return false; | ||
527 | } | ||
528 | if (!empty($contentType) && empty($charset)) { | ||
529 | $charset = header_extract_charset($contentType); | ||
530 | } | ||
531 | |||
532 | return $chunkLength; | ||
533 | }; | ||
534 | } | ||
535 | |||
536 | /** | ||
537 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | ||
538 | * | ||
539 | * @param string $charset to extract from the downloaded page (reference) | ||
496 | * @param string $title to extract from the downloaded page (reference) | 540 | * @param string $title to extract from the downloaded page (reference) |
497 | * @param string $description to extract from the downloaded page (reference) | 541 | * @param string $description to extract from the downloaded page (reference) |
498 | * @param string $keywords to extract from the downloaded page (reference) | 542 | * @param string $keywords to extract from the downloaded page (reference) |
@@ -506,10 +550,8 @@ function get_curl_download_callback( | |||
506 | &$title, | 550 | &$title, |
507 | &$description, | 551 | &$description, |
508 | &$keywords, | 552 | &$keywords, |
509 | $retrieveDescription, | 553 | $retrieveDescription |
510 | $curlGetInfo = 'curl_getinfo' | ||
511 | ) { | 554 | ) { |
512 | $isRedirected = false; | ||
513 | $currentChunk = 0; | 555 | $currentChunk = 0; |
514 | $foundChunk = null; | 556 | $foundChunk = null; |
515 | 557 | ||
@@ -524,37 +566,18 @@ function get_curl_download_callback( | |||
524 | * | 566 | * |
525 | * @return int|bool length of $data or false if we need to stop the download | 567 | * @return int|bool length of $data or false if we need to stop the download |
526 | */ | 568 | */ |
527 | return function (&$ch, $data) use ( | 569 | return function ($ch, $data) use ( |
528 | $retrieveDescription, | 570 | $retrieveDescription, |
529 | $curlGetInfo, | ||
530 | &$charset, | 571 | &$charset, |
531 | &$title, | 572 | &$title, |
532 | &$description, | 573 | &$description, |
533 | &$keywords, | 574 | &$keywords, |
534 | &$isRedirected, | ||
535 | &$currentChunk, | 575 | &$currentChunk, |
536 | &$foundChunk | 576 | &$foundChunk |
537 | ) { | 577 | ) { |
578 | $chunkLength = strlen($data); | ||
538 | $currentChunk++; | 579 | $currentChunk++; |
539 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | 580 | |
540 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
541 | $isRedirected = true; | ||
542 | return strlen($data); | ||
543 | } | ||
544 | if (!empty($responseCode) && $responseCode !== 200) { | ||
545 | return false; | ||
546 | } | ||
547 | // After a redirection, the content type will keep the previous request value | ||
548 | // until it finds the next content-type header. | ||
549 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
550 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
551 | } | ||
552 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
553 | return false; | ||
554 | } | ||
555 | if (!empty($contentType) && empty($charset)) { | ||
556 | $charset = header_extract_charset($contentType); | ||
557 | } | ||
558 | if (empty($charset)) { | 581 | if (empty($charset)) { |
559 | $charset = html_extract_charset($data); | 582 | $charset = html_extract_charset($data); |
560 | } | 583 | } |
@@ -562,6 +585,10 @@ function get_curl_download_callback( | |||
562 | $title = html_extract_title($data); | 585 | $title = html_extract_title($data); |
563 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | 586 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; |
564 | } | 587 | } |
588 | if (empty($title)) { | ||
589 | $title = html_extract_tag('title', $data); | ||
590 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
591 | } | ||
565 | if ($retrieveDescription && empty($description)) { | 592 | if ($retrieveDescription && empty($description)) { |
566 | $description = html_extract_tag('description', $data); | 593 | $description = html_extract_tag('description', $data); |
567 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | 594 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; |
@@ -591,6 +618,6 @@ function get_curl_download_callback( | |||
591 | return false; | 618 | return false; |
592 | } | 619 | } |
593 | 620 | ||
594 | return strlen($data); | 621 | return $chunkLength; |
595 | }; | 622 | }; |
596 | } | 623 | } |
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..ba9bd40c --- /dev/null +++ b/application/http/MetadataRetriever.php | |||
@@ -0,0 +1,69 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Http; | ||
6 | |||
7 | use Shaarli\Config\ConfigManager; | ||
8 | |||
9 | /** | ||
10 | * HTTP Tool used to extract metadata from external URL (title, description, etc.). | ||
11 | */ | ||
12 | class MetadataRetriever | ||
13 | { | ||
14 | /** @var ConfigManager */ | ||
15 | protected $conf; | ||
16 | |||
17 | /** @var HttpAccess */ | ||
18 | protected $httpAccess; | ||
19 | |||
20 | public function __construct(ConfigManager $conf, HttpAccess $httpAccess) | ||
21 | { | ||
22 | $this->conf = $conf; | ||
23 | $this->httpAccess = $httpAccess; | ||
24 | } | ||
25 | |||
26 | /** | ||
27 | * Retrieve metadata for given URL. | ||
28 | * | ||
29 | * @return array [ | ||
30 | * 'title' => <remote title>, | ||
31 | * 'description' => <remote description>, | ||
32 | * 'tags' => <remote keywords>, | ||
33 | * ] | ||
34 | */ | ||
35 | public function retrieve(string $url): array | ||
36 | { | ||
37 | $charset = null; | ||
38 | $title = null; | ||
39 | $description = null; | ||
40 | $tags = null; | ||
41 | $retrieveDescription = $this->conf->get('general.retrieve_description'); | ||
42 | |||
43 | // Short timeout to keep the application responsive | ||
44 | // The callback will fill $charset and $title with data from the downloaded page. | ||
45 | $this->httpAccess->getHttpResponse( | ||
46 | $url, | ||
47 | $this->conf->get('general.download_timeout', 30), | ||
48 | $this->conf->get('general.download_max_size', 4194304), | ||
49 | $this->httpAccess->getCurlHeaderCallback($charset), | ||
50 | $this->httpAccess->getCurlDownloadCallback( | ||
51 | $charset, | ||
52 | $title, | ||
53 | $description, | ||
54 | $tags, | ||
55 | $retrieveDescription | ||
56 | ) | ||
57 | ); | ||
58 | |||
59 | if (!empty($title) && strtolower($charset) !== 'utf-8') { | ||
60 | $title = mb_convert_encoding($title, 'utf-8', $charset); | ||
61 | } | ||
62 | |||
63 | return [ | ||
64 | 'title' => $title, | ||
65 | 'description' => $description, | ||
66 | 'tags' => $tags, | ||
67 | ]; | ||
68 | } | ||
69 | } | ||