From 4cf3564d28dc8e4d08a3e64f09ad045ffbde97ae Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Fri, 25 Sep 2020 13:29:36 +0200 Subject: Add a setting to retrieve bookmark metadata asynchronously - There is a new standalone script (metadata.js) which requests a new controller to get bookmark metadata and fill the form async - This feature is enabled with the new setting: general.enable_async_metadata (enabled by default) - general.retrieve_description is now enabled by default - A small rotating loader animation has been added to bookmark inputs when metadata is being retrieved (default template) - Custom JS htmlentities has been removed and mathiasbynens/he library is used instead Fixes #1563 --- application/http/MetadataRetriever.php | 68 ++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 application/http/MetadataRetriever.php (limited to 'application/http') diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php new file mode 100644 index 00000000..2ca982e2 --- /dev/null +++ b/application/http/MetadataRetriever.php @@ -0,0 +1,68 @@ +conf = $conf; + $this->httpAccess = $httpAccess; + } + + /** + * Retrieve metadata for given URL. + * + * @return array [ + * 'title' => , + * 'description' => , + * 'tags' => , + * ] + */ + public function retrieve(string $url): array + { + $charset = null; + $title = null; + $description = null; + $tags = null; + $retrieveDescription = $this->conf->get('general.retrieve_description'); + + // Short timeout to keep the application responsive + // The callback will fill $charset and $title with data from the downloaded page. 
+ $this->httpAccess->getHttpResponse( + $url, + $this->conf->get('general.download_timeout', 30), + $this->conf->get('general.download_max_size', 4194304), + $this->httpAccess->getCurlDownloadCallback( + $charset, + $title, + $description, + $tags, + $retrieveDescription + ) + ); + + if (!empty($title) && strtolower($charset) !== 'utf-8') { + $title = mb_convert_encoding($title, 'utf-8', $charset); + } + + return [ + 'title' => $title, + 'description' => $description, + 'tags' => $tags, + ]; + } +} -- cgit v1.2.3 From 5334090be04e66da5cb5c3ad487604b3733c5cac Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 15 Oct 2020 11:20:33 +0200 Subject: Improve metadata retrieval (performances and accuracy) - Use dedicated function to download headers to avoid apply multiple regexps on headers - Also try to extract title from meta tags --- application/http/HttpAccess.php | 22 ++++-- application/http/HttpUtils.php | 123 ++++++++++++++++++++------------- application/http/MetadataRetriever.php | 1 + 3 files changed, 91 insertions(+), 55 deletions(-) (limited to 'application/http') diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 81d9e076..646a5264 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php @@ -14,9 +14,14 @@ namespace Shaarli\Http; */ class HttpAccess { - public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) - { - return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); + public function getHttpResponse( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null + ) { + return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction); } public function getCurlDownloadCallback( @@ -24,16 +29,19 @@ class HttpAccess &$title, &$description, &$keywords, - $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $retrieveDescription ) { return get_curl_download_callback( 
$charset, $title, $description, $keywords, - $retrieveDescription, - $curlGetInfo + $retrieveDescription ); } + + public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo') + { + return get_curl_header_callback($charset, $curlGetInfo); + } } diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 9f414073..28c12969 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -6,12 +6,14 @@ use Shaarli\Http\Url; * GET an HTTP URL to retrieve its content * Uses the cURL library or a fallback method * - * @param string $url URL to get (http://...) - * @param int $timeout network timeout (in seconds) - * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) - * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). - * Can be used to add download conditions on the - * headers (response code, content type, etc.). + * @param string $url URL to get (http://...) + * @param int $timeout network timeout (in seconds) + * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) + * @param callable|string $curlHeaderFunction Optional callback called during the download of headers + * (CURLOPT_HEADERFUNCTION) + * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). + * Can be used to add download conditions on the + * headers (response code, content type, etc.). 
* * @return array HTTP response headers, downloaded content * @@ -35,8 +37,13 @@ use Shaarli\Http\Url; * @see http://stackoverflow.com/q/9183178 * @see http://stackoverflow.com/q/1462720 */ -function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) -{ +function get_http_response( + $url, + $timeout = 30, + $maxBytes = 4194304, + $curlHeaderFunction = null, + $curlWriteFunction = null +) { $urlObj = new Url($url); $cleanUrl = $urlObj->idnToAscii(); @@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF // General cURL settings curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_HEADER, true); + // Default header download if the $curlHeaderFunction is not defined + curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction)); curl_setopt( $ch, CURLOPT_HTTPHEADER, @@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); - if (is_callable($curlWriteFunction)) { - curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); - } - // Max download size management curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); + if (is_callable($curlHeaderFunction)) { + curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); + } + if (is_callable($curlWriteFunction)) { + curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction); + } curl_setopt( $ch, CURLOPT_PROGRESSFUNCTION, - function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { - if (version_compare(phpversion(), '5.5', '<')) { - // PHP version lower than 5.5 - // Callback has 4 arguments - $downloaded = $arg1; - } else { - // Callback has 5 arguments - $downloaded = $arg2; - } + function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) { + $downloaded = $arg2; + // Non-zero 
return stops downloading return ($downloaded > $maxBytes) ? 1 : 0; } @@ -489,6 +493,46 @@ function is_https($server) return ! empty($server['HTTPS']); } +/** + * Get cURL callback function for CURLOPT_HEADERFUNCTION + * + * @param string $charset to extract from the response Content-Type header (reference) + * @param string $curlGetInfo Optionally overrides curl_getinfo function + * + * @return Closure + */ +function get_curl_header_callback( + &$charset, + $curlGetInfo = 'curl_getinfo' +) { + $isRedirected = false; + + return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) { + $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); + $chunkLength = strlen($data); + if (!empty($responseCode) && in_array($responseCode, [301, 302])) { + $isRedirected = true; + return $chunkLength; + } + if (!empty($responseCode) && $responseCode !== 200) { + return false; + } + // After a redirection, the content type will keep the previous request value + // until it finds the next content-type header. + if (! 
$isRedirected || strpos(strtolower($data), 'content-type') !== false) { + $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); + } + if (!empty($contentType) && strpos($contentType, 'text/html') === false) { + return false; + } + if (!empty($contentType) && empty($charset)) { + $charset = header_extract_charset($contentType); + } + + return $chunkLength; + }; +} + /** * Get cURL callback function for CURLOPT_WRITEFUNCTION * @@ -506,10 +550,8 @@ function get_curl_download_callback( &$title, &$description, &$keywords, - $retrieveDescription, - $curlGetInfo = 'curl_getinfo' + $retrieveDescription ) { - $isRedirected = false; $currentChunk = 0; $foundChunk = null; @@ -524,37 +566,18 @@ function get_curl_download_callback( * * @return int|bool length of $data or false if we need to stop the download */ - return function (&$ch, $data) use ( + return function ($ch, $data) use ( $retrieveDescription, - $curlGetInfo, &$charset, &$title, &$description, &$keywords, - &$isRedirected, &$currentChunk, &$foundChunk ) { + $chunkLength = strlen($data); $currentChunk++; - $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); - if (!empty($responseCode) && in_array($responseCode, [301, 302])) { - $isRedirected = true; - return strlen($data); - } - if (!empty($responseCode) && $responseCode !== 200) { - return false; - } - // After a redirection, the content type will keep the previous request value - // until it finds the next content-type header. - if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { - $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); - } - if (!empty($contentType) && strpos($contentType, 'text/html') === false) { - return false; - } - if (!empty($contentType) && empty($charset)) { - $charset = header_extract_charset($contentType); - } + if (empty($charset)) { $charset = html_extract_charset($data); } @@ -562,6 +585,10 @@ function get_curl_download_callback( $title = html_extract_title($data); $foundChunk = ! empty($title) ? 
$currentChunk : $foundChunk; } + if (empty($title)) { + $title = html_extract_tag('title', $data); + $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; + } if ($retrieveDescription && empty($description)) { $description = html_extract_tag('description', $data); $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; @@ -591,6 +618,6 @@ function get_curl_download_callback( return false; } - return strlen($data); + return $chunkLength; }; } diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php index 2ca982e2..ba9bd40c 100644 --- a/application/http/MetadataRetriever.php +++ b/application/http/MetadataRetriever.php @@ -46,6 +46,7 @@ class MetadataRetriever $url, $this->conf->get('general.download_timeout', 30), $this->conf->get('general.download_max_size', 4194304), + $this->httpAccess->getCurlHeaderCallback($charset), $this->httpAccess->getCurlDownloadCallback( $charset, $title, -- cgit v1.2.3 From b3bd8c3e8d367975980043e772f7cd78b7f96bc6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Thu, 22 Oct 2020 16:21:03 +0200 Subject: Feature: support any tag separator So it allows to have multiple words tags. Breaking change: commas ',' are no longer a default separator. 
Fixes #594 --- application/http/HttpAccess.php | 6 ++++-- application/http/HttpUtils.php | 12 +++++++----- application/http/MetadataRetriever.php | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'application/http') diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php index 646a5264..e80e0c01 100644 --- a/application/http/HttpAccess.php +++ b/application/http/HttpAccess.php @@ -29,14 +29,16 @@ class HttpAccess &$title, &$description, &$keywords, - $retrieveDescription + $retrieveDescription, + $tagsSeparator ) { return get_curl_download_callback( $charset, $title, $description, $keywords, - $retrieveDescription + $retrieveDescription, + $tagsSeparator ); } diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 28c12969..ed1002b0 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -550,7 +550,8 @@ function get_curl_download_callback( &$title, &$description, &$keywords, - $retrieveDescription + $retrieveDescription, + $tagsSeparator ) { $currentChunk = 0; $foundChunk = null; @@ -568,6 +569,7 @@ function get_curl_download_callback( */ return function ($ch, $data) use ( $retrieveDescription, + $tagsSeparator, &$charset, &$title, &$description, @@ -598,10 +600,10 @@ function get_curl_download_callback( if (! empty($keywords)) { $foundChunk = $currentChunk; // Keywords use the format tag1, tag2 multiple words, tag - // So we format them to match Shaarli's separator and glue multiple words with '-' - $keywords = implode(' ', array_map(function($keyword) { - return implode('-', preg_split('/\s+/', trim($keyword))); - }, explode(',', $keywords))); + // So we split the result with `,`, then if a tag contains the separator we replace it by `-`. 
+ $keywords = tags_array2str(array_map(function(string $keyword) use ($tagsSeparator): string { + return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-'); + }, tags_str2array($keywords, ',')), $tagsSeparator); } } diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php index ba9bd40c..2e1401ec 100644 --- a/application/http/MetadataRetriever.php +++ b/application/http/MetadataRetriever.php @@ -38,7 +38,6 @@ class MetadataRetriever $title = null; $description = null; $tags = null; - $retrieveDescription = $this->conf->get('general.retrieve_description'); // Short timeout to keep the application responsive // The callback will fill $charset and $title with data from the downloaded page. @@ -52,7 +51,8 @@ class MetadataRetriever $title, $description, $tags, - $retrieveDescription + $this->conf->get('general.retrieve_description'), + $this->conf->get('general.tags_separator', ' ') ) ); -- cgit v1.2.3 From 53054b2bf6a919fd4ff9b44b6ad1986f21f488b6 Mon Sep 17 00:00:00 2001 From: ArthurHoaro Date: Tue, 22 Sep 2020 20:25:47 +0200 Subject: Apply PHP Code Beautifier on source code for linter automatic fixes --- application/http/HttpUtils.php | 73 ++++++++++++++++++++++++------------------ application/http/Url.php | 10 +++--- application/http/UrlUtils.php | 11 ++++--- 3 files changed, 53 insertions(+), 41 deletions(-) (limited to 'application/http') diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index ed1002b0..4bde1d5b 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php @@ -48,7 +48,7 @@ function get_http_response( $cleanUrl = $urlObj->idnToAscii(); if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) { - return array(array(0 => 'Invalid HTTP UrlUtils'), false); + return [[0 => 'Invalid HTTP UrlUtils'], false]; } $userAgent = @@ -71,7 +71,7 @@ function get_http_response( $ch = curl_init($cleanUrl); if ($ch === false) { - return array(array(0 => 
'curl_init() error'), false); + return [[0 => 'curl_init() error'], false]; } // General cURL settings @@ -82,7 +82,7 @@ function get_http_response( curl_setopt( $ch, CURLOPT_HTTPHEADER, - array('Accept-Language: ' . $acceptLanguage) + ['Accept-Language: ' . $acceptLanguage] ); curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); @@ -90,7 +90,7 @@ function get_http_response( curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); // Max download size management - curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); + curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16); curl_setopt($ch, CURLOPT_NOPROGRESS, false); if (is_callable($curlHeaderFunction)) { curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction); @@ -122,9 +122,9 @@ function get_http_response( * Removing this would require updating * GetHttpUrlTest::testGetInvalidRemoteUrl() */ - return array(false, false); + return [false, false]; } - return array(array(0 => 'curl_exec() error: ' . $errorStr), false); + return [[0 => 'curl_exec() error: ' . 
$errorStr], false]; } // Formatting output like the fallback method @@ -135,7 +135,7 @@ function get_http_response( $rawHeadersLastRedir = end($rawHeadersArrayRedirs); $content = substr($response, $headSize); - $headers = array(); + $headers = []; foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) { if (empty($line) || ctype_space($line)) { continue; @@ -146,7 +146,7 @@ function get_http_response( $value = $splitLine[1]; if (array_key_exists($key, $headers)) { if (!is_array($headers[$key])) { - $headers[$key] = array(0 => $headers[$key]); + $headers[$key] = [0 => $headers[$key]]; } $headers[$key][] = $value; } else { @@ -157,7 +157,7 @@ function get_http_response( } } - return array($headers, $content); + return [$headers, $content]; } /** @@ -188,15 +188,15 @@ function get_http_response_fallback( $acceptLanguage, $maxRedr ) { - $options = array( - 'http' => array( + $options = [ + 'http' => [ 'method' => 'GET', 'timeout' => $timeout, 'user_agent' => $userAgent, 'header' => "Accept: */*\r\n" . 'Accept-Language: ' . $acceptLanguage - ) - ); + ] + ]; stream_context_set_default($options); list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr); @@ -207,7 +207,7 @@ function get_http_response_fallback( } if (! $headers) { - return array($headers, false); + return [$headers, false]; } try { @@ -215,10 +215,10 @@ function get_http_response_fallback( $context = stream_context_create($options); $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); } catch (Exception $exc) { - return array(array(0 => 'HTTP Error'), $exc->getMessage()); + return [[0 => 'HTTP Error'], $exc->getMessage()]; } - return array($headers, $content); + return [$headers, $content]; } /** @@ -237,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3) } // Headers found, redirection found, and limit not reached. 
- if ($redirectionLimit-- > 0 + if ( + $redirectionLimit-- > 0 && !empty($headers) && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false) - && !empty($headers['Location'])) { + && !empty($headers['Location']) + ) { $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; if ($redirection != $url) { $redirection = getAbsoluteUrl($url, $redirection); @@ -248,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3) } } - return array($headers, $url); + return [$headers, $url]; } /** @@ -270,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl) } $parts = parse_url($originalUrl); - $final = $parts['scheme'] .'://'. $parts['host']; + $final = $parts['scheme'] . '://' . $parts['host']; $final .= (!empty($parts['port'])) ? $parts['port'] : ''; $final .= '/'; if ($newUrl[0] != '/') { @@ -323,7 +325,8 @@ function server_url($server) $scheme = 'https'; } - if (($scheme == 'http' && $port != '80') + if ( + ($scheme == 'http' && $port != '80') || ($scheme == 'https' && $port != '443') ) { $port = ':' . $port; @@ -344,22 +347,26 @@ function server_url($server) $host = $server['SERVER_NAME']; } - return $scheme.'://'.$host.$port; + return $scheme . '://' . $host . $port; } // SSL detection - if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') - || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) { + if ( + (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on') + || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443') + ) { $scheme = 'https'; } // Do not append standard port values - if (($scheme == 'http' && $server['SERVER_PORT'] != '80') - || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) { - $port = ':'.$server['SERVER_PORT']; + if ( + ($scheme == 'http' && $server['SERVER_PORT'] != '80') + || ($scheme == 'https' && $server['SERVER_PORT'] != '443') + ) { + $port = ':' . 
$server['SERVER_PORT']; } - return $scheme.'://'.$server['SERVER_NAME'].$port; + return $scheme . '://' . $server['SERVER_NAME'] . $port; } /** @@ -567,7 +574,10 @@ function get_curl_download_callback( * * @return int|bool length of $data or false if we need to stop the download */ - return function ($ch, $data) use ( + return function ( + $ch, + $data + ) use ( $retrieveDescription, $tagsSeparator, &$charset, @@ -601,7 +611,7 @@ function get_curl_download_callback( $foundChunk = $currentChunk; // Keywords use the format tag1, tag2 multiple words, tag // So we split the result with `,`, then if a tag contains the separator we replace it by `-`. - $keywords = tags_array2str(array_map(function(string $keyword) use ($tagsSeparator): string { + $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string { return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-'); }, tags_str2array($keywords, ',')), $tagsSeparator); } @@ -611,7 +621,8 @@ function get_curl_download_callback( // If we already found either the title, description or keywords, // it's highly unlikely that we'll found the other metas further than // in the same chunk of data or the next one. So we also stop the download after that. - if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null + if ( + (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null && (! 
$retrieveDescription || $foundChunk < $currentChunk || (!empty($title) && !empty($description) && !empty($keywords)) diff --git a/application/http/Url.php b/application/http/Url.php index 90444a2f..fe87088f 100644 --- a/application/http/Url.php +++ b/application/http/Url.php @@ -17,7 +17,7 @@ namespace Shaarli\Http; */ class Url { - private static $annoyingQueryParams = array( + private static $annoyingQueryParams = [ // Facebook 'action_object_map=', 'action_ref_map=', @@ -37,15 +37,15 @@ class Url // Other 'campaign_' - ); + ]; - private static $annoyingFragments = array( + private static $annoyingFragments = [ // ATInternet 'xtor=RSS-', // Misc. 'tk.rss_all' - ); + ]; /* * URL parts represented as an array @@ -120,7 +120,7 @@ class Url foreach (self::$annoyingQueryParams as $annoying) { foreach ($queryParams as $param) { if (startsWith($param, $annoying)) { - $queryParams = array_diff($queryParams, array($param)); + $queryParams = array_diff($queryParams, [$param]); continue; } } diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php index e8d1a283..de5b7db1 100644 --- a/application/http/UrlUtils.php +++ b/application/http/UrlUtils.php @@ -1,4 +1,5 @@