Merge pull request #2 from shaarli/master

Merge fork source
author: yude <yudesleepy@gmail.com> 2021-01-04 18:51:10 +0900
committer: GitHub <noreply@github.com> 2021-01-04 18:51:10 +0900
commit: e6754f2154a79abd8e5e64bd923f6984aa9ad44b (patch)
tree: f074119530bb59ef155938ea367f719f1e4b70f1 /application/http
parent: 5256b4287021342a9f8868967b2a77e481314331 (diff)
parent: ed4ee8f0297941ac83300389b7de6a293312d20e (diff)
download: Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.gz
Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.zst
Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.zip
5 files changed, 219 insertions, 94 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
index 81d9e076..e80e0c01 100644
--- a/application/http/HttpAccess.php
+++ b/application/http/HttpAccess.php
@@ -14,9 +14,14 @@ namespace Shaarli\Http;
 */
 class HttpAccess
 {
-    public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
+    public function getHttpResponse(
-    {
+        $url,
-        return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
+        $timeout = 30,
+        $maxBytes = 4194304,
+        $curlHeaderFunction = null,
+        $curlWriteFunction = null
+    ) {
+        return get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);
    }
    public function getCurlDownloadCallback(
@@ -25,7 +30,7 @@ class HttpAccess
        &$description,
        &$keywords,
        $retrieveDescription,
-        $curlGetInfo = 'curl_getinfo'
+        $tagsSeparator
    ) {
        return get_curl_download_callback(
            $charset,
@@ -33,7 +38,12 @@ class HttpAccess
            $description,
            $keywords,
            $retrieveDescription,
-            $curlGetInfo
+            $tagsSeparator
        );
    }
+    /**
+     * Thin wrapper around the global get_curl_header_callback() helper.
+     *
+     * @param string          $charset     Passed by reference to the helper
+     * @param callable|string $curlGetInfo Forwarded to the helper; defaults to 'curl_getinfo'
+     *
+     * @return mixed Whatever get_curl_header_callback() returns
+     */
+    public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
+    {
+        return get_curl_header_callback($charset, $curlGetInfo);
+    }
 }
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 9f414073..4bde1d5b 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method
 *
- * @param string          $url               URL to get (http://...)
+ * @param string          $url                URL to get (http://...)
- * @param int             $timeout           network timeout (in seconds)
+ * @param int             $timeout            network timeout (in seconds)
- * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
+ * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
- *                                           Can be used to add download conditions on the
+ *                                            (CURLOPT_HEADERFUNCTION)
- *                                           headers (response code, content type, etc.).
+ * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                            Can be used to add download conditions on the
+ *                                            headers (response code, content type, etc.).
 *
 * @return array HTTP response headers, downloaded content
 *
@@ -35,13 +37,18 @@ use Shaarli\Http\Url;
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
+function get_http_response(
-{
+    $url,
+    $timeout = 30,
+    $maxBytes = 4194304,
+    $curlHeaderFunction = null,
+    $curlWriteFunction = null
+) {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();
    if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
-        return array(array(0 => 'Invalid HTTP UrlUtils'), false);
+        return [[0 => 'Invalid HTTP UrlUtils'], false];
    }
    $userAgent =
@@ -64,42 +71,39 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    $ch = curl_init($cleanUrl);
    if ($ch === false) {
-        return array(array(0 => 'curl_init() error'), false);
+        return [[0 => 'curl_init() error'], false];
    }
    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
-    curl_setopt($ch, CURLOPT_HEADER, true);
+    // Default header download if the $curlHeaderFunction is not defined
+    curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
-        array('Accept-Language: ' . $acceptLanguage)
+        ['Accept-Language: ' . $acceptLanguage]
    );
    curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
+    // Max download size management
+    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
+    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+    if (is_callable($curlHeaderFunction)) {
+        curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+    }
    if (is_callable($curlWriteFunction)) {
        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
    }
-    // Max download size management
-    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
-    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
    curl_setopt(
        $ch,
        CURLOPT_PROGRESSFUNCTION,
-        function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
+        function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
-            if (version_compare(phpversion(), '5.5', '<')) {
+            $downloaded = $arg2;
-                // PHP version lower than 5.5
-                // Callback has 4 arguments
-                $downloaded = $arg1;
-            } else {
-                // Callback has 5 arguments
-                $downloaded = $arg2;
-            }
            // Non-zero return stops downloading
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
@@ -118,9 +122,9 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
             * Removing this would require updating
             * GetHttpUrlTest::testGetInvalidRemoteUrl()
             */
-            return array(false, false);
+            return [false, false];
        }
-        return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
+        return [[0 => 'curl_exec() error: ' . $errorStr], false];
    }
    // Formatting output like the fallback method
@@ -131,7 +135,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
    $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
    $content = substr($response, $headSize);
-    $headers = array();
+    $headers = [];
    foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
        if (empty($line) || ctype_space($line)) {
            continue;
@@ -142,7 +146,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
            $value = $splitLine[1];
            if (array_key_exists($key, $headers)) {
                if (!is_array($headers[$key])) {
-                    $headers[$key] = array(0 => $headers[$key]);
+                    $headers[$key] = [0 => $headers[$key]];
                }
                $headers[$key][] = $value;
            } else {
@@ -153,7 +157,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
        }
    }
-    return array($headers, $content);
+    return [$headers, $content];
 }
 /**
@@ -184,15 +188,15 @@ function get_http_response_fallback(
    $acceptLanguage,
    $maxRedr
 ) {
-    $options = array(
+    $options = [
-        'http' => array(
+        'http' => [
            'method' => 'GET',
            'timeout' => $timeout,
            'user_agent' => $userAgent,
            'header' => "Accept: */*\r\n"
                . 'Accept-Language: ' . $acceptLanguage
-        )
+        ]
-    );
+    ];
    stream_context_set_default($options);
    list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
@@ -203,7 +207,7 @@ function get_http_response_fallback(
    }
    if (! $headers) {
-        return array($headers, false);
+        return [$headers, false];
    }
    try {
@@ -211,10 +215,10 @@ function get_http_response_fallback(
        $context = stream_context_create($options);
        $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
    } catch (Exception $exc) {
-        return array(array(0 => 'HTTP Error'), $exc->getMessage());
+        return [[0 => 'HTTP Error'], $exc->getMessage()];
    }
-    return array($headers, $content);
+    return [$headers, $content];
 }
 /**
@@ -233,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3)
    }
    // Headers found, redirection found, and limit not reached.
-    if ($redirectionLimit-- > 0
+    if (
+        $redirectionLimit-- > 0
        && !empty($headers)
        && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
-        && !empty($headers['Location'])) {
+        && !empty($headers['Location'])
+    ) {
        $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
        if ($redirection != $url) {
            $redirection = getAbsoluteUrl($url, $redirection);
@@ -244,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3)
        }
    }
-    return array($headers, $url);
+    return [$headers, $url];
 }
 /**
@@ -266,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl)
    }
    $parts = parse_url($originalUrl);
-    $final = $parts['scheme'] .'://'. $parts['host'];
+    $final = $parts['scheme'] . '://' . $parts['host'];
    $final .= (!empty($parts['port'])) ? $parts['port'] : '';
    $final .= '/';
    if ($newUrl[0] != '/') {
@@ -319,7 +325,8 @@ function server_url($server)
                $scheme = 'https';
            }
-            if (($scheme == 'http' && $port != '80')
+            if (
+                ($scheme == 'http' && $port != '80')
                || ($scheme == 'https' && $port != '443')
            ) {
                $port = ':' . $port;
@@ -340,22 +347,26 @@ function server_url($server)
            $host = $server['SERVER_NAME'];
        }
-        return $scheme.'://'.$host.$port;
+        return $scheme . '://' . $host . $port;
    }
    // SSL detection
-    if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
+    if (
-        || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
+        (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
+        || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')
+    ) {
        $scheme = 'https';
    }
    // Do not append standard port values
-    if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
+    if (
-        || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
+        ($scheme == 'http' && $server['SERVER_PORT'] != '80')
-        $port = ':'.$server['SERVER_PORT'];
+        || ($scheme == 'https' && $server['SERVER_PORT'] != '443')
+    ) {
+        $port = ':' . $server['SERVER_PORT'];
    }
-    return $scheme.'://'.$server['SERVER_NAME'].$port;
+    return $scheme . '://' . $server['SERVER_NAME'] . $port;
 }
 /**
@@ -493,6 +504,46 @@ function is_https($server)
 * Get cURL callback function for CURLOPT_HEADERFUNCTION
 *
 * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_header_callback(
+    &$charset,
+    $curlGetInfo = 'curl_getinfo'
+) {
+    // State shared between successive invocations of the returned closure:
+    // set to true once a 301/302 response code has been seen.
+    $isRedirected = false;
+    // The closure returns the length of $data to accept the chunk, or false to reject it.
+    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        $chunkLength = strlen($data);
+        // 301/302: remember the redirection and accept the chunk without inspecting it.
+        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
+            $isRedirected = true;
+            return $chunkLength;
+        }
+        // Any other known response code than 200 is rejected.
+        if (!empty($responseCode) && $responseCode !== 200) {
+            return false;
+        }
+        // After a redirection, the content type will keep the previous request value
+        // until it finds the next content-type header.
+        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
+            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        }
+        // A known content type that does not contain 'text/html' is rejected.
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        // Only fill $charset when the caller's variable is still empty.
+        if (!empty($contentType) && empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+        return $chunkLength;
+    };
+}
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * @param string $charset     to extract from the downloaded page (reference)
 * @param string $title       to extract from the downloaded page (reference)
 * @param string $description to extract from the downloaded page (reference)
 * @param string $keywords    to extract from the downloaded page (reference)
@@ -507,9 +558,8 @@ function get_curl_download_callback(
    &$description,
    &$keywords,
    $retrieveDescription,
-    $curlGetInfo = 'curl_getinfo'
+    $tagsSeparator
 ) {
-    $isRedirected = false;
    $currentChunk = 0;
    $foundChunk = null;
@@ -524,37 +574,22 @@ function get_curl_download_callback(
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
-    return function (&$ch, $data) use (
+    return function (
+        $ch,
+        $data
+    ) use (
        $retrieveDescription,
-        $curlGetInfo,
+        $tagsSeparator,
        &$charset,
        &$title,
        &$description,
        &$keywords,
-        &$isRedirected,
        &$currentChunk,
        &$foundChunk
    ) {
+        $chunkLength = strlen($data);
        $currentChunk++;
-        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
-        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
-            $isRedirected = true;
-            return strlen($data);
-        }
-        if (!empty($responseCode) && $responseCode !== 200) {
-            return false;
-        }
-        // After a redirection, the content type will keep the previous request value
-        // until it finds the next content-type header.
-        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
-            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
-        }
-        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
-            return false;
-        }
-        if (!empty($contentType) && empty($charset)) {
-            $charset = header_extract_charset($contentType);
-        }
        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
@@ -562,6 +597,10 @@ function get_curl_download_callback(
            $title = html_extract_title($data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
+        if (empty($title)) {
+            $title = html_extract_tag('title', $data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
        if ($retrieveDescription && empty($description)) {
            $description = html_extract_tag('description', $data);
            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
@@ -571,10 +610,10 @@ function get_curl_download_callback(
            if (! empty($keywords)) {
                $foundChunk = $currentChunk;
                // Keywords use the format tag1, tag2 multiple words, tag
-                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
-                $keywords = implode(' ', array_map(function($keyword) {
+                $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
-                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                    return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
-                }, explode(',', $keywords)));
+                }, tags_str2array($keywords, ',')), $tagsSeparator);
            }
        }
@@ -582,7 +621,8 @@ function get_curl_download_callback(
        // If we already found either the title, description or keywords,
        // it's highly unlikely that we'll find the other metas further than
        // in the same chunk of data or the next one. So we also stop the download after that.
-        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+        if (
+            (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
            && (! $retrieveDescription
                || $foundChunk < $currentChunk
                || (!empty($title) && !empty($description) && !empty($keywords))
@@ -591,6 +631,6 @@ function get_curl_download_callback(
            return false;
        }
-        return strlen($data);
+        return $chunkLength;
    };
 }
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php
new file mode 100644
index 00000000..cfc72583
--- /dev/null
+++ b/application/http/MetadataRetriever.php
@@ -0,0 +1,74 @@
+<?php
+declare(strict_types=1);
+namespace Shaarli\Http;
+use Shaarli\Config\ConfigManager;
+/**
+ * HTTP Tool used to extract metadata from external URL (title, description, etc.).
+ */
+class MetadataRetriever
+{
+    /** @var ConfigManager Configuration read for the timeout, size limit and tag settings */
+    protected $conf;
+    /** @var HttpAccess HTTP wrapper used to perform the download and build the cURL callbacks */
+    protected $httpAccess;
+    public function __construct(ConfigManager $conf, HttpAccess $httpAccess)
+    {
+        $this->conf = $conf;
+        $this->httpAccess = $httpAccess;
+    }
+    /**
+     * Retrieve metadata for given URL.
+     *
+     * @param string $url Remote URL to download and inspect
+     *
+     * @return array [
+     *                  'title' => <remote title>,
+     *                  'description' => <remote description>,
+     *                  'tags' => <remote keywords>,
+     *               ]
+     *               Each entry is a trimmed string, or null when nothing usable was found.
+     */
+    public function retrieve(string $url): array
+    {
+        $charset = null;
+        $title = null;
+        $description = null;
+        $tags = null;
+        // Timeout and maximum size come from the configuration (defaults: 30 seconds, 4 MiB).
+        // The callbacks receive $charset, $title, $description and $tags by reference
+        // and fill them while the page is being downloaded.
+        $this->httpAccess->getHttpResponse(
+            $url,
+            $this->conf->get('general.download_timeout', 30),
+            $this->conf->get('general.download_max_size', 4194304),
+            $this->httpAccess->getCurlHeaderCallback($charset),
+            $this->httpAccess->getCurlDownloadCallback(
+                $charset,
+                $title,
+                $description,
+                $tags,
+                $this->conf->get('general.retrieve_description'),
+                $this->conf->get('general.tags_separator', ' ')
+            )
+        );
+        // $charset can still be null here (its initial value) when no charset was detected.
+        // This file uses strict_types, so strtolower() on a non-string would raise a TypeError:
+        // only convert the title when a usable, non-UTF-8 charset name is available.
+        if (!empty($title) && is_string($charset) && $charset !== '' && strtolower($charset) !== 'utf-8') {
+            $title = mb_convert_encoding($title, 'utf-8', $charset);
+        }
+        return array_map([$this, 'cleanMetadata'], [
+            'title' => $title,
+            'description' => $description,
+            'tags' => $tags,
+        ]);
+    }
+    /**
+     * Normalize a single metadata value.
+     *
+     * @param mixed $data Raw value filled by the download callbacks
+     *
+     * @return string|null The trimmed string, or null when $data is not a string
+     *                     or when its trimmed value is considered empty by empty()
+     */
+    protected function cleanMetadata($data): ?string
+    {
+        return !is_string($data) || empty(trim($data)) ? null : trim($data);
+    }
+}
diff --git a/application/http/Url.php b/application/http/Url.php
index 90444a2f..fe87088f 100644
--- a/application/http/Url.php
+++ b/application/http/Url.php
@@ -17,7 +17,7 @@ namespace Shaarli\Http;
 */
 class Url
 {
-    private static $annoyingQueryParams = array(
+    private static $annoyingQueryParams = [
        // Facebook
        'action_object_map=',
        'action_ref_map=',
@@ -37,15 +37,15 @@ class Url
        // Other
        'campaign_'
-    );
+    ];
-    private static $annoyingFragments = array(
+    private static $annoyingFragments = [
        // ATInternet
        'xtor=RSS-',
        // Misc.
        'tk.rss_all'
-    );
+    ];
    /*
     * URL parts represented as an array
@@ -120,7 +120,7 @@ class Url
        foreach (self::$annoyingQueryParams as $annoying) {
            foreach ($queryParams as $param) {
                if (startsWith($param, $annoying)) {
-                    $queryParams = array_diff($queryParams, array($param));
+                    $queryParams = array_diff($queryParams, [$param]);
                    continue;
                }
            }
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php
index e8d1a283..de5b7db1 100644
--- a/application/http/UrlUtils.php
+++ b/application/http/UrlUtils.php
@@ -1,4 +1,5 @@
 <?php
 /**
 * Converts an array-represented URL to a string
 *
@@ -12,15 +13,15 @@
 */
 function unparse_url($parsedUrl)
 {
-    $scheme   = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : '';
+    $scheme   = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '';
    $host     = isset($parsedUrl['host']) ? $parsedUrl['host'] : '';
-    $port     = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : '';
+    $port     = isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '';
    $user     = isset($parsedUrl['user']) ? $parsedUrl['user'] : '';
-    $pass     = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass']  : '';
+    $pass     = isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass']  : '';
    $pass     = ($user || $pass) ? "$pass@" : '';
    $path     = isset($parsedUrl['path']) ? $parsedUrl['path'] : '';
-    $query    = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : '';
+    $query    = isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '';
-    $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : '';
+    $fragment = isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '';
    return "$scheme$user$pass$host$port$path$query$fragment";
 }
author	yude <yudesleepy@gmail.com>	2021-01-04 18:51:10 +0900
committer	GitHub <noreply@github.com>	2021-01-04 18:51:10 +0900
commit	e6754f2154a79abd8e5e64bd923f6984aa9ad44b (patch)
tree	f074119530bb59ef155938ea367f719f1e4b70f1 /application/http
parent	5256b4287021342a9f8868967b2a77e481314331 (diff)
parent	ed4ee8f0297941ac83300389b7de6a293312d20e (diff)
download	Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.gz Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.tar.zst Shaarli-e6754f2154a79abd8e5e64bd923f6984aa9ad44b.zip