Add russian language file

[github/shaarli/Shaarli.git] / application / http / HttpUtils.php
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php

index 2ea9195d3550bfd4bfd912ef35cb4b669a5c83c2..4bde1d5b8c4b33c97dc91a40f847b96422174f6c 100644 (file)
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
   * GET an HTTP URL to retrieve its content
   * Uses the cURL library or a fallback method
   *
- * @param string          $url               URL to get (http://...)
- * @param int             $timeout           network timeout (in seconds)
- * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
- * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
- *                                           Can be used to add download conditions on the
- *                                           headers (response code, content type, etc.).
+ * @param string          $url                URL to get (http://...)
+ * @param int             $timeout            network timeout (in seconds)
+ * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
+ * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
+ *                                            (CURLOPT_HEADERFUNCTION)
+ * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
+ *                                            Can be used to add download conditions on the
+ *                                            headers (response code, content type, etc.).
   *
   * @return array HTTP response headers, downloaded content
   *
@@ -35,13 +37,18 @@ use Shaarli\Http\Url;
   * @see http://stackoverflow.com/q/9183178
   * @see http://stackoverflow.com/q/1462720
   */
-function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
-{
+function get_http_response(
+    $url,
+    $timeout = 30,
+    $maxBytes = 4194304,
+    $curlHeaderFunction = null,
+    $curlWriteFunction = null
+) {
      $urlObj = new Url($url);
      $cleanUrl = $urlObj->idnToAscii();
  
      if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
-        return array(array(0 => 'Invalid HTTP UrlUtils'), false);
+        return [[0 => 'Invalid HTTP UrlUtils'], false];
      }
  
      $userAgent =
@@ -64,42 +71,39 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
  
      $ch = curl_init($cleanUrl);
      if ($ch === false) {
-        return array(array(0 => 'curl_init() error'), false);
+        return [[0 => 'curl_init() error'], false];
      }
  
      // General cURL settings
      curl_setopt($ch, CURLOPT_AUTOREFERER, true);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
-    curl_setopt($ch, CURLOPT_HEADER, true);
+    // Default header download if the $curlHeaderFunction is not defined
+    curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
      curl_setopt(
          $ch,
          CURLOPT_HTTPHEADER,
-        array('Accept-Language: ' . $acceptLanguage)
+        ['Accept-Language: ' . $acceptLanguage]
      );
      curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  
+    // Max download size management
+    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
+    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
+    if (is_callable($curlHeaderFunction)) {
+        curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
+    }
      if (is_callable($curlWriteFunction)) {
          curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
      }
-
-    // Max download size management
-    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
-    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
      curl_setopt(
          $ch,
          CURLOPT_PROGRESSFUNCTION,
-        function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
-            if (version_compare(phpversion(), '5.5', '<')) {
-                // PHP version lower than 5.5
-                // Callback has 4 arguments
-                $downloaded = $arg1;
-            } else {
-                // Callback has 5 arguments
-                $downloaded = $arg2;
-            }
+        function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
+            $downloaded = $arg2;
+
              // Non-zero return stops downloading
              return ($downloaded > $maxBytes) ? 1 : 0;
          }
@@ -118,9 +122,9 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
               * Removing this would require updating
               * GetHttpUrlTest::testGetInvalidRemoteUrl()
               */
-            return array(false, false);
+            return [false, false];
          }
-        return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
+        return [[0 => 'curl_exec() error: ' . $errorStr], false];
      }
  
      // Formatting output like the fallback method
@@ -131,7 +135,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
      $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
  
      $content = substr($response, $headSize);
-    $headers = array();
+    $headers = [];
      foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
          if (empty($line) || ctype_space($line)) {
              continue;
@@ -142,7 +146,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
              $value = $splitLine[1];
              if (array_key_exists($key, $headers)) {
                  if (!is_array($headers[$key])) {
-                    $headers[$key] = array(0 => $headers[$key]);
+                    $headers[$key] = [0 => $headers[$key]];
                  }
                  $headers[$key][] = $value;
              } else {
@@ -153,7 +157,7 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
          }
      }
  
-    return array($headers, $content);
+    return [$headers, $content];
  }
  
  /**
@@ -184,15 +188,15 @@ function get_http_response_fallback(
      $acceptLanguage,
      $maxRedr
  ) {
-    $options = array(
-        'http' => array(
+    $options = [
+        'http' => [
              'method' => 'GET',
              'timeout' => $timeout,
              'user_agent' => $userAgent,
              'header' => "Accept: */*\r\n"
                  . 'Accept-Language: ' . $acceptLanguage
-        )
-    );
+        ]
+    ];
  
      stream_context_set_default($options);
      list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
@@ -203,7 +207,7 @@ function get_http_response_fallback(
      }
  
      if (! $headers) {
-        return array($headers, false);
+        return [$headers, false];
      }
  
      try {
@@ -211,10 +215,10 @@ function get_http_response_fallback(
          $context = stream_context_create($options);
          $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
      } catch (Exception $exc) {
-        return array(array(0 => 'HTTP Error'), $exc->getMessage());
+        return [[0 => 'HTTP Error'], $exc->getMessage()];
      }
  
-    return array($headers, $content);
+    return [$headers, $content];
  }
  
  /**
@@ -233,10 +237,12 @@ function get_redirected_headers($url, $redirectionLimit = 3)
      }
  
      // Headers found, redirection found, and limit not reached.
-    if ($redirectionLimit-- > 0
+    if (
+        $redirectionLimit-- > 0
          && !empty($headers)
          && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
-        && !empty($headers['Location'])) {
+        && !empty($headers['Location'])
+    ) {
          $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
          if ($redirection != $url) {
              $redirection = getAbsoluteUrl($url, $redirection);
@@ -244,7 +250,7 @@ function get_redirected_headers($url, $redirectionLimit = 3)
          }
      }
  
-    return array($headers, $url);
+    return [$headers, $url];
  }
  
  /**
@@ -266,7 +272,7 @@ function getAbsoluteUrl($originalUrl, $newUrl)
      }
  
      $parts = parse_url($originalUrl);
-    $final = $parts['scheme'] .'://'. $parts['host'];
+    $final = $parts['scheme'] . '://' . $parts['host'];
      $final .= (!empty($parts['port'])) ? $parts['port'] : '';
      $final .= '/';
      if ($newUrl[0] != '/') {
@@ -319,7 +325,8 @@ function server_url($server)
                  $scheme = 'https';
              }
  
-            if (($scheme == 'http' && $port != '80')
+            if (
+                ($scheme == 'http' && $port != '80')
                  || ($scheme == 'https' && $port != '443')
              ) {
                  $port = ':' . $port;
@@ -340,22 +347,26 @@ function server_url($server)
              $host = $server['SERVER_NAME'];
          }
  
-        return $scheme.'://'.$host.$port;
+        return $scheme . '://' . $host . $port;
      }
  
      // SSL detection
-    if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
-        || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
+    if (
+        (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
+        || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')
+    ) {
          $scheme = 'https';
      }
  
      // Do not append standard port values
-    if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
-        || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
-        $port = ':'.$server['SERVER_PORT'];
+    if (
+        ($scheme == 'http' && $server['SERVER_PORT'] != '80')
+        || ($scheme == 'https' && $server['SERVER_PORT'] != '443')
+    ) {
+        $port = ':' . $server['SERVER_PORT'];
      }
  
-    return $scheme.'://'.$server['SERVER_NAME'].$port;
+    return $scheme . '://' . $server['SERVER_NAME'] . $port;
  }
  
  /**
@@ -369,7 +380,11 @@ function server_url($server)
   */
  function index_url($server)
  {
-    $scriptname = $server['SCRIPT_NAME'];
+    if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
+        return rtrim(SHAARLI_ROOT_URL, '/') . '/';
+    }
+
+    $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
      if (endsWith($scriptname, 'index.php')) {
          $scriptname = substr($scriptname, 0, -9);
      }
@@ -377,7 +392,7 @@ function index_url($server)
  }
  
  /**
- * Returns the absolute URL of the current script, with the query
+ * Returns the absolute URL of the current script, with current route and query
   *
   * If the resource is "index.php", then it is removed (for better-looking URLs)
   *
@@ -387,10 +402,17 @@ function index_url($server)
   */
  function page_url($server)
  {
+    $scriptname = $server['SCRIPT_NAME'] ?? '';
+    if (endsWith($scriptname, 'index.php')) {
+        $scriptname = substr($scriptname, 0, -9);
+    }
+
+    $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
      if (! empty($server['QUERY_STRING'])) {
-        return index_url($server).'?'.$server['QUERY_STRING'];
+        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
      }
-    return index_url($server);
+
+    return index_url($server) . $route;
  }
  
  /**
@@ -477,3 +499,138 @@ function is_https($server)
  
      return ! empty($server['HTTPS']);
  }
+
+/**
+ * Get cURL callback function for CURLOPT_HEADERFUNCTION
+ *
+ * @param string $charset     to extract from the Content-Type response header (reference)
+ * @param string $curlGetInfo Optionally overrides curl_getinfo function
+ *
+ * @return Closure
+ */
+function get_curl_header_callback(
+    &$charset,
+    $curlGetInfo = 'curl_getinfo'
+) {
+    $isRedirected = false;
+
+    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
+        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
+        $chunkLength = strlen($data);
+        if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
+            $isRedirected = true;
+            return $chunkLength;
+        }
+        if (!empty($responseCode) && $responseCode !== 200) {
+            return false;
+        }
+        // After a redirection, the content type will keep the previous request value
+        // until it finds the next content-type header.
+        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
+            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
+        }
+        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
+            return false;
+        }
+        if (!empty($contentType) && empty($charset)) {
+            $charset = header_extract_charset($contentType);
+        }
+
+        return $chunkLength;
+    };
+}
+
+/**
+ * Get cURL callback function for CURLOPT_WRITEFUNCTION
+ *
+ * @param string $charset     to extract from the downloaded page (reference)
+ * @param string $title       to extract from the downloaded page (reference)
+ * @param string $description to extract from the downloaded page (reference)
+ * @param string $keywords    to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
+ * @param string $tagsSeparator Separator used when splitting and re-joining the extracted keywords
+ *
+ * @return Closure
+ */
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $tagsSeparator
+) {
+    $currentChunk = 0;
+    $foundChunk = null;
+
+    /**
+     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
+     *
+     * The HTTP code (200) and content type ('text/html') are validated by get_curl_header_callback();
+     * here we extract the charset, the title and optional metas, and stop the download when it's done.
+     *
+     * @param resource $ch   cURL resource
+     * @param string   $data chunk of data being downloaded
+     *
+     * @return int|bool length of $data or false if we need to stop the download
+     */
+    return function (
+        $ch,
+        $data
+    ) use (
+        $retrieveDescription,
+        $tagsSeparator,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $chunkLength = strlen($data);
+        $currentChunk++;
+
+        if (empty($charset)) {
+            $charset = html_extract_charset($data);
+        }
+        if (empty($title)) {
+            $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if (empty($title)) {
+            $title = html_extract_tag('title', $data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
+                $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
+                    return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
+                }, tags_str2array($keywords, ',')), $tagsSeparator);
+            }
+        }
+
+        // We got everything we want, stop the download.
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        if (
+            (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
+            return false;
+        }
+
+        return $chunkLength;
+    };
+}