application/http/HttpUtils.php

   1 <?php
   2
   3 use Shaarli\Http\Url;
   4
   5 /**
   6  * GET an HTTP URL to retrieve its content
   7  * Uses the cURL library or a fallback method
   8  *
   9  * @param string          $url                URL to get (http://...)
  10  * @param int             $timeout            network timeout (in seconds)
  11  * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
  12  * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
  13  *                                            (CURLOPT_HEADERFUNCTION)
  14  * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
  15  *                                            Can be used to add download conditions on the
  16  *                                            headers (response code, content type, etc.).
  17  *
  18  * @return array HTTP response headers, downloaded content
  19  *
  20  * Output format:
  21  *  [0] = associative array containing HTTP response headers
  22  *  [1] = URL content (downloaded data)
  23  *
  24  * Example:
  25  *  list($headers, $data) = get_http_response('http://sebauvage.net/');
  26  *  if (strpos($headers[0], '200 OK') !== false) {
  27  *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
  28  *  } else {
  29  *      echo 'There was an error: '.htmlspecialchars($headers[0]);
  30  *  }
  31  *
  32  * @see https://secure.php.net/manual/en/ref.curl.php
  33  * @see https://secure.php.net/manual/en/functions.anonymous.php
  34  * @see https://secure.php.net/manual/en/function.preg-split.php
  35  * @see https://secure.php.net/manual/en/function.explode.php
  36  * @see http://stackoverflow.com/q/17641073
  37  * @see http://stackoverflow.com/q/9183178
  38  * @see http://stackoverflow.com/q/1462720
  39  */
  40 function get_http_response(
  41     $url,
  42     $timeout = 30,
  43     $maxBytes = 4194304,
  44     $curlHeaderFunction = null,
  45     $curlWriteFunction = null
  46 ) {
  47     $urlObj = new Url($url);
  48     $cleanUrl = $urlObj->idnToAscii();
  49
  50     if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
  51         return [[0 => 'Invalid HTTP UrlUtils'], false];
  52     }
  53
  54     $userAgent =
  55         'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
  56         . ' Gecko/20100101 Firefox/45.0';
  57     $acceptLanguage =
  58         substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
  59     $maxRedirs = 3;
  60
  61     if (!function_exists('curl_init')) {
  62         return get_http_response_fallback(
  63             $cleanUrl,
  64             $timeout,
  65             $maxBytes,
  66             $userAgent,
  67             $acceptLanguage,
  68             $maxRedirs
  69         );
  70     }
  71
  72     $ch = curl_init($cleanUrl);
  73     if ($ch === false) {
  74         return [[0 => 'curl_init() error'], false];
  75     }
  76
  77     // General cURL settings
  78     curl_setopt($ch, CURLOPT_AUTOREFERER, true);
  79     curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  80     // Default header download if the $curlHeaderFunction is not defined
  81     curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
  82     curl_setopt(
  83         $ch,
  84         CURLOPT_HTTPHEADER,
  85         ['Accept-Language: ' . $acceptLanguage]
  86     );
  87     curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
  88     curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  89     curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
  90     curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  91
  92     // Max download size management
  93     curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
  94     curl_setopt($ch, CURLOPT_NOPROGRESS, false);
  95     if (is_callable($curlHeaderFunction)) {
  96         curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
  97     }
  98     if (is_callable($curlWriteFunction)) {
  99         curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
 100     }
 101     curl_setopt(
 102         $ch,
 103         CURLOPT_PROGRESSFUNCTION,
 104         function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
 105             $downloaded = $arg2;
 106
 107             // Non-zero return stops downloading
 108             return ($downloaded > $maxBytes) ? 1 : 0;
 109         }
 110     );
 111
 112     $response = curl_exec($ch);
 113     $errorNo = curl_errno($ch);
 114     $errorStr = curl_error($ch);
 115     $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
 116     curl_close($ch);
 117
 118     if ($response === false) {
 119         if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
 120             /*
 121              * Workaround to match fallback method behaviour
 122              * Removing this would require updating
 123              * GetHttpUrlTest::testGetInvalidRemoteUrl()
 124              */
 125             return [false, false];
 126         }
 127         return [[0 => 'curl_exec() error: ' . $errorStr], false];
 128     }
 129
 130     // Formatting output like the fallback method
 131     $rawHeaders = substr($response, 0, $headSize);
 132
 133     // Keep only headers from latest redirection
 134     $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
 135     $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
 136
 137     $content = substr($response, $headSize);
 138     $headers = [];
 139     foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
 140         if (empty($line) || ctype_space($line)) {
 141             continue;
 142         }
 143         $splitLine = explode(': ', $line, 2);
 144         if (count($splitLine) > 1) {
 145             $key = $splitLine[0];
 146             $value = $splitLine[1];
 147             if (array_key_exists($key, $headers)) {
 148                 if (!is_array($headers[$key])) {
 149                     $headers[$key] = [0 => $headers[$key]];
 150                 }
 151                 $headers[$key][] = $value;
 152             } else {
 153                 $headers[$key] = $value;
 154             }
 155         } else {
 156             $headers[] = $splitLine[0];
 157         }
 158     }
 159
 160     return [$headers, $content];
 161 }
 162
 163 /**
 164  * GET an HTTP URL to retrieve its content (fallback method)
 165  *
 166  * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
 167  * @param int    $timeout        network timeout (in seconds)
 168  * @param int    $maxBytes       maximum downloaded bytes
 169  * @param string $userAgent      "User-Agent" header
 170  * @param string $acceptLanguage "Accept-Language" header
 171  * @param int    $maxRedr        maximum amount of redirections followed
 172  *
 173  * @return array HTTP response headers, downloaded content
 174  *
 175  * Output format:
 176  *  [0] = associative array containing HTTP response headers
 177  *  [1] = URL content (downloaded data)
 178  *
 179  * @see http://php.net/manual/en/function.file-get-contents.php
 180  * @see http://php.net/manual/en/function.stream-context-create.php
 181  * @see http://php.net/manual/en/function.get-headers.php
 182  */
 183 function get_http_response_fallback(
 184     $cleanUrl,
 185     $timeout,
 186     $maxBytes,
 187     $userAgent,
 188     $acceptLanguage,
 189     $maxRedr
 190 ) {
 191     $options = [
 192         'http' => [
 193             'method' => 'GET',
 194             'timeout' => $timeout,
 195             'user_agent' => $userAgent,
 196             'header' => "Accept: */*\r\n"
 197                 . 'Accept-Language: ' . $acceptLanguage
 198         ]
 199     ];
 200
 201     stream_context_set_default($options);
 202     list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 203     if (! $headers || strpos($headers[0], '200 OK') === false) {
 204         $options['http']['request_fulluri'] = true;
 205         stream_context_set_default($options);
 206         list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 207     }
 208
 209     if (! $headers) {
 210         return [$headers, false];
 211     }
 212
 213     try {
 214         // TODO: catch Exception in calling code (thumbnailer)
 215         $context = stream_context_create($options);
 216         $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
 217     } catch (Exception $exc) {
 218         return [[0 => 'HTTP Error'], $exc->getMessage()];
 219     }
 220
 221     return [$headers, $content];
 222 }
 223
 224 /**
 225  * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
 226  *
 227  * @param string $url              initial URL to reach.
 228  * @param int    $redirectionLimit max redirection follow.
 229  *
 230  * @return array HTTP headers, or false if it failed.
 231  */
 232 function get_redirected_headers($url, $redirectionLimit = 3)
 233 {
 234     $headers = get_headers($url, 1);
 235     if (!empty($headers['location']) && empty($headers['Location'])) {
 236         $headers['Location'] = $headers['location'];
 237     }
 238
 239     // Headers found, redirection found, and limit not reached.
 240     if (
 241         $redirectionLimit-- > 0
 242         && !empty($headers)
 243         && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
 244         && !empty($headers['Location'])
 245     ) {
 246         $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
 247         if ($redirection != $url) {
 248             $redirection = getAbsoluteUrl($url, $redirection);
 249             return get_redirected_headers($redirection, $redirectionLimit);
 250         }
 251     }
 252
 253     return [$headers, $url];
 254 }
 255
 256 /**
 257  * Get an absolute URL from a complete one, and another absolute/relative URL.
 258  *
 259  * @param string $originalUrl The original complete URL.
 260  * @param string $newUrl      The new one, absolute or relative.
 261  *
 262  * @return string Final URL:
 263  *   - $newUrl if it was already an absolute URL.
 264  *   - if it was relative, absolute URL from $originalUrl path.
 265  */
 266 function getAbsoluteUrl($originalUrl, $newUrl)
 267 {
 268     $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
 269     // Already an absolute URL.
 270     if (!empty($newScheme)) {
 271         return $newUrl;
 272     }
 273
 274     $parts = parse_url($originalUrl);
 275     $final = $parts['scheme'] . '://' . $parts['host'];
 276     $final .= (!empty($parts['port'])) ? $parts['port'] : '';
 277     $final .= '/';
 278     if ($newUrl[0] != '/') {
 279         $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/'));
 280     }
 281     $final .= ltrim($newUrl, '/');
 282     return $final;
 283 }
 284
 285 /**
 286  * Returns the server's base URL: scheme://domain.tld[:port]
 287  *
 288  * @param array $server the $_SERVER array
 289  *
 290  * @return string the server's base URL
 291  *
 292  * @see http://www.ietf.org/rfc/rfc7239.txt
 293  * @see http://www.ietf.org/rfc/rfc6648.txt
 294  * @see http://stackoverflow.com/a/3561399
 295  * @see http://stackoverflow.com/q/452375
 296  */
 297 function server_url($server)
 298 {
 299     $scheme = 'http';
 300     $port = '';
 301
 302     // Shaarli is served behind a proxy
 303     if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
 304         // Keep forwarded scheme
 305         if (strpos($server['HTTP_X_FORWARDED_PROTO'], ',') !== false) {
 306             $schemes = explode(',', $server['HTTP_X_FORWARDED_PROTO']);
 307             $scheme = trim($schemes[0]);
 308         } else {
 309             $scheme = $server['HTTP_X_FORWARDED_PROTO'];
 310         }
 311
 312         if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 313             // Keep forwarded port
 314             if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 315                 $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 316                 $port = trim($ports[0]);
 317             } else {
 318                 $port = $server['HTTP_X_FORWARDED_PORT'];
 319             }
 320
 321             // This is a workaround for proxies that don't forward the scheme properly.
 322             // Connecting over port 443 has to be in HTTPS.
 323             // See https://github.com/shaarli/Shaarli/issues/1022
 324             if ($port == '443') {
 325                 $scheme = 'https';
 326             }
 327
 328             if (
 329                 ($scheme == 'http' && $port != '80')
 330                 || ($scheme == 'https' && $port != '443')
 331             ) {
 332                 $port = ':' . $port;
 333             } else {
 334                 $port = '';
 335             }
 336         }
 337
 338         if (isset($server['HTTP_X_FORWARDED_HOST'])) {
 339             // Keep forwarded host
 340             if (strpos($server['HTTP_X_FORWARDED_HOST'], ',') !== false) {
 341                 $hosts = explode(',', $server['HTTP_X_FORWARDED_HOST']);
 342                 $host = trim($hosts[0]);
 343             } else {
 344                 $host = $server['HTTP_X_FORWARDED_HOST'];
 345             }
 346         } else {
 347             $host = $server['SERVER_NAME'];
 348         }
 349
 350         return $scheme . '://' . $host . $port;
 351     }
 352
 353     // SSL detection
 354     if (
 355         (! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
 356         || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')
 357     ) {
 358         $scheme = 'https';
 359     }
 360
 361     // Do not append standard port values
 362     if (
 363         ($scheme == 'http' && $server['SERVER_PORT'] != '80')
 364         || ($scheme == 'https' && $server['SERVER_PORT'] != '443')
 365     ) {
 366         $port = ':' . $server['SERVER_PORT'];
 367     }
 368
 369     return $scheme . '://' . $server['SERVER_NAME'] . $port;
 370 }
 371
 372 /**
 373  * Returns the absolute URL of the current script, without the query
 374  *
 375  * If the resource is "index.php", then it is removed (for better-looking URLs)
 376  *
 377  * @param array $server the $_SERVER array
 378  *
 379  * @return string the absolute URL of the current script, without the query
 380  */
 381 function index_url($server)
 382 {
 383     if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
 384         return rtrim(SHAARLI_ROOT_URL, '/') . '/';
 385     }
 386
 387     $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
 388     if (endsWith($scriptname, 'index.php')) {
 389         $scriptname = substr($scriptname, 0, -9);
 390     }
 391     return server_url($server) . $scriptname;
 392 }
 393
 394 /**
 395  * Returns the absolute URL of the current script, with current route and query
 396  *
 397  * If the resource is "index.php", then it is removed (for better-looking URLs)
 398  *
 399  * @param array $server the $_SERVER array
 400  *
 401  * @return string the absolute URL of the current script, with the query
 402  */
 403 function page_url($server)
 404 {
 405     $scriptname = $server['SCRIPT_NAME'] ?? '';
 406     if (endsWith($scriptname, 'index.php')) {
 407         $scriptname = substr($scriptname, 0, -9);
 408     }
 409
 410     $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
 411     if (! empty($server['QUERY_STRING'])) {
 412         return index_url($server) . $route . '?' . $server['QUERY_STRING'];
 413     }
 414
 415     return index_url($server) . $route;
 416 }
 417
 418 /**
 419  * Retrieve the initial IP forwarded by the reverse proxy.
 420  *
 421  * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
 422  *
 423  * @param array $server     $_SERVER array which contains HTTP headers.
 424  * @param array $trustedIps List of trusted IP from the configuration.
 425  *
 426  * @return string|bool The forwarded IP, or false if none could be extracted.
 427  */
 428 function getIpAddressFromProxy($server, $trustedIps)
 429 {
 430     $forwardedIpHeader = 'HTTP_X_FORWARDED_FOR';
 431     if (empty($server[$forwardedIpHeader])) {
 432         return false;
 433     }
 434
 435     $ips = preg_split('/\s*,\s*/', $server[$forwardedIpHeader]);
 436     $ips = array_diff($ips, $trustedIps);
 437     if (empty($ips)) {
 438         return false;
 439     }
 440
 441     return array_pop($ips);
 442 }
 443
 444
 445 /**
 446  * Return an identifier based on the advertised client IP address(es)
 447  *
 448  * This aims at preventing session hijacking from users behind the same proxy
 449  * by relying on HTTP headers.
 450  *
 451  * See:
 452  * - https://secure.php.net/manual/en/reserved.variables.server.php
 453  * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
 454  * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
 455  * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
 456  *
 457  * @param array $server The $_SERVER array
 458  *
 459  * @return string An identifier based on client IP address information
 460  */
 461 function client_ip_id($server)
 462 {
 463     $ip = $server['REMOTE_ADDR'];
 464
 465     if (isset($server['HTTP_X_FORWARDED_FOR'])) {
 466         $ip = $ip . '_' . $server['HTTP_X_FORWARDED_FOR'];
 467     }
 468     if (isset($server['HTTP_CLIENT_IP'])) {
 469         $ip = $ip . '_' . $server['HTTP_CLIENT_IP'];
 470     }
 471     return $ip;
 472 }
 473
 474
 475 /**
 476  * Returns true if Shaarli's currently browsed in HTTPS.
 477  * Supports reverse proxies (if the headers are correctly set).
 478  *
 479  * @param array $server $_SERVER.
 480  *
 481  * @return bool true if HTTPS, false otherwise.
 482  */
 483 function is_https($server)
 484 {
 485
 486     if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 487         // Keep forwarded port
 488         if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 489             $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 490             $port = trim($ports[0]);
 491         } else {
 492             $port = $server['HTTP_X_FORWARDED_PORT'];
 493         }
 494
 495         if ($port == '443') {
 496             return true;
 497         }
 498     }
 499
 500     return ! empty($server['HTTPS']);
 501 }
 502
 503 /**
 504  * Get cURL callback function for CURLOPT_WRITEFUNCTION
 505  *
 506  * @param string $charset     to extract from the downloaded page (reference)
 507  * @param string $curlGetInfo Optionally overrides curl_getinfo function
 508  *
 509  * @return Closure
 510  */
 511 function get_curl_header_callback(
 512     &$charset,
 513     $curlGetInfo = 'curl_getinfo'
 514 ) {
 515     $isRedirected = false;
 516
 517     return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
 518         $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
 519         $chunkLength = strlen($data);
 520         if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
 521             $isRedirected = true;
 522             return $chunkLength;
 523         }
 524         if (!empty($responseCode) && $responseCode !== 200) {
 525             return false;
 526         }
 527         // After a redirection, the content type will keep the previous request value
 528         // until it finds the next content-type header.
 529         if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
 530             $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
 531         }
 532         if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
 533             return false;
 534         }
 535         if (!empty($contentType) && empty($charset)) {
 536             $charset = header_extract_charset($contentType);
 537         }
 538
 539         return $chunkLength;
 540     };
 541 }
 542
 543 /**
 544  * Get cURL callback function for CURLOPT_WRITEFUNCTION
 545  *
 546  * @param string $charset     to extract from the downloaded page (reference)
 547  * @param string $title       to extract from the downloaded page (reference)
 548  * @param string $description to extract from the downloaded page (reference)
 549  * @param string $keywords    to extract from the downloaded page (reference)
 550  * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 551  * @param string $curlGetInfo Optionally overrides curl_getinfo function
 552  *
 553  * @return Closure
 554  */
 555 function get_curl_download_callback(
 556     &$charset,
 557     &$title,
 558     &$description,
 559     &$keywords,
 560     $retrieveDescription,
 561     $tagsSeparator
 562 ) {
 563     $currentChunk = 0;
 564     $foundChunk = null;
 565
 566     /**
 567      * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
 568      *
 569      * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
 570      * Then we extract the title and the charset and stop the download when it's done.
 571      *
 572      * @param resource $ch   cURL resource
 573      * @param string   $data chunk of data being downloaded
 574      *
 575      * @return int|bool length of $data or false if we need to stop the download
 576      */
 577     return function (
 578         $ch,
 579         $data
 580     ) use (
 581         $retrieveDescription,
 582         $tagsSeparator,
 583         &$charset,
 584         &$title,
 585         &$description,
 586         &$keywords,
 587         &$currentChunk,
 588         &$foundChunk
 589     ) {
 590         $chunkLength = strlen($data);
 591         $currentChunk++;
 592
 593         if (empty($charset)) {
 594             $charset = html_extract_charset($data);
 595         }
 596         if (empty($title)) {
 597             $title = html_extract_title($data);
 598             $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
 599         }
 600         if (empty($title)) {
 601             $title = html_extract_tag('title', $data);
 602             $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
 603         }
 604         if ($retrieveDescription && empty($description)) {
 605             $description = html_extract_tag('description', $data);
 606             $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
 607         }
 608         if ($retrieveDescription && empty($keywords)) {
 609             $keywords = html_extract_tag('keywords', $data);
 610             if (! empty($keywords)) {
 611                 $foundChunk = $currentChunk;
 612                 // Keywords use the format tag1, tag2 multiple words, tag
 613                 // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
 614                 $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
 615                     return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
 616                 }, tags_str2array($keywords, ',')), $tagsSeparator);
 617             }
 618         }
 619
 620         // We got everything we want, stop the download.
 621         // If we already found either the title, description or keywords,
 622         // it's highly unlikely that we'll found the other metas further than
 623         // in the same chunk of data or the next one. So we also stop the download after that.
 624         if (
 625             (!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
 626             && (! $retrieveDescription
 627                 || $foundChunk < $currentChunk
 628                 || (!empty($title) && !empty($description) && !empty($keywords))
 629             )
 630         ) {
 631             return false;
 632         }
 633
 634         return $chunkLength;
 635     };
 636 }