application/http/HttpUtils.php

   1 <?php
   2
   3 use Shaarli\Http\Url;
   4
   5 /**
   6  * GET an HTTP URL to retrieve its content
   7  * Uses the cURL library or a fallback method
   8  *
   9  * @param string          $url                URL to get (http://...)
  10  * @param int             $timeout            network timeout (in seconds)
  11  * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
  12  * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
  13  *                                            (CURLOPT_HEADERFUNCTION)
  14  * @param callable|string $curlWriteFunction  Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
  15  *                                            Can be used to add download conditions on the
  16  *                                            headers (response code, content type, etc.).
  17  *
  18  * @return array HTTP response headers, downloaded content
  19  *
  20  * Output format:
  21  *  [0] = associative array containing HTTP response headers
  22  *  [1] = URL content (downloaded data)
  23  *
  24  * Example:
  25  *  list($headers, $data) = get_http_response('http://sebauvage.net/');
  26  *  if (strpos($headers[0], '200 OK') !== false) {
  27  *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
  28  *  } else {
  29  *      echo 'There was an error: '.htmlspecialchars($headers[0]);
  30  *  }
  31  *
  32  * @see https://secure.php.net/manual/en/ref.curl.php
  33  * @see https://secure.php.net/manual/en/functions.anonymous.php
  34  * @see https://secure.php.net/manual/en/function.preg-split.php
  35  * @see https://secure.php.net/manual/en/function.explode.php
  36  * @see http://stackoverflow.com/q/17641073
  37  * @see http://stackoverflow.com/q/9183178
  38  * @see http://stackoverflow.com/q/1462720
  39  */
  40 function get_http_response(
  41     $url,
  42     $timeout = 30,
  43     $maxBytes = 4194304,
  44     $curlHeaderFunction = null,
  45     $curlWriteFunction = null
  46 ) {
  47     $urlObj = new Url($url);
  48     $cleanUrl = $urlObj->idnToAscii();
  49
  50     if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
  51         return array(array(0 => 'Invalid HTTP UrlUtils'), false);
  52     }
  53
  54     $userAgent =
  55         'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
  56         . ' Gecko/20100101 Firefox/45.0';
  57     $acceptLanguage =
  58         substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
  59     $maxRedirs = 3;
  60
  61     if (!function_exists('curl_init')) {
  62         return get_http_response_fallback(
  63             $cleanUrl,
  64             $timeout,
  65             $maxBytes,
  66             $userAgent,
  67             $acceptLanguage,
  68             $maxRedirs
  69         );
  70     }
  71
  72     $ch = curl_init($cleanUrl);
  73     if ($ch === false) {
  74         return array(array(0 => 'curl_init() error'), false);
  75     }
  76
  77     // General cURL settings
  78     curl_setopt($ch, CURLOPT_AUTOREFERER, true);
  79     curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  80     // Default header download if the $curlHeaderFunction is not defined
  81     curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
  82     curl_setopt(
  83         $ch,
  84         CURLOPT_HTTPHEADER,
  85         array('Accept-Language: ' . $acceptLanguage)
  86     );
  87     curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
  88     curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  89     curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
  90     curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  91
  92     // Max download size management
  93     curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
  94     curl_setopt($ch, CURLOPT_NOPROGRESS, false);
  95     if (is_callable($curlHeaderFunction)) {
  96         curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
  97     }
  98     if (is_callable($curlWriteFunction)) {
  99         curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
 100     }
 101     curl_setopt(
 102         $ch,
 103         CURLOPT_PROGRESSFUNCTION,
 104         function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
 105             $downloaded = $arg2;
 106
 107             // Non-zero return stops downloading
 108             return ($downloaded > $maxBytes) ? 1 : 0;
 109         }
 110     );
 111
 112     $response = curl_exec($ch);
 113     $errorNo = curl_errno($ch);
 114     $errorStr = curl_error($ch);
 115     $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
 116     curl_close($ch);
 117
 118     if ($response === false) {
 119         if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
 120             /*
 121              * Workaround to match fallback method behaviour
 122              * Removing this would require updating
 123              * GetHttpUrlTest::testGetInvalidRemoteUrl()
 124              */
 125             return array(false, false);
 126         }
 127         return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
 128     }
 129
 130     // Formatting output like the fallback method
 131     $rawHeaders = substr($response, 0, $headSize);
 132
 133     // Keep only headers from latest redirection
 134     $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
 135     $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
 136
 137     $content = substr($response, $headSize);
 138     $headers = array();
 139     foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
 140         if (empty($line) || ctype_space($line)) {
 141             continue;
 142         }
 143         $splitLine = explode(': ', $line, 2);
 144         if (count($splitLine) > 1) {
 145             $key = $splitLine[0];
 146             $value = $splitLine[1];
 147             if (array_key_exists($key, $headers)) {
 148                 if (!is_array($headers[$key])) {
 149                     $headers[$key] = array(0 => $headers[$key]);
 150                 }
 151                 $headers[$key][] = $value;
 152             } else {
 153                 $headers[$key] = $value;
 154             }
 155         } else {
 156             $headers[] = $splitLine[0];
 157         }
 158     }
 159
 160     return array($headers, $content);
 161 }
 162
 163 /**
 164  * GET an HTTP URL to retrieve its content (fallback method)
 165  *
 166  * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
 167  * @param int    $timeout        network timeout (in seconds)
 168  * @param int    $maxBytes       maximum downloaded bytes
 169  * @param string $userAgent      "User-Agent" header
 170  * @param string $acceptLanguage "Accept-Language" header
 171  * @param int    $maxRedr        maximum amount of redirections followed
 172  *
 173  * @return array HTTP response headers, downloaded content
 174  *
 175  * Output format:
 176  *  [0] = associative array containing HTTP response headers
 177  *  [1] = URL content (downloaded data)
 178  *
 179  * @see http://php.net/manual/en/function.file-get-contents.php
 180  * @see http://php.net/manual/en/function.stream-context-create.php
 181  * @see http://php.net/manual/en/function.get-headers.php
 182  */
 183 function get_http_response_fallback(
 184     $cleanUrl,
 185     $timeout,
 186     $maxBytes,
 187     $userAgent,
 188     $acceptLanguage,
 189     $maxRedr
 190 ) {
 191     $options = array(
 192         'http' => array(
 193             'method' => 'GET',
 194             'timeout' => $timeout,
 195             'user_agent' => $userAgent,
 196             'header' => "Accept: */*\r\n"
 197                 . 'Accept-Language: ' . $acceptLanguage
 198         )
 199     );
 200
 201     stream_context_set_default($options);
 202     list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 203     if (! $headers || strpos($headers[0], '200 OK') === false) {
 204         $options['http']['request_fulluri'] = true;
 205         stream_context_set_default($options);
 206         list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 207     }
 208
 209     if (! $headers) {
 210         return array($headers, false);
 211     }
 212
 213     try {
 214         // TODO: catch Exception in calling code (thumbnailer)
 215         $context = stream_context_create($options);
 216         $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
 217     } catch (Exception $exc) {
 218         return array(array(0 => 'HTTP Error'), $exc->getMessage());
 219     }
 220
 221     return array($headers, $content);
 222 }
 223
 224 /**
 225  * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
 226  *
 227  * @param string $url              initial URL to reach.
 228  * @param int    $redirectionLimit max redirection follow.
 229  *
 230  * @return array HTTP headers, or false if it failed.
 231  */
 232 function get_redirected_headers($url, $redirectionLimit = 3)
 233 {
 234     $headers = get_headers($url, 1);
 235     if (!empty($headers['location']) && empty($headers['Location'])) {
 236         $headers['Location'] = $headers['location'];
 237     }
 238
 239     // Headers found, redirection found, and limit not reached.
 240     if ($redirectionLimit-- > 0
 241         && !empty($headers)
 242         && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
 243         && !empty($headers['Location'])) {
 244         $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
 245         if ($redirection != $url) {
 246             $redirection = getAbsoluteUrl($url, $redirection);
 247             return get_redirected_headers($redirection, $redirectionLimit);
 248         }
 249     }
 250
 251     return array($headers, $url);
 252 }
 253
 254 /**
 255  * Get an absolute URL from a complete one, and another absolute/relative URL.
 256  *
 257  * @param string $originalUrl The original complete URL.
 258  * @param string $newUrl      The new one, absolute or relative.
 259  *
 260  * @return string Final URL:
 261  *   - $newUrl if it was already an absolute URL.
 262  *   - if it was relative, absolute URL from $originalUrl path.
 263  */
 264 function getAbsoluteUrl($originalUrl, $newUrl)
 265 {
 266     $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
 267     // Already an absolute URL.
 268     if (!empty($newScheme)) {
 269         return $newUrl;
 270     }
 271
 272     $parts = parse_url($originalUrl);
 273     $final = $parts['scheme'] .'://'. $parts['host'];
 274     $final .= (!empty($parts['port'])) ? $parts['port'] : '';
 275     $final .= '/';
 276     if ($newUrl[0] != '/') {
 277         $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/'));
 278     }
 279     $final .= ltrim($newUrl, '/');
 280     return $final;
 281 }
 282
 283 /**
 284  * Returns the server's base URL: scheme://domain.tld[:port]
 285  *
 286  * @param array $server the $_SERVER array
 287  *
 288  * @return string the server's base URL
 289  *
 290  * @see http://www.ietf.org/rfc/rfc7239.txt
 291  * @see http://www.ietf.org/rfc/rfc6648.txt
 292  * @see http://stackoverflow.com/a/3561399
 293  * @see http://stackoverflow.com/q/452375
 294  */
 295 function server_url($server)
 296 {
 297     $scheme = 'http';
 298     $port = '';
 299
 300     // Shaarli is served behind a proxy
 301     if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
 302         // Keep forwarded scheme
 303         if (strpos($server['HTTP_X_FORWARDED_PROTO'], ',') !== false) {
 304             $schemes = explode(',', $server['HTTP_X_FORWARDED_PROTO']);
 305             $scheme = trim($schemes[0]);
 306         } else {
 307             $scheme = $server['HTTP_X_FORWARDED_PROTO'];
 308         }
 309
 310         if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 311             // Keep forwarded port
 312             if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 313                 $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 314                 $port = trim($ports[0]);
 315             } else {
 316                 $port = $server['HTTP_X_FORWARDED_PORT'];
 317             }
 318
 319             // This is a workaround for proxies that don't forward the scheme properly.
 320             // Connecting over port 443 has to be in HTTPS.
 321             // See https://github.com/shaarli/Shaarli/issues/1022
 322             if ($port == '443') {
 323                 $scheme = 'https';
 324             }
 325
 326             if (($scheme == 'http' && $port != '80')
 327                 || ($scheme == 'https' && $port != '443')
 328             ) {
 329                 $port = ':' . $port;
 330             } else {
 331                 $port = '';
 332             }
 333         }
 334
 335         if (isset($server['HTTP_X_FORWARDED_HOST'])) {
 336             // Keep forwarded host
 337             if (strpos($server['HTTP_X_FORWARDED_HOST'], ',') !== false) {
 338                 $hosts = explode(',', $server['HTTP_X_FORWARDED_HOST']);
 339                 $host = trim($hosts[0]);
 340             } else {
 341                 $host = $server['HTTP_X_FORWARDED_HOST'];
 342             }
 343         } else {
 344             $host = $server['SERVER_NAME'];
 345         }
 346
 347         return $scheme.'://'.$host.$port;
 348     }
 349
 350     // SSL detection
 351     if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
 352         || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
 353         $scheme = 'https';
 354     }
 355
 356     // Do not append standard port values
 357     if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
 358         || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
 359         $port = ':'.$server['SERVER_PORT'];
 360     }
 361
 362     return $scheme.'://'.$server['SERVER_NAME'].$port;
 363 }
 364
 365 /**
 366  * Returns the absolute URL of the current script, without the query
 367  *
 368  * If the resource is "index.php", then it is removed (for better-looking URLs)
 369  *
 370  * @param array $server the $_SERVER array
 371  *
 372  * @return string the absolute URL of the current script, without the query
 373  */
 374 function index_url($server)
 375 {
 376     if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
 377         return rtrim(SHAARLI_ROOT_URL, '/') . '/';
 378     }
 379
 380     $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
 381     if (endsWith($scriptname, 'index.php')) {
 382         $scriptname = substr($scriptname, 0, -9);
 383     }
 384     return server_url($server) . $scriptname;
 385 }
 386
 387 /**
 388  * Returns the absolute URL of the current script, with current route and query
 389  *
 390  * If the resource is "index.php", then it is removed (for better-looking URLs)
 391  *
 392  * @param array $server the $_SERVER array
 393  *
 394  * @return string the absolute URL of the current script, with the query
 395  */
 396 function page_url($server)
 397 {
 398     $scriptname = $server['SCRIPT_NAME'] ?? '';
 399     if (endsWith($scriptname, 'index.php')) {
 400         $scriptname = substr($scriptname, 0, -9);
 401     }
 402
 403     $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
 404     if (! empty($server['QUERY_STRING'])) {
 405         return index_url($server) . $route . '?' . $server['QUERY_STRING'];
 406     }
 407
 408     return index_url($server) . $route;
 409 }
 410
 411 /**
 412  * Retrieve the initial IP forwarded by the reverse proxy.
 413  *
 414  * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
 415  *
 416  * @param array $server     $_SERVER array which contains HTTP headers.
 417  * @param array $trustedIps List of trusted IP from the configuration.
 418  *
 419  * @return string|bool The forwarded IP, or false if none could be extracted.
 420  */
 421 function getIpAddressFromProxy($server, $trustedIps)
 422 {
 423     $forwardedIpHeader = 'HTTP_X_FORWARDED_FOR';
 424     if (empty($server[$forwardedIpHeader])) {
 425         return false;
 426     }
 427
 428     $ips = preg_split('/\s*,\s*/', $server[$forwardedIpHeader]);
 429     $ips = array_diff($ips, $trustedIps);
 430     if (empty($ips)) {
 431         return false;
 432     }
 433
 434     return array_pop($ips);
 435 }
 436
 437
 438 /**
 439  * Return an identifier based on the advertised client IP address(es)
 440  *
 441  * This aims at preventing session hijacking from users behind the same proxy
 442  * by relying on HTTP headers.
 443  *
 444  * See:
 445  * - https://secure.php.net/manual/en/reserved.variables.server.php
 446  * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
 447  * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
 448  * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
 449  *
 450  * @param array $server The $_SERVER array
 451  *
 452  * @return string An identifier based on client IP address information
 453  */
 454 function client_ip_id($server)
 455 {
 456     $ip = $server['REMOTE_ADDR'];
 457
 458     if (isset($server['HTTP_X_FORWARDED_FOR'])) {
 459         $ip = $ip . '_' . $server['HTTP_X_FORWARDED_FOR'];
 460     }
 461     if (isset($server['HTTP_CLIENT_IP'])) {
 462         $ip = $ip . '_' . $server['HTTP_CLIENT_IP'];
 463     }
 464     return $ip;
 465 }
 466
 467
 468 /**
 469  * Returns true if Shaarli's currently browsed in HTTPS.
 470  * Supports reverse proxies (if the headers are correctly set).
 471  *
 472  * @param array $server $_SERVER.
 473  *
 474  * @return bool true if HTTPS, false otherwise.
 475  */
 476 function is_https($server)
 477 {
 478
 479     if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 480         // Keep forwarded port
 481         if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 482             $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 483             $port = trim($ports[0]);
 484         } else {
 485             $port = $server['HTTP_X_FORWARDED_PORT'];
 486         }
 487
 488         if ($port == '443') {
 489             return true;
 490         }
 491     }
 492
 493     return ! empty($server['HTTPS']);
 494 }
 495
 496 /**
 497  * Get cURL callback function for CURLOPT_WRITEFUNCTION
 498  *
 499  * @param string $charset     to extract from the downloaded page (reference)
 500  * @param string $curlGetInfo Optionally overrides curl_getinfo function
 501  *
 502  * @return Closure
 503  */
 504 function get_curl_header_callback(
 505     &$charset,
 506     $curlGetInfo = 'curl_getinfo'
 507 ) {
 508     $isRedirected = false;
 509
 510     return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
 511         $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
 512         $chunkLength = strlen($data);
 513         if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
 514             $isRedirected = true;
 515             return $chunkLength;
 516         }
 517         if (!empty($responseCode) && $responseCode !== 200) {
 518             return false;
 519         }
 520         // After a redirection, the content type will keep the previous request value
 521         // until it finds the next content-type header.
 522         if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
 523             $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
 524         }
 525         if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
 526             return false;
 527         }
 528         if (!empty($contentType) && empty($charset)) {
 529             $charset = header_extract_charset($contentType);
 530         }
 531
 532         return $chunkLength;
 533     };
 534 }
 535
 536 /**
 537  * Get cURL callback function for CURLOPT_WRITEFUNCTION
 538  *
 539  * @param string $charset     to extract from the downloaded page (reference)
 540  * @param string $title       to extract from the downloaded page (reference)
 541  * @param string $description to extract from the downloaded page (reference)
 542  * @param string $keywords    to extract from the downloaded page (reference)
 543  * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 544  * @param string $curlGetInfo Optionally overrides curl_getinfo function
 545  *
 546  * @return Closure
 547  */
 548 function get_curl_download_callback(
 549     &$charset,
 550     &$title,
 551     &$description,
 552     &$keywords,
 553     $retrieveDescription,
 554     $tagsSeparator
 555 ) {
 556     $currentChunk = 0;
 557     $foundChunk = null;
 558
 559     /**
 560      * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
 561      *
 562      * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
 563      * Then we extract the title and the charset and stop the download when it's done.
 564      *
 565      * @param resource $ch   cURL resource
 566      * @param string   $data chunk of data being downloaded
 567      *
 568      * @return int|bool length of $data or false if we need to stop the download
 569      */
 570     return function ($ch, $data) use (
 571         $retrieveDescription,
 572         $tagsSeparator,
 573         &$charset,
 574         &$title,
 575         &$description,
 576         &$keywords,
 577         &$currentChunk,
 578         &$foundChunk
 579     ) {
 580         $chunkLength = strlen($data);
 581         $currentChunk++;
 582
 583         if (empty($charset)) {
 584             $charset = html_extract_charset($data);
 585         }
 586         if (empty($title)) {
 587             $title = html_extract_title($data);
 588             $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
 589         }
 590         if (empty($title)) {
 591             $title = html_extract_tag('title', $data);
 592             $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
 593         }
 594         if ($retrieveDescription && empty($description)) {
 595             $description = html_extract_tag('description', $data);
 596             $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
 597         }
 598         if ($retrieveDescription && empty($keywords)) {
 599             $keywords = html_extract_tag('keywords', $data);
 600             if (! empty($keywords)) {
 601                 $foundChunk = $currentChunk;
 602                 // Keywords use the format tag1, tag2 multiple words, tag
 603                 // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
 604                 $keywords = tags_array2str(array_map(function(string $keyword) use ($tagsSeparator): string {
 605                     return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
 606                 }, tags_str2array($keywords, ',')), $tagsSeparator);
 607             }
 608         }
 609
 610         // We got everything we want, stop the download.
 611         // If we already found either the title, description or keywords,
 612         // it's highly unlikely that we'll found the other metas further than
 613         // in the same chunk of data or the next one. So we also stop the download after that.
 614         if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
 615             && (! $retrieveDescription
 616                 || $foundChunk < $currentChunk
 617                 || (!empty($title) && !empty($description) && !empty($keywords))
 618             )
 619         ) {
 620             return false;
 621         }
 622
 623         return $chunkLength;
 624     };
 625 }