application/http/HttpUtils.php

   1 <?php
   2
   3 use Shaarli\Http\Url;
   4
   5 /**
   6  * GET an HTTP URL to retrieve its content
   7  * Uses the cURL library or a fallback method
   8  *
   9  * @param string          $url               URL to get (http://...)
  10  * @param int             $timeout           network timeout (in seconds)
  11  * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
  12  * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
  13  *                                           Can be used to add download conditions on the
  14  *                                           headers (response code, content type, etc.).
  15  *
  16  * @return array HTTP response headers, downloaded content
  17  *
  18  * Output format:
  19  *  [0] = associative array containing HTTP response headers
  20  *  [1] = URL content (downloaded data)
  21  *
  22  * Example:
  23  *  list($headers, $data) = get_http_response('http://sebauvage.net/');
  24  *  if (strpos($headers[0], '200 OK') !== false) {
  25  *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
  26  *  } else {
  27  *      echo 'There was an error: '.htmlspecialchars($headers[0]);
  28  *  }
  29  *
  30  * @see https://secure.php.net/manual/en/ref.curl.php
  31  * @see https://secure.php.net/manual/en/functions.anonymous.php
  32  * @see https://secure.php.net/manual/en/function.preg-split.php
  33  * @see https://secure.php.net/manual/en/function.explode.php
  34  * @see http://stackoverflow.com/q/17641073
  35  * @see http://stackoverflow.com/q/9183178
  36  * @see http://stackoverflow.com/q/1462720
  37  */
  38 function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
  39 {
  40     $urlObj = new Url($url);
  41     $cleanUrl = $urlObj->idnToAscii();
  42
  43     if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
  44         return array(array(0 => 'Invalid HTTP UrlUtils'), false);
  45     }
  46
  47     $userAgent =
  48         'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
  49         . ' Gecko/20100101 Firefox/45.0';
  50     $acceptLanguage =
  51         substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
  52     $maxRedirs = 3;
  53
  54     if (!function_exists('curl_init')) {
  55         return get_http_response_fallback(
  56             $cleanUrl,
  57             $timeout,
  58             $maxBytes,
  59             $userAgent,
  60             $acceptLanguage,
  61             $maxRedirs
  62         );
  63     }
  64
  65     $ch = curl_init($cleanUrl);
  66     if ($ch === false) {
  67         return array(array(0 => 'curl_init() error'), false);
  68     }
  69
  70     // General cURL settings
  71     curl_setopt($ch, CURLOPT_AUTOREFERER, true);
  72     curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
  73     curl_setopt($ch, CURLOPT_HEADER, true);
  74     curl_setopt(
  75         $ch,
  76         CURLOPT_HTTPHEADER,
  77         array('Accept-Language: ' . $acceptLanguage)
  78     );
  79     curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
  80     curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
  81     curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
  82     curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
  83
  84     if (is_callable($curlWriteFunction)) {
  85         curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
  86     }
  87
  88     // Max download size management
  89     curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
  90     curl_setopt($ch, CURLOPT_NOPROGRESS, false);
  91     curl_setopt(
  92         $ch,
  93         CURLOPT_PROGRESSFUNCTION,
  94         function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) {
  95             if (version_compare(phpversion(), '5.5', '<')) {
  96                 // PHP version lower than 5.5
  97                 // Callback has 4 arguments
  98                 $downloaded = $arg1;
  99             } else {
 100                 // Callback has 5 arguments
 101                 $downloaded = $arg2;
 102             }
 103             // Non-zero return stops downloading
 104             return ($downloaded > $maxBytes) ? 1 : 0;
 105         }
 106     );
 107
 108     $response = curl_exec($ch);
 109     $errorNo = curl_errno($ch);
 110     $errorStr = curl_error($ch);
 111     $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
 112     curl_close($ch);
 113
 114     if ($response === false) {
 115         if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
 116             /*
 117              * Workaround to match fallback method behaviour
 118              * Removing this would require updating
 119              * GetHttpUrlTest::testGetInvalidRemoteUrl()
 120              */
 121             return array(false, false);
 122         }
 123         return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
 124     }
 125
 126     // Formatting output like the fallback method
 127     $rawHeaders = substr($response, 0, $headSize);
 128
 129     // Keep only headers from latest redirection
 130     $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
 131     $rawHeadersLastRedir = end($rawHeadersArrayRedirs);
 132
 133     $content = substr($response, $headSize);
 134     $headers = array();
 135     foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
 136         if (empty($line) || ctype_space($line)) {
 137             continue;
 138         }
 139         $splitLine = explode(': ', $line, 2);
 140         if (count($splitLine) > 1) {
 141             $key = $splitLine[0];
 142             $value = $splitLine[1];
 143             if (array_key_exists($key, $headers)) {
 144                 if (!is_array($headers[$key])) {
 145                     $headers[$key] = array(0 => $headers[$key]);
 146                 }
 147                 $headers[$key][] = $value;
 148             } else {
 149                 $headers[$key] = $value;
 150             }
 151         } else {
 152             $headers[] = $splitLine[0];
 153         }
 154     }
 155
 156     return array($headers, $content);
 157 }
 158
 159 /**
 160  * GET an HTTP URL to retrieve its content (fallback method)
 161  *
 162  * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
 163  * @param int    $timeout        network timeout (in seconds)
 164  * @param int    $maxBytes       maximum downloaded bytes
 165  * @param string $userAgent      "User-Agent" header
 166  * @param string $acceptLanguage "Accept-Language" header
 167  * @param int    $maxRedr        maximum amount of redirections followed
 168  *
 169  * @return array HTTP response headers, downloaded content
 170  *
 171  * Output format:
 172  *  [0] = associative array containing HTTP response headers
 173  *  [1] = URL content (downloaded data)
 174  *
 175  * @see http://php.net/manual/en/function.file-get-contents.php
 176  * @see http://php.net/manual/en/function.stream-context-create.php
 177  * @see http://php.net/manual/en/function.get-headers.php
 178  */
 179 function get_http_response_fallback(
 180     $cleanUrl,
 181     $timeout,
 182     $maxBytes,
 183     $userAgent,
 184     $acceptLanguage,
 185     $maxRedr
 186 ) {
 187     $options = array(
 188         'http' => array(
 189             'method' => 'GET',
 190             'timeout' => $timeout,
 191             'user_agent' => $userAgent,
 192             'header' => "Accept: */*\r\n"
 193                 . 'Accept-Language: ' . $acceptLanguage
 194         )
 195     );
 196
 197     stream_context_set_default($options);
 198     list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 199     if (! $headers || strpos($headers[0], '200 OK') === false) {
 200         $options['http']['request_fulluri'] = true;
 201         stream_context_set_default($options);
 202         list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
 203     }
 204
 205     if (! $headers) {
 206         return array($headers, false);
 207     }
 208
 209     try {
 210         // TODO: catch Exception in calling code (thumbnailer)
 211         $context = stream_context_create($options);
 212         $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
 213     } catch (Exception $exc) {
 214         return array(array(0 => 'HTTP Error'), $exc->getMessage());
 215     }
 216
 217     return array($headers, $content);
 218 }
 219
 220 /**
 221  * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
 222  *
 223  * @param string $url              initial URL to reach.
 224  * @param int    $redirectionLimit max redirection follow.
 225  *
 226  * @return array HTTP headers, or false if it failed.
 227  */
 228 function get_redirected_headers($url, $redirectionLimit = 3)
 229 {
 230     $headers = get_headers($url, 1);
 231     if (!empty($headers['location']) && empty($headers['Location'])) {
 232         $headers['Location'] = $headers['location'];
 233     }
 234
 235     // Headers found, redirection found, and limit not reached.
 236     if ($redirectionLimit-- > 0
 237         && !empty($headers)
 238         && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
 239         && !empty($headers['Location'])) {
 240         $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
 241         if ($redirection != $url) {
 242             $redirection = getAbsoluteUrl($url, $redirection);
 243             return get_redirected_headers($redirection, $redirectionLimit);
 244         }
 245     }
 246
 247     return array($headers, $url);
 248 }
 249
 250 /**
 251  * Get an absolute URL from a complete one, and another absolute/relative URL.
 252  *
 253  * @param string $originalUrl The original complete URL.
 254  * @param string $newUrl      The new one, absolute or relative.
 255  *
 256  * @return string Final URL:
 257  *   - $newUrl if it was already an absolute URL.
 258  *   - if it was relative, absolute URL from $originalUrl path.
 259  */
 260 function getAbsoluteUrl($originalUrl, $newUrl)
 261 {
 262     $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
 263     // Already an absolute URL.
 264     if (!empty($newScheme)) {
 265         return $newUrl;
 266     }
 267
 268     $parts = parse_url($originalUrl);
 269     $final = $parts['scheme'] .'://'. $parts['host'];
 270     $final .= (!empty($parts['port'])) ? $parts['port'] : '';
 271     $final .= '/';
 272     if ($newUrl[0] != '/') {
 273         $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/'));
 274     }
 275     $final .= ltrim($newUrl, '/');
 276     return $final;
 277 }
 278
 279 /**
 280  * Returns the server's base URL: scheme://domain.tld[:port]
 281  *
 282  * @param array $server the $_SERVER array
 283  *
 284  * @return string the server's base URL
 285  *
 286  * @see http://www.ietf.org/rfc/rfc7239.txt
 287  * @see http://www.ietf.org/rfc/rfc6648.txt
 288  * @see http://stackoverflow.com/a/3561399
 289  * @see http://stackoverflow.com/q/452375
 290  */
 291 function server_url($server)
 292 {
 293     $scheme = 'http';
 294     $port = '';
 295
 296     // Shaarli is served behind a proxy
 297     if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
 298         // Keep forwarded scheme
 299         if (strpos($server['HTTP_X_FORWARDED_PROTO'], ',') !== false) {
 300             $schemes = explode(',', $server['HTTP_X_FORWARDED_PROTO']);
 301             $scheme = trim($schemes[0]);
 302         } else {
 303             $scheme = $server['HTTP_X_FORWARDED_PROTO'];
 304         }
 305
 306         if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 307             // Keep forwarded port
 308             if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 309                 $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 310                 $port = trim($ports[0]);
 311             } else {
 312                 $port = $server['HTTP_X_FORWARDED_PORT'];
 313             }
 314
 315             // This is a workaround for proxies that don't forward the scheme properly.
 316             // Connecting over port 443 has to be in HTTPS.
 317             // See https://github.com/shaarli/Shaarli/issues/1022
 318             if ($port == '443') {
 319                 $scheme = 'https';
 320             }
 321
 322             if (($scheme == 'http' && $port != '80')
 323                 || ($scheme == 'https' && $port != '443')
 324             ) {
 325                 $port = ':' . $port;
 326             } else {
 327                 $port = '';
 328             }
 329         }
 330
 331         if (isset($server['HTTP_X_FORWARDED_HOST'])) {
 332             // Keep forwarded host
 333             if (strpos($server['HTTP_X_FORWARDED_HOST'], ',') !== false) {
 334                 $hosts = explode(',', $server['HTTP_X_FORWARDED_HOST']);
 335                 $host = trim($hosts[0]);
 336             } else {
 337                 $host = $server['HTTP_X_FORWARDED_HOST'];
 338             }
 339         } else {
 340             $host = $server['SERVER_NAME'];
 341         }
 342
 343         return $scheme.'://'.$host.$port;
 344     }
 345
 346     // SSL detection
 347     if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
 348         || (isset($server['SERVER_PORT']) && $server['SERVER_PORT'] == '443')) {
 349         $scheme = 'https';
 350     }
 351
 352     // Do not append standard port values
 353     if (($scheme == 'http' && $server['SERVER_PORT'] != '80')
 354         || ($scheme == 'https' && $server['SERVER_PORT'] != '443')) {
 355         $port = ':'.$server['SERVER_PORT'];
 356     }
 357
 358     return $scheme.'://'.$server['SERVER_NAME'].$port;
 359 }
 360
 361 /**
 362  * Returns the absolute URL of the current script, without the query
 363  *
 364  * If the resource is "index.php", then it is removed (for better-looking URLs)
 365  *
 366  * @param array $server the $_SERVER array
 367  *
 368  * @return string the absolute URL of the current script, without the query
 369  */
 370 function index_url($server)
 371 {
 372     if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
 373         return rtrim(SHAARLI_ROOT_URL, '/') . '/';
 374     }
 375
 376     $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
 377     if (endsWith($scriptname, 'index.php')) {
 378         $scriptname = substr($scriptname, 0, -9);
 379     }
 380     return server_url($server) . $scriptname;
 381 }
 382
 383 /**
 384  * Returns the absolute URL of the current script, with current route and query
 385  *
 386  * If the resource is "index.php", then it is removed (for better-looking URLs)
 387  *
 388  * @param array $server the $_SERVER array
 389  *
 390  * @return string the absolute URL of the current script, with the query
 391  */
 392 function page_url($server)
 393 {
 394     $scriptname = $server['SCRIPT_NAME'] ?? '';
 395     if (endsWith($scriptname, 'index.php')) {
 396         $scriptname = substr($scriptname, 0, -9);
 397     }
 398
 399     $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? '');
 400     if (! empty($server['QUERY_STRING'])) {
 401         return index_url($server) . $route . '?' . $server['QUERY_STRING'];
 402     }
 403
 404     return index_url($server) . $route;
 405 }
 406
 407 /**
 408  * Retrieve the initial IP forwarded by the reverse proxy.
 409  *
 410  * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
 411  *
 412  * @param array $server     $_SERVER array which contains HTTP headers.
 413  * @param array $trustedIps List of trusted IP from the configuration.
 414  *
 415  * @return string|bool The forwarded IP, or false if none could be extracted.
 416  */
 417 function getIpAddressFromProxy($server, $trustedIps)
 418 {
 419     $forwardedIpHeader = 'HTTP_X_FORWARDED_FOR';
 420     if (empty($server[$forwardedIpHeader])) {
 421         return false;
 422     }
 423
 424     $ips = preg_split('/\s*,\s*/', $server[$forwardedIpHeader]);
 425     $ips = array_diff($ips, $trustedIps);
 426     if (empty($ips)) {
 427         return false;
 428     }
 429
 430     return array_pop($ips);
 431 }
 432
 433
 434 /**
 435  * Return an identifier based on the advertised client IP address(es)
 436  *
 437  * This aims at preventing session hijacking from users behind the same proxy
 438  * by relying on HTTP headers.
 439  *
 440  * See:
 441  * - https://secure.php.net/manual/en/reserved.variables.server.php
 442  * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
 443  * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
 444  * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
 445  *
 446  * @param array $server The $_SERVER array
 447  *
 448  * @return string An identifier based on client IP address information
 449  */
 450 function client_ip_id($server)
 451 {
 452     $ip = $server['REMOTE_ADDR'];
 453
 454     if (isset($server['HTTP_X_FORWARDED_FOR'])) {
 455         $ip = $ip . '_' . $server['HTTP_X_FORWARDED_FOR'];
 456     }
 457     if (isset($server['HTTP_CLIENT_IP'])) {
 458         $ip = $ip . '_' . $server['HTTP_CLIENT_IP'];
 459     }
 460     return $ip;
 461 }
 462
 463
 464 /**
 465  * Returns true if Shaarli's currently browsed in HTTPS.
 466  * Supports reverse proxies (if the headers are correctly set).
 467  *
 468  * @param array $server $_SERVER.
 469  *
 470  * @return bool true if HTTPS, false otherwise.
 471  */
 472 function is_https($server)
 473 {
 474
 475     if (isset($server['HTTP_X_FORWARDED_PORT'])) {
 476         // Keep forwarded port
 477         if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
 478             $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
 479             $port = trim($ports[0]);
 480         } else {
 481             $port = $server['HTTP_X_FORWARDED_PORT'];
 482         }
 483
 484         if ($port == '443') {
 485             return true;
 486         }
 487     }
 488
 489     return ! empty($server['HTTPS']);
 490 }
 491
 492 /**
 493  * Get cURL callback function for CURLOPT_WRITEFUNCTION
 494  *
 495  * @param string $charset     to extract from the downloaded page (reference)
 496  * @param string $title       to extract from the downloaded page (reference)
 497  * @param string $description to extract from the downloaded page (reference)
 498  * @param string $keywords    to extract from the downloaded page (reference)
 499  * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 500  * @param string $curlGetInfo Optionally overrides curl_getinfo function
 501  *
 502  * @return Closure
 503  */
 504 function get_curl_download_callback(
 505     &$charset,
 506     &$title,
 507     &$description,
 508     &$keywords,
 509     $retrieveDescription,
 510     $curlGetInfo = 'curl_getinfo'
 511 ) {
 512     $isRedirected = false;
 513     $currentChunk = 0;
 514     $foundChunk = null;
 515
 516     /**
 517      * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
 518      *
 519      * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
 520      * Then we extract the title and the charset and stop the download when it's done.
 521      *
 522      * @param resource $ch   cURL resource
 523      * @param string   $data chunk of data being downloaded
 524      *
 525      * @return int|bool length of $data or false if we need to stop the download
 526      */
 527     return function (&$ch, $data) use (
 528         $retrieveDescription,
 529         $curlGetInfo,
 530         &$charset,
 531         &$title,
 532         &$description,
 533         &$keywords,
 534         &$isRedirected,
 535         &$currentChunk,
 536         &$foundChunk
 537     ) {
 538         $currentChunk++;
 539         $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
 540         if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
 541             $isRedirected = true;
 542             return strlen($data);
 543         }
 544         if (!empty($responseCode) && $responseCode !== 200) {
 545             return false;
 546         }
 547         // After a redirection, the content type will keep the previous request value
 548         // until it finds the next content-type header.
 549         if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
 550             $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
 551         }
 552         if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
 553             return false;
 554         }
 555         if (!empty($contentType) && empty($charset)) {
 556             $charset = header_extract_charset($contentType);
 557         }
 558         if (empty($charset)) {
 559             $charset = html_extract_charset($data);
 560         }
 561         if (empty($title)) {
 562             $title = html_extract_title($data);
 563             $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
 564         }
 565         if ($retrieveDescription && empty($description)) {
 566             $description = html_extract_tag('description', $data);
 567             $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
 568         }
 569         if ($retrieveDescription && empty($keywords)) {
 570             $keywords = html_extract_tag('keywords', $data);
 571             if (! empty($keywords)) {
 572                 $foundChunk = $currentChunk;
 573                 // Keywords use the format tag1, tag2 multiple words, tag
 574                 // So we format them to match Shaarli's separator and glue multiple words with '-'
 575                 $keywords = implode(' ', array_map(function($keyword) {
 576                     return implode('-', preg_split('/\s+/', trim($keyword)));
 577                 }, explode(',', $keywords)));
 578             }
 579         }
 580
 581         // We got everything we want, stop the download.
 582         // If we already found either the title, description or keywords,
 583         // it's highly unlikely that we'll found the other metas further than
 584         // in the same chunk of data or the next one. So we also stop the download after that.
 585         if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
 586             && (! $retrieveDescription
 587                 || $foundChunk < $currentChunk
 588                 || (!empty($title) && !empty($description) && !empty($keywords))
 589             )
 590         ) {
 591             return false;
 592         }
 593
 594         return strlen($data);
 595     };
 596 }