]> git.immae.eu Git - github/shaarli/Shaarli.git/blob - application/http/HttpUtils.php
Merge pull request #1698 from ArthurHoaro/feature/plugins-search-filter
[github/shaarli/Shaarli.git] / application / http / HttpUtils.php
1 <?php
2
3 use Shaarli\Http\Url;
4
/**
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method
 *
 * @param string          $url                URL to get (http://...)
 * @param int             $timeout            network timeout (in seconds)
 * @param int             $maxBytes           maximum downloaded bytes (default: 4 MiB)
 * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
 *                                            (CURLOPT_HEADERFUNCTION).
 *                                            Can be used to add download conditions on the
 *                                            headers (response code, content type, etc.).
 * @param callable|string $curlWriteFunction  Optional callback called during the download
 *                                            (cURL CURLOPT_WRITEFUNCTION).
 *
 * @return array HTTP response headers, downloaded content
 *
 * Output format:
 *  [0] = associative array containing HTTP response headers
 *  [1] = URL content (downloaded data)
 *
 * Special cases:
 *  - invalid URL or cURL failure: [0] holds a single error message and [1] is false;
 *  - host could not be resolved: both entries are false;
 *  - a header and/or write callback consumed the data: [0] is an empty array and
 *    [1] is whatever cURL returned as a string (an empty string otherwise).
 *
 * Example:
 *  list($headers, $data) = get_http_response('http://sebauvage.net/');
 *  if (strpos($headers[0], '200 OK') !== false) {
 *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
 *  } else {
 *      echo 'There was an error: '.htmlspecialchars($headers[0]);
 *  }
 *
 * @see https://secure.php.net/manual/en/ref.curl.php
 * @see https://secure.php.net/manual/en/functions.anonymous.php
 * @see https://secure.php.net/manual/en/function.preg-split.php
 * @see https://secure.php.net/manual/en/function.explode.php
 * @see http://stackoverflow.com/q/17641073
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
function get_http_response(
    $url,
    $timeout = 30,
    $maxBytes = 4194304,
    $curlHeaderFunction = null,
    $curlWriteFunction = null
) {
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();

    if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
        return [[0 => 'Invalid HTTP UrlUtils'], false];
    }

    $userAgent =
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
        . ' Gecko/20100101 Firefox/45.0';
    $acceptLanguage =
        substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
    $maxRedirs = 3;

    if (!function_exists('curl_init')) {
        return get_http_response_fallback(
            $cleanUrl,
            $timeout,
            $maxBytes,
            $userAgent,
            $acceptLanguage,
            $maxRedirs
        );
    }

    $ch = curl_init($cleanUrl);
    if ($ch === false) {
        return [[0 => 'curl_init() error'], false];
    }

    // Headers are only embedded in the response when no header callback consumes them.
    $headersInResponse = !is_callable($curlHeaderFunction);

    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, $headersInResponse);
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
        ['Accept-Language: ' . $acceptLanguage]
    );
    curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024 * 16);
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
    if (!$headersInResponse) {
        curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
    }
    if (is_callable($curlWriteFunction)) {
        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
    }
    curl_setopt(
        $ch,
        CURLOPT_PROGRESSFUNCTION,
        function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
            // Third argument is the number of bytes downloaded so far.
            $downloaded = $arg2;

            // Non-zero return stops downloading
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
    );

    $response = curl_exec($ch);
    $errorNo = curl_errno($ch);
    $errorStr = curl_error($ch);
    $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    if ($response === false) {
        if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
            /*
             * Workaround to match fallback method behaviour
             * Removing this would require updating
             * GetHttpUrlTest::testGetInvalidRemoteUrl()
             */
            return [false, false];
        }
        return [[0 => 'curl_exec() error: ' . $errorStr], false];
    }

    // CURLINFO_HEADER_SIZE counts the received headers even when they are not part of
    // $response, and curl_exec() returns true when a write callback consumed the body.
    // In both cases there is nothing to split: splitting anyway would cut into the body.
    if (!$headersInResponse || !is_string($response)) {
        return [[], is_string($response) ? $response : ''];
    }

    // Formatting output like the fallback method
    $rawHeaders = substr($response, 0, $headSize);

    // Keep only headers from latest redirection
    $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
    $rawHeadersLastRedir = end($rawHeadersArrayRedirs);

    $content = substr($response, $headSize);
    $headers = [];
    foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
        if (empty($line) || ctype_space($line)) {
            continue;
        }
        $splitLine = explode(': ', $line, 2);
        if (count($splitLine) > 1) {
            $key = $splitLine[0];
            $value = $splitLine[1];
            if (array_key_exists($key, $headers)) {
                // Repeated header: collect every value in a list.
                if (!is_array($headers[$key])) {
                    $headers[$key] = [0 => $headers[$key]];
                }
                $headers[$key][] = $value;
            } else {
                $headers[$key] = $value;
            }
        } else {
            // Lines without "name: value" (e.g. the status line) get numeric keys.
            $headers[] = $splitLine[0];
        }
    }

    return [$headers, $content];
}
162
/**
 * GET an HTTP URL to retrieve its content (fallback method)
 *
 * Headers are fetched first through get_redirected_headers(); if the first attempt
 * does not end with a "200 OK" status, a second attempt is made with the
 * "request_fulluri" stream option enabled. The body is then downloaded from the
 * final URL, limited to $maxBytes.
 *
 * Note: this function replaces the default stream context (stream_context_set_default).
 *
 * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
 * @param int    $timeout        network timeout (in seconds)
 * @param int    $maxBytes       maximum downloaded bytes
 * @param string $userAgent      "User-Agent" header
 * @param string $acceptLanguage "Accept-Language" header
 * @param int    $maxRedr        maximum amount of redirections followed
 *
 * @return array HTTP response headers, downloaded content
 *
 * Output format:
 *  [0] = associative array containing HTTP response headers
 *  [1] = URL content (downloaded data)
 *
 * @see http://php.net/manual/en/function.file-get-contents.php
 * @see http://php.net/manual/en/function.stream-context-create.php
 * @see http://php.net/manual/en/function.get-headers.php
 */
function get_http_response_fallback(
    $cleanUrl,
    $timeout,
    $maxBytes,
    $userAgent,
    $acceptLanguage,
    $maxRedr
) {
    $options = [
        'http' => [
            'method' => 'GET',
            'timeout' => $timeout,
            'user_agent' => $userAgent,
            'header' => "Accept: */*\r\n"
                . 'Accept-Language: ' . $acceptLanguage
        ]
    ];

    stream_context_set_default($options);
    list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
    if (! $headers || strpos($headers[0], '200 OK') === false) {
        // Retry with the full URI in the request line.
        $options['http']['request_fulluri'] = true;
        stream_context_set_default($options);
        list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
    }

    if (! $headers) {
        return [$headers, false];
    }

    try {
        // TODO: catch Exception in calling code (thumbnailer)
        $context = stream_context_create($options);
        // Offset must be 0: since PHP 7.1 a negative offset means "seek from the end",
        // which remote streams do not support and which makes the call fail.
        $content = file_get_contents($finalUrl, false, $context, 0, $maxBytes);
    } catch (Exception $exc) {
        return [[0 => 'HTTP Error'], $exc->getMessage()];
    }

    return [$headers, $content];
}
223
/**
 * Retrieve HTTP headers, following n redirections (301, 302, 303, 307 and 308).
 *
 * @param string $url              initial URL to reach.
 * @param int    $redirectionLimit max redirection follow.
 *
 * @return array Two entries:
 *               [0] = HTTP headers as returned by get_headers() (false if it failed),
 *               [1] = the URL those headers were retrieved from.
 */
function get_redirected_headers($url, $redirectionLimit = 3)
{
    $headers = get_headers($url, 1);
    // Normalize a lower-case "location" header so only one key has to be checked below.
    if (!empty($headers['location']) && empty($headers['Location'])) {
        $headers['Location'] = $headers['location'];
    }

    // The status code is read from the status line itself rather than searched
    // anywhere in it, and every redirection status carrying a Location is accepted.
    $isRedirection = !empty($headers[0])
        && preg_match('~^HTTP/\S+\s+30[12378]\b~i', $headers[0]) === 1;

    // Headers found, redirection found, and limit not reached.
    if (
        $redirectionLimit-- > 0
        && !empty($headers)
        && $isRedirection
        && !empty($headers['Location'])
    ) {
        // Several Location headers may have been collected: use the latest one.
        $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
        if ($redirection != $url) {
            $redirection = getAbsoluteUrl($url, $redirection);
            return get_redirected_headers($redirection, $redirectionLimit);
        }
    }

    return [$headers, $url];
}
255
/**
 * Get an absolute URL from a complete one, and another absolute/relative URL.
 *
 * @param string $originalUrl The original complete URL.
 * @param string $newUrl      The new one, absolute or relative.
 *
 * @return string Final URL:
 *                - $newUrl if it was already an absolute URL, or if $originalUrl
 *                  has no usable scheme and host to resolve against;
 *                - if it starts with "//", $newUrl prefixed with $originalUrl's scheme;
 *                - if it starts with "/", $newUrl appended to $originalUrl's scheme://host[:port];
 *                - otherwise, $newUrl appended to the directory of $originalUrl's path.
 */
function getAbsoluteUrl($originalUrl, $newUrl)
{
    $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
    // Already an absolute URL.
    if (!empty($newScheme)) {
        return $newUrl;
    }

    $parts = parse_url($originalUrl);
    if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
        // Nothing reliable to resolve against: leave the URL untouched.
        return $newUrl;
    }

    // Network-path reference ("//host/path"): only the scheme is inherited.
    if (strncmp($newUrl, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $newUrl;
    }

    $final = $parts['scheme'] . '://' . $parts['host'];
    if (!empty($parts['port'])) {
        // The port must be separated from the host by a colon.
        $final .= ':' . $parts['port'];
    }
    $final .= '/';

    if ($newUrl === '' || $newUrl[0] !== '/') {
        // Relative to the directory of the original path (everything up to the last slash).
        $path = $parts['path'] ?? '';
        $lastSlash = strrpos($path, '/');
        if ($lastSlash !== false) {
            $final .= ltrim(substr($path, 0, $lastSlash + 1), '/');
        }
    }

    return $final . ltrim($newUrl, '/');
}
284
/**
 * Returns the server's base URL: scheme://domain.tld[:port]
 *
 * When the X-Forwarded-Proto header is present, the URL is built from the forwarded
 * scheme, port and host (first value of each comma-separated list); otherwise it is
 * built from HTTPS, SERVER_NAME and SERVER_PORT. Standard ports (80 for http,
 * 443 for https) and missing/empty ports are never appended.
 *
 * @param array $server the $_SERVER array
 *
 * @return string the server's base URL
 *
 * @see http://www.ietf.org/rfc/rfc7239.txt
 * @see http://www.ietf.org/rfc/rfc6648.txt
 * @see http://stackoverflow.com/a/3561399
 * @see http://stackoverflow.com/q/452375
 */
function server_url($server)
{
    $scheme = 'http';
    $port = '';

    // Shaarli is served behind a proxy
    if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
        // Forwarded headers may hold a comma-separated list: keep the first value.
        $firstValue = static function ($headerValue) {
            $values = explode(',', (string) $headerValue, 2);

            return trim($values[0]);
        };

        // Keep forwarded scheme
        $scheme = $firstValue($server['HTTP_X_FORWARDED_PROTO']);

        if (isset($server['HTTP_X_FORWARDED_PORT'])) {
            // Keep forwarded port
            $port = $firstValue($server['HTTP_X_FORWARDED_PORT']);

            // This is a workaround for proxies that don't forward the scheme properly.
            // Connecting over port 443 has to be in HTTPS.
            // See https://github.com/shaarli/Shaarli/issues/1022
            if ($port == '443') {
                $scheme = 'https';
            }

            if (
                $port !== ''
                && (
                    ($scheme == 'http' && $port != '80')
                    || ($scheme == 'https' && $port != '443')
                )
            ) {
                $port = ':' . $port;
            } else {
                // Standard or empty port: nothing to append.
                $port = '';
            }
        }

        // Keep forwarded host, or fall back on the server name
        $host = isset($server['HTTP_X_FORWARDED_HOST'])
            ? $firstValue($server['HTTP_X_FORWARDED_HOST'])
            : $server['SERVER_NAME'];

        return $scheme . '://' . $host . $port;
    }

    $serverPort = isset($server['SERVER_PORT']) ? (string) $server['SERVER_PORT'] : '';

    // SSL detection
    if (
        (! empty($server['HTTPS']) && strtolower((string) $server['HTTPS']) == 'on')
        || $serverPort == '443'
    ) {
        $scheme = 'https';
    }

    // Do not append standard port values, nor a dangling ':' when the port is unknown
    if (
        $serverPort !== ''
        && (
            ($scheme == 'http' && $serverPort != '80')
            || ($scheme == 'https' && $serverPort != '443')
        )
    ) {
        $port = ':' . $serverPort;
    }

    return $scheme . '://' . $server['SERVER_NAME'] . $port;
}
371
/**
 * Builds the absolute URL of the running script, query string excluded.
 *
 * A non-null SHAARLI_ROOT_URL constant takes precedence and is returned with exactly
 * one trailing slash. Otherwise the URL is server_url() followed by the script name,
 * from which a trailing "index.php" is dropped (for better-looking URLs).
 *
 * @param array $server the $_SERVER array
 *
 * @return string the absolute URL of the current script, without the query
 */
function index_url($server)
{
    $hasRootOverride = defined('SHAARLI_ROOT_URL') && SHAARLI_ROOT_URL !== null;
    if ($hasRootOverride) {
        return rtrim(SHAARLI_ROOT_URL, '/') . '/';
    }

    // Default to the site root when the script name is missing or empty.
    $scriptPath = '/';
    if (!empty($server['SCRIPT_NAME'])) {
        $scriptPath = $server['SCRIPT_NAME'];
    }

    if (endsWith($scriptPath, 'index.php')) {
        // Strip the 9 characters of "index.php".
        $scriptPath = substr($scriptPath, 0, -9);
    }

    return server_url($server) . $scriptPath;
}
393
/**
 * Returns the absolute URL of the current script, with current route and query
 *
 * If the resource is "index.php", then it is removed (for better-looking URLs)
 *
 * The route is REQUEST_URI with the leading script path removed. When QUERY_STRING
 * is set, any query part already present in the route is dropped before
 * QUERY_STRING is appended, so the query never appears twice.
 *
 * @param array $server the $_SERVER array
 *
 * @return string the absolute URL of the current script, with the query
 */
function page_url($server)
{
    $scriptname = $server['SCRIPT_NAME'] ?? '';
    if (endsWith($scriptname, 'index.php')) {
        $scriptname = substr($scriptname, 0, -9);
    }
    $scriptname = (string) $scriptname;

    // Plain prefix comparison: the script path is user/deployment data and must not
    // be interpreted as a regular expression.
    $route = (string) ($server['REQUEST_URI'] ?? '');
    if ($scriptname !== '' && strncmp($route, $scriptname, strlen($scriptname)) === 0) {
        $route = (string) substr($route, strlen($scriptname));
    }

    if (! empty($server['QUERY_STRING'])) {
        // REQUEST_URI may already carry the query: keep only the path part.
        $queryStart = strpos($route, '?');
        if ($queryStart !== false) {
            $route = substr($route, 0, $queryStart);
        }

        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
    }

    return index_url($server) . $route;
}
417
/**
 * Extract the originating IP address from the X-Forwarded-For header.
 *
 * The header is split on commas (surrounding whitespace ignored), trusted addresses
 * are removed, and the last remaining address is returned.
 *
 * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
 *
 * @param array $server     $_SERVER array which contains HTTP headers.
 * @param array $trustedIps List of trusted IP from the configuration.
 *
 * @return string|bool The forwarded IP, or false if none could be extracted.
 */
function getIpAddressFromProxy($server, $trustedIps)
{
    if (empty($server['HTTP_X_FORWARDED_FOR'])) {
        return false;
    }

    $forwardedChain = preg_split('/\s*,\s*/', $server['HTTP_X_FORWARDED_FOR']);
    $untrusted = array_diff($forwardedChain, $trustedIps);

    // Last untrusted entry of the chain, if any.
    return empty($untrusted) ? false : end($untrusted);
}
443
444
/**
 * Build an identifier from the client IP address(es) advertised in the request.
 *
 * The identifier is REMOTE_ADDR, followed by the X-Forwarded-For and Client-IP header
 * values when they are set, each one joined with an underscore.
 *
 * This aims at preventing session hijacking from users behind the same proxy
 * by relying on HTTP headers.
 *
 * See:
 * - https://secure.php.net/manual/en/reserved.variables.server.php
 * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
 * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
 * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
 *
 * @param array $server The $_SERVER array
 *
 * @return string An identifier based on client IP address information
 */
function client_ip_id($server)
{
    $identifier = $server['REMOTE_ADDR'];

    // Order matters: forwarded addresses first, then the client IP header.
    foreach (['HTTP_X_FORWARDED_FOR', 'HTTP_CLIENT_IP'] as $header) {
        if (isset($server[$header])) {
            $identifier .= '_' . $server[$header];
        }
    }

    return $identifier;
}
473
474
/**
 * Returns true if Shaarli's currently browsed in HTTPS.
 * Supports reverse proxies (if the headers are correctly set).
 *
 * HTTPS is detected when the first X-Forwarded-Port value is 443, or when the HTTPS
 * server variable is non-empty and different from "off" (case-insensitive; some
 * servers set it to "off" for plain HTTP requests).
 *
 * @param array $server $_SERVER.
 *
 * @return bool true if HTTPS, false otherwise.
 */
function is_https($server)
{
    if (isset($server['HTTP_X_FORWARDED_PORT'])) {
        // Keep forwarded port: first value of a possibly comma-separated list
        $ports = explode(',', (string) $server['HTTP_X_FORWARDED_PORT'], 2);
        if (trim($ports[0]) == '443') {
            return true;
        }
    }

    if (empty($server['HTTPS'])) {
        return false;
    }

    // A non-empty value means HTTPS, except the explicit "off" marker.
    return strtolower((string) $server['HTTPS']) !== 'off';
}
502
/**
 * Get cURL callback function for CURLOPT_HEADERFUNCTION
 *
 * The returned closure is called with each chunk of header data and:
 *  - lets redirection responses (301, 302, 303, 307, 308) go through;
 *  - stops the download (returns false) for any other non-200 response code;
 *  - stops the download when the content type is known and is not 'text/html';
 *  - fills $charset from the content type when it is still empty.
 *
 * @param string          $charset     to extract from the downloaded page (reference)
 * @param callable|string $curlGetInfo Optionally overrides curl_getinfo function
 *
 * @return Closure
 */
function get_curl_header_callback(
    &$charset,
    $curlGetInfo = 'curl_getinfo'
) {
    $isRedirected = false;

    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
        $chunkLength = strlen($data);
        // Every redirection status followed by cURL must be let through,
        // otherwise the transfer is aborted before the target page is reached.
        if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308])) {
            $isRedirected = true;
            return $chunkLength;
        }
        if (!empty($responseCode) && $responseCode !== 200) {
            return false;
        }

        $contentType = null;
        // After a redirection, the content type will keep the previous request value
        // until it finds the next content-type header.
        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
        }
        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
            return false;
        }
        if (!empty($contentType) && empty($charset)) {
            $charset = header_extract_charset($contentType);
        }

        return $chunkLength;
    };
}
542
/**
 * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
 * @param string $charset             to extract from the downloaded page (reference)
 * @param string $title               to extract from the downloaded page (reference)
 * @param string $description         to extract from the downloaded page (reference)
 * @param string $keywords            to extract from the downloaded page (reference)
 * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 * @param string $tagsSeparator       Separator used to build the extracted keywords string
 *
 * @return Closure
 */
function get_curl_download_callback(
    &$charset,
    &$title,
    &$description,
    &$keywords,
    $retrieveDescription,
    $tagsSeparator
) {
    // Number of chunks received so far, and number of the last chunk where something was found.
    $currentChunk = 0;
    $foundChunk = null;

    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
     *
     * While downloading the remote page, we extract the charset, the title and
     * (optionally) the description and keywords, and stop the download when it's done.
     *
     * @param resource $ch   cURL resource
     * @param string   $data chunk of data being downloaded
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
    return function (
        $ch,
        $data
    ) use (
        $retrieveDescription,
        $tagsSeparator,
        &$charset,
        &$title,
        &$description,
        &$keywords,
        &$currentChunk,
        &$foundChunk
    ) {
        $chunkLength = strlen($data);
        $currentChunk++;

        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
        if (empty($title)) {
            $title = html_extract_title($data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
        if (empty($title)) {
            // No <title> element: fall back on a "title" meta tag.
            $title = html_extract_tag('title', $data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($description)) {
            $description = html_extract_tag('description', $data);
            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($keywords)) {
            $keywords = html_extract_tag('keywords', $data);
            if (! empty($keywords)) {
                $foundChunk = $currentChunk;
                // Keywords use the format tag1, tag2 multiple words, tag
                // So we split the result with `,`, then if a tag contains the separator we replace it by `-`.
                $keywords = tags_array2str(array_map(function (string $keyword) use ($tagsSeparator): string {
                    return tags_array2str(tags_str2array($keyword, $tagsSeparator), '-');
                }, tags_str2array($keywords, ',')), $tagsSeparator);
            }
        }

        // We got everything we want, stop the download.
        // If we already found either the title, description or keywords,
        // it's highly unlikely that we'll found the other metas further than
        // in the same chunk of data or the next one. So we also stop the download after that.
        // Response code and content type are not available in this callback, so the
        // condition must not depend on them (it could never be true otherwise).
        if (
            !empty($charset) && $foundChunk !== null
            && (! $retrieveDescription
                || $foundChunk < $currentChunk
                || (!empty($title) && !empty($description) && !empty($keywords))
            )
        ) {
            return false;
        }

        return $chunkLength;
    };
}