diff options
author | ArthurHoaro <arthur@hoa.ro> | 2020-10-13 12:05:08 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2020-10-13 12:05:08 +0200 |
commit | b6f678a5a1d15acf284ebcec16c905e976671ce1 (patch) | |
tree | 33c7da831482ed79c44896ef19c73c72ada84f2e /application/http | |
parent | b14687036b9b800681197f51fdc47e62f0c88e2e (diff) | |
parent | 1c1520b6b98ab20201bfe15577782a52320339df (diff) | |
download | Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.gz Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.zst Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.zip |
Merge branch 'v0.12' into latest
Diffstat (limited to 'application/http')
-rw-r--r-- | application/http/HttpAccess.php | 39 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 125 | ||||
-rw-r--r-- | application/http/UrlUtils.php | 2 |
3 files changed, 161 insertions, 5 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php new file mode 100644 index 00000000..81d9e076 --- /dev/null +++ b/application/http/HttpAccess.php | |||
@@ -0,0 +1,39 @@ | |||
1 | <?php | ||
2 | |||
3 | declare(strict_types=1); | ||
4 | |||
5 | namespace Shaarli\Http; | ||
6 | |||
7 | /** | ||
8 | * Class HttpAccess | ||
9 | * | ||
10 | * This is mostly an OOP wrapper for the HTTP functions defined in `HttpUtils`. | |
11 | * It is meant to be injected as a dependency through Shaarli's container. | |
12 | * | ||
13 | * @package Shaarli\Http | ||
14 | */ | ||
15 | class HttpAccess | ||
16 | { | ||
17 | public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) | ||
18 | { | ||
19 | return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction); | ||
20 | } | ||
21 | |||
22 | public function getCurlDownloadCallback( | ||
23 | &$charset, | ||
24 | &$title, | ||
25 | &$description, | ||
26 | &$keywords, | ||
27 | $retrieveDescription, | ||
28 | $curlGetInfo = 'curl_getinfo' | ||
29 | ) { | ||
30 | return get_curl_download_callback( | ||
31 | $charset, | ||
32 | $title, | ||
33 | $description, | ||
34 | $keywords, | ||
35 | $retrieveDescription, | ||
36 | $curlGetInfo | ||
37 | ); | ||
38 | } | ||
39 | } | ||
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php index 2ea9195d..9f414073 100644 --- a/application/http/HttpUtils.php +++ b/application/http/HttpUtils.php | |||
@@ -369,7 +369,11 @@ function server_url($server) | |||
369 | */ | 369 | */ |
370 | function index_url($server) | 370 | function index_url($server) |
371 | { | 371 | { |
372 | $scriptname = $server['SCRIPT_NAME']; | 372 | if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) { |
373 | return rtrim(SHAARLI_ROOT_URL, '/') . '/'; | ||
374 | } | ||
375 | |||
376 | $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/'; | ||
373 | if (endsWith($scriptname, 'index.php')) { | 377 | if (endsWith($scriptname, 'index.php')) { |
374 | $scriptname = substr($scriptname, 0, -9); | 378 | $scriptname = substr($scriptname, 0, -9); |
375 | } | 379 | } |
@@ -377,7 +381,7 @@ function index_url($server) | |||
377 | } | 381 | } |
378 | 382 | ||
379 | /** | 383 | /** |
380 | * Returns the absolute URL of the current script, with the query | 384 | * Returns the absolute URL of the current script, with current route and query |
381 | * | 385 | * |
382 | * If the resource is "index.php", then it is removed (for better-looking URLs) | 386 | * If the resource is "index.php", then it is removed (for better-looking URLs) |
383 | * | 387 | * |
@@ -387,10 +391,17 @@ function index_url($server) | |||
387 | */ | 391 | */ |
388 | function page_url($server) | 392 | function page_url($server) |
389 | { | 393 | { |
394 | $scriptname = $server['SCRIPT_NAME'] ?? ''; | ||
395 | if (endsWith($scriptname, 'index.php')) { | ||
396 | $scriptname = substr($scriptname, 0, -9); | ||
397 | } | ||
398 | |||
399 | $route = preg_replace('@^' . $scriptname . '@', '', $server['REQUEST_URI'] ?? ''); | ||
390 | if (! empty($server['QUERY_STRING'])) { | 400 | if (! empty($server['QUERY_STRING'])) { |
391 | return index_url($server).'?'.$server['QUERY_STRING']; | 401 | return index_url($server) . $route . '?' . $server['QUERY_STRING']; |
392 | } | 402 | } |
393 | return index_url($server); | 403 | |
404 | return index_url($server) . $route; | ||
394 | } | 405 | } |
395 | 406 | ||
396 | /** | 407 | /** |
@@ -477,3 +488,109 @@ function is_https($server) | |||
477 | 488 | ||
478 | return ! empty($server['HTTPS']); | 489 | return ! empty($server['HTTPS']); |
479 | } | 490 | } |
491 | |||
492 | /** | ||
493 | * Get cURL callback function for CURLOPT_WRITEFUNCTION | ||
494 | * | ||
495 | * @param string $charset to extract from the downloaded page (reference) | ||
496 | * @param string $title to extract from the downloaded page (reference) | ||
497 | * @param string $description to extract from the downloaded page (reference) | ||
498 | * @param string $keywords to extract from the downloaded page (reference) | ||
499 | * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content | ||
500 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | ||
501 | * | ||
502 | * @return Closure | ||
503 | */ | ||
504 | function get_curl_download_callback( | ||
505 | &$charset, | ||
506 | &$title, | ||
507 | &$description, | ||
508 | &$keywords, | ||
509 | $retrieveDescription, | ||
510 | $curlGetInfo = 'curl_getinfo' | ||
511 | ) { | ||
512 | $isRedirected = false; | ||
513 | $currentChunk = 0; | ||
514 | $foundChunk = null; | ||
515 | |||
516 | /** | ||
517 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). | ||
518 | * | ||
519 | * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html' | |
520 | * Then we extract the title and the charset and stop the download when it's done. | ||
521 | * | ||
522 | * @param resource $ch cURL resource | ||
523 | * @param string $data chunk of data being downloaded | ||
524 | * | ||
525 | * @return int|bool length of $data or false if we need to stop the download | ||
526 | */ | ||
527 | return function (&$ch, $data) use ( | ||
528 | $retrieveDescription, | ||
529 | $curlGetInfo, | ||
530 | &$charset, | ||
531 | &$title, | ||
532 | &$description, | ||
533 | &$keywords, | ||
534 | &$isRedirected, | ||
535 | &$currentChunk, | ||
536 | &$foundChunk | ||
537 | ) { | ||
538 | $currentChunk++; | ||
539 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | ||
540 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | ||
541 | $isRedirected = true; | ||
542 | return strlen($data); | ||
543 | } | ||
544 | if (!empty($responseCode) && $responseCode !== 200) { | ||
545 | return false; | ||
546 | } | ||
547 | // After a redirection, the content type will keep the previous request value | ||
548 | // until it finds the next content-type header. | ||
549 | if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) { | ||
550 | $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE); | ||
551 | } | ||
552 | if (!empty($contentType) && strpos($contentType, 'text/html') === false) { | ||
553 | return false; | ||
554 | } | ||
555 | if (!empty($contentType) && empty($charset)) { | ||
556 | $charset = header_extract_charset($contentType); | ||
557 | } | ||
558 | if (empty($charset)) { | ||
559 | $charset = html_extract_charset($data); | ||
560 | } | ||
561 | if (empty($title)) { | ||
562 | $title = html_extract_title($data); | ||
563 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
564 | } | ||
565 | if ($retrieveDescription && empty($description)) { | ||
566 | $description = html_extract_tag('description', $data); | ||
567 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | ||
568 | } | ||
569 | if ($retrieveDescription && empty($keywords)) { | ||
570 | $keywords = html_extract_tag('keywords', $data); | ||
571 | if (! empty($keywords)) { | ||
572 | $foundChunk = $currentChunk; | ||
573 | // Keywords use the format tag1, tag2 multiple words, tag | ||
574 | // So we format them to match Shaarli's separator and glue multiple words with '-' | ||
575 | $keywords = implode(' ', array_map(function($keyword) { | ||
576 | return implode('-', preg_split('/\s+/', trim($keyword))); | ||
577 | }, explode(',', $keywords))); | ||
578 | } | ||
579 | } | ||
580 | |||
581 | // We got everything we want, stop the download. | ||
582 | // If we already found either the title, description or keywords, | ||
583 | // it's highly unlikely that we'll find the other metas further than | |
584 | // in the same chunk of data or the next one. So we also stop the download after that. | ||
585 | if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | ||
586 | && (! $retrieveDescription | ||
587 | || $foundChunk < $currentChunk | ||
588 | || (!empty($title) && !empty($description) && !empty($keywords)) | ||
589 | ) | ||
590 | ) { | ||
591 | return false; | ||
592 | } | ||
593 | |||
594 | return strlen($data); | ||
595 | }; | ||
596 | } | ||
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php index 4bc84b82..e8d1a283 100644 --- a/application/http/UrlUtils.php +++ b/application/http/UrlUtils.php | |||
@@ -73,7 +73,7 @@ function add_trailing_slash($url) | |||
73 | */ | 73 | */ |
74 | function whitelist_protocols($url, $protocols) | 74 | function whitelist_protocols($url, $protocols) |
75 | { | 75 | { |
76 | if (startsWith($url, '?') || startsWith($url, '/')) { | 76 | if (startsWith($url, '?') || startsWith($url, '/') || startsWith($url, '#')) { |
77 | return $url; | 77 | return $url; |
78 | } | 78 | } |
79 | $protocols = array_merge(['http', 'https'], $protocols); | 79 | $protocols = array_merge(['http', 'https'], $protocols); |