aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/http
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2020-10-13 12:05:08 +0200
committerArthurHoaro <arthur@hoa.ro>2020-10-13 12:05:08 +0200
commitb6f678a5a1d15acf284ebcec16c905e976671ce1 (patch)
tree33c7da831482ed79c44896ef19c73c72ada84f2e /application/http
parentb14687036b9b800681197f51fdc47e62f0c88e2e (diff)
parent1c1520b6b98ab20201bfe15577782a52320339df (diff)
downloadShaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.gz
Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.tar.zst
Shaarli-b6f678a5a1d15acf284ebcec16c905e976671ce1.zip
Merge branch 'v0.12' into latest
Diffstat (limited to 'application/http')
-rw-r--r--application/http/HttpAccess.php39
-rw-r--r--application/http/HttpUtils.php125
-rw-r--r--application/http/UrlUtils.php2
3 files changed, 161 insertions, 5 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
new file mode 100644
index 00000000..81d9e076
--- /dev/null
+++ b/application/http/HttpAccess.php
@@ -0,0 +1,39 @@
1<?php
2
3declare(strict_types=1);
4
5namespace Shaarli\Http;
6
/**
 * Class HttpAccess
 *
 * Object-oriented facade over the procedural HTTP helpers of `HttpUtils`.
 * Registering it in Shaarli's container lets HTTP access be injected as a dependency
 * instead of being called through global functions.
 *
 * @package Shaarli\Http
 */
class HttpAccess
{
    /**
     * Forwards to the global `get_http_response()` helper and returns its result untouched.
     *
     * @param string        $url               URL to fetch
     * @param int           $timeout           forwarded as-is; defaults to 30 (presumably seconds)
     * @param int           $maxBytes          forwarded as-is; defaults to 4194304 (4 MiB if this is a byte count)
     * @param callable|null $curlWriteFunction optional write callback handed to the helper
     *
     * @return mixed whatever `get_http_response()` returns
     */
    public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
    {
        $response = get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);

        return $response;
    }

    /**
     * Forwards to the global `get_curl_download_callback()` helper.
     *
     * The first four arguments are references: they are passed on by reference so that
     * the returned callback can fill them in for the caller.
     *
     * @param mixed  $charset             (reference) handed to the helper by reference
     * @param mixed  $title               (reference) handed to the helper by reference
     * @param mixed  $description         (reference) handed to the helper by reference
     * @param mixed  $keywords            (reference) handed to the helper by reference
     * @param bool   $retrieveDescription forwarded as-is to the helper
     * @param string $curlGetInfo         name of the function used in place of `curl_getinfo`
     *
     * @return mixed whatever `get_curl_download_callback()` returns
     */
    public function getCurlDownloadCallback(
        &$charset,
        &$title,
        &$description,
        &$keywords,
        $retrieveDescription,
        $curlGetInfo = 'curl_getinfo'
    ) {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $description,
            $keywords,
            $retrieveDescription,
            $curlGetInfo
        );

        return $callback;
    }
}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 2ea9195d..9f414073 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -369,7 +369,11 @@ function server_url($server)
369 */ 369 */
370function index_url($server) 370function index_url($server)
371{ 371{
372 $scriptname = $server['SCRIPT_NAME']; 372 if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
373 return rtrim(SHAARLI_ROOT_URL, '/') . '/';
374 }
375
376 $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
373 if (endsWith($scriptname, 'index.php')) { 377 if (endsWith($scriptname, 'index.php')) {
374 $scriptname = substr($scriptname, 0, -9); 378 $scriptname = substr($scriptname, 0, -9);
375 } 379 }
@@ -377,7 +381,7 @@ function index_url($server)
377} 381}
378 382
379/** 383/**
380 * Returns the absolute URL of the current script, with the query 384 * Returns the absolute URL of the current script, with current route and query
381 * 385 *
382 * If the resource is "index.php", then it is removed (for better-looking URLs) 386 * If the resource is "index.php", then it is removed (for better-looking URLs)
383 * 387 *
@@ -387,10 +391,17 @@ function index_url($server)
387 */ 391 */
function page_url($server)
{
    // Directory part of the script path: a trailing "index.php" (9 characters) is dropped
    // for better-looking URLs.
    $scriptname = $server['SCRIPT_NAME'] ?? '';
    if (endsWith($scriptname, 'index.php')) {
        $scriptname = substr($scriptname, 0, -9);
    }

    // REQUEST_URI usually already carries the query string. Strip it here, otherwise the
    // query would appear twice once QUERY_STRING is appended below.
    $requestUri = (string) ($server['REQUEST_URI'] ?? '');
    $queryStart = strpos($requestUri, '?');
    if ($queryStart !== false) {
        $requestUri = (string) substr($requestUri, 0, $queryStart);
    }

    // The route is the request path minus the leading script directory. The script path is
    // quoted so that regex metacharacters (or the '@' delimiter) in it are matched literally.
    $route = (string) preg_replace('@^' . preg_quote($scriptname, '@') . '@', '', $requestUri);

    if (! empty($server['QUERY_STRING'])) {
        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
    }

    return index_url($server) . $route;
}
395 406
396/** 407/**
@@ -477,3 +488,109 @@ function is_https($server)
477 488
478 return ! empty($server['HTTPS']); 489 return ! empty($server['HTTPS']);
479} 490}
491
/**
 * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
 * The four reference parameters are captured by reference in the returned closure and are
 * filled in while the remote page is being downloaded.
 *
 * @param string $charset             to extract from the downloaded page (reference)
 * @param string $title               to extract from the downloaded page (reference)
 * @param string $description         to extract from the downloaded page (reference)
 * @param string $keywords            to extract from the downloaded page (reference)
 * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 * @param string $curlGetInfo         Optionally overrides curl_getinfo function
 *
 * @return Closure
 */
function get_curl_download_callback(
    &$charset,
    &$title,
    &$description,
    &$keywords,
    $retrieveDescription,
    $curlGetInfo = 'curl_getinfo'
) {
    // State shared between successive invocations of the callback (one call per chunk).
    $isRedirected = false;
    $currentChunk = 0;
    $foundChunk = null;

    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
     *
     * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
     * Then we extract the title and the charset and stop the download when it's done.
     *
     * The handle is received by value: cURL does not pass it by reference, and a by-reference
     * parameter would also prevent calling the closure with a non-variable argument.
     *
     * @param resource $ch   cURL resource
     * @param string   $data chunk of data being downloaded
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
    return function ($ch, $data) use (
        $retrieveDescription,
        $curlGetInfo,
        &$charset,
        &$title,
        &$description,
        &$keywords,
        &$isRedirected,
        &$currentChunk,
        &$foundChunk
    ) {
        $currentChunk++;

        // Normalised to int so the strict comparisons below behave the same for 200 and '200'.
        $responseCode = (int) $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);

        // Every redirect status that carries a Location to follow: keep downloading
        // until the final response shows up.
        if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308], true)) {
            $isRedirected = true;
            return strlen($data);
        }
        if (!empty($responseCode) && $responseCode !== 200) {
            return false;
        }

        // After a redirection, the content type will keep the previous request value
        // until it finds the next content-type header.
        $contentType = null;
        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
        }
        // Media types are case-insensitive, hence stripos().
        if (!empty($contentType) && stripos($contentType, 'text/html') === false) {
            return false;
        }

        // The charset announced in the HTTP header wins over the one declared in the markup.
        if (!empty($contentType) && empty($charset)) {
            $charset = header_extract_charset($contentType);
        }
        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
        if (empty($title)) {
            $title = html_extract_title($data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($description)) {
            $description = html_extract_tag('description', $data);
            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($keywords)) {
            $keywords = html_extract_tag('keywords', $data);
            if (! empty($keywords)) {
                $foundChunk = $currentChunk;
                // Keywords use the format tag1, tag2 multiple words, tag
                // So we format them to match Shaarli's separator and glue multiple words with '-'
                $keywords = implode(' ', array_map(function ($keyword) {
                    return implode('-', preg_split('/\s+/', trim($keyword)));
                }, explode(',', $keywords)));
            }
        }

        // We got everything we want, stop the download.
        // If we already found either the title, description or keywords,
        // it's highly unlikely that we'll find the other metas further than
        // in the same chunk of data or the next one. So we also stop the download after that.
        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
            && (! $retrieveDescription
                || $foundChunk < $currentChunk
                || (!empty($title) && !empty($description) && !empty($keywords))
            )
        ) {
            return false;
        }

        return strlen($data);
    };
}
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php
index 4bc84b82..e8d1a283 100644
--- a/application/http/UrlUtils.php
+++ b/application/http/UrlUtils.php
@@ -73,7 +73,7 @@ function add_trailing_slash($url)
73 */ 73 */
74function whitelist_protocols($url, $protocols) 74function whitelist_protocols($url, $protocols)
75{ 75{
76 if (startsWith($url, '?') || startsWith($url, '/')) { 76 if (startsWith($url, '?') || startsWith($url, '/') || startsWith($url, '#')) {
77 return $url; 77 return $url;
78 } 78 }
79 $protocols = array_merge(['http', 'https'], $protocols); 79 $protocols = array_merge(['http', 'https'], $protocols);