aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/http
diff options
context:
space:
mode:
Diffstat (limited to 'application/http')
-rw-r--r--application/http/HttpAccess.php47
-rw-r--r--application/http/HttpUtils.php196
-rw-r--r--application/http/MetadataRetriever.php69
3 files changed, 286 insertions, 26 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
new file mode 100644
index 00000000..646a5264
--- /dev/null
+++ b/application/http/HttpAccess.php
@@ -0,0 +1,47 @@
1<?php
2
3declare(strict_types=1);
4
5namespace Shaarli\Http;
6
7/**
8 * Class HttpAccess
9 *
10 * This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`.
11 * It is used as dependency injection in Shaarli's container.
12 *
13 * @package Shaarli\Http
14 */
class HttpAccess
{
    /**
     * Object-oriented entry point for the global `get_http_response()` helper.
     *
     * All arguments are forwarded unchanged, in the same order.
     *
     * @param string               $url                URL to get
     * @param int                  $timeout            network timeout (in seconds)
     * @param int                  $maxBytes           maximum downloaded bytes
     * @param callable|string|null $curlHeaderFunction optional CURLOPT_HEADERFUNCTION callback
     * @param callable|string|null $curlWriteFunction  optional CURLOPT_WRITEFUNCTION callback
     *
     * @return array whatever `get_http_response()` returns (HTTP response headers, downloaded content)
     */
    public function getHttpResponse(
        $url,
        $timeout = 30,
        $maxBytes = 4194304,
        $curlHeaderFunction = null,
        $curlWriteFunction = null
    ) {
        $response = get_http_response($url, $timeout, $maxBytes, $curlHeaderFunction, $curlWriteFunction);

        return $response;
    }

    /**
     * Object-oriented entry point for the global `get_curl_download_callback()` helper.
     *
     * The first four arguments are taken by reference and handed over as such, so the
     * callback built by the helper writes straight into the caller's variables.
     *
     * @param string $charset             (reference) forwarded to the helper
     * @param string $title               (reference) forwarded to the helper
     * @param string $description         (reference) forwarded to the helper
     * @param string $keywords            (reference) forwarded to the helper
     * @param bool   $retrieveDescription forwarded to the helper
     *
     * @return \Closure the callback built by `get_curl_download_callback()`
     */
    public function getCurlDownloadCallback(
        &$charset,
        &$title,
        &$description,
        &$keywords,
        $retrieveDescription
    ) {
        return get_curl_download_callback($charset, $title, $description, $keywords, $retrieveDescription);
    }

    /**
     * Object-oriented entry point for the global `get_curl_header_callback()` helper.
     *
     * @param string          $charset     (reference) forwarded to the helper
     * @param callable|string $curlGetInfo function used in place of `curl_getinfo`
     *
     * @return \Closure the callback built by `get_curl_header_callback()`
     */
    public function getCurlHeaderCallback(&$charset, $curlGetInfo = 'curl_getinfo')
    {
        $headerCallback = get_curl_header_callback($charset, $curlGetInfo);

        return $headerCallback;
    }
}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index 2ea9195d..28c12969 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -6,12 +6,14 @@ use Shaarli\Http\Url;
6 * GET an HTTP URL to retrieve its content 6 * GET an HTTP URL to retrieve its content
7 * Uses the cURL library or a fallback method 7 * Uses the cURL library or a fallback method
8 * 8 *
9 * @param string $url URL to get (http://...) 9 * @param string $url URL to get (http://...)
10 * @param int $timeout network timeout (in seconds) 10 * @param int $timeout network timeout (in seconds)
11 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) 11 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
12 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION). 12 * @param callable|string $curlHeaderFunction Optional callback called during the download of headers
13 * Can be used to add download conditions on the 13 * (CURLOPT_HEADERFUNCTION)
14 * headers (response code, content type, etc.). 14 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
15 * Can be used to add download conditions on the
16 * headers (response code, content type, etc.).
15 * 17 *
16 * @return array HTTP response headers, downloaded content 18 * @return array HTTP response headers, downloaded content
17 * 19 *
@@ -35,8 +37,13 @@ use Shaarli\Http\Url;
35 * @see http://stackoverflow.com/q/9183178 37 * @see http://stackoverflow.com/q/9183178
36 * @see http://stackoverflow.com/q/1462720 38 * @see http://stackoverflow.com/q/1462720
37 */ 39 */
38function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null) 40function get_http_response(
39{ 41 $url,
42 $timeout = 30,
43 $maxBytes = 4194304,
44 $curlHeaderFunction = null,
45 $curlWriteFunction = null
46) {
40 $urlObj = new Url($url); 47 $urlObj = new Url($url);
41 $cleanUrl = $urlObj->idnToAscii(); 48 $cleanUrl = $urlObj->idnToAscii();
42 49
@@ -70,7 +77,8 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
70 // General cURL settings 77 // General cURL settings
71 curl_setopt($ch, CURLOPT_AUTOREFERER, true); 78 curl_setopt($ch, CURLOPT_AUTOREFERER, true);
72 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 79 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
73 curl_setopt($ch, CURLOPT_HEADER, true); 80 // Default header download if the $curlHeaderFunction is not defined
81 curl_setopt($ch, CURLOPT_HEADER, !is_callable($curlHeaderFunction));
74 curl_setopt( 82 curl_setopt(
75 $ch, 83 $ch,
76 CURLOPT_HTTPHEADER, 84 CURLOPT_HTTPHEADER,
@@ -81,25 +89,21 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteF
81 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); 89 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
82 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); 90 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
83 91
84 if (is_callable($curlWriteFunction)) {
85 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
86 }
87
88 // Max download size management 92 // Max download size management
89 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); 93 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
90 curl_setopt($ch, CURLOPT_NOPROGRESS, false); 94 curl_setopt($ch, CURLOPT_NOPROGRESS, false);
95 if (is_callable($curlHeaderFunction)) {
96 curl_setopt($ch, CURLOPT_HEADERFUNCTION, $curlHeaderFunction);
97 }
98 if (is_callable($curlWriteFunction)) {
99 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
100 }
91 curl_setopt( 101 curl_setopt(
92 $ch, 102 $ch,
93 CURLOPT_PROGRESSFUNCTION, 103 CURLOPT_PROGRESSFUNCTION,
94 function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes) { 104 function ($arg0, $arg1, $arg2, $arg3, $arg4) use ($maxBytes) {
95 if (version_compare(phpversion(), '5.5', '<')) { 105 $downloaded = $arg2;
96 // PHP version lower than 5.5 106
97 // Callback has 4 arguments
98 $downloaded = $arg1;
99 } else {
100 // Callback has 5 arguments
101 $downloaded = $arg2;
102 }
103 // Non-zero return stops downloading 107 // Non-zero return stops downloading
104 return ($downloaded > $maxBytes) ? 1 : 0; 108 return ($downloaded > $maxBytes) ? 1 : 0;
105 } 109 }
@@ -369,7 +373,11 @@ function server_url($server)
369 */ 373 */
370function index_url($server) 374function index_url($server)
371{ 375{
372 $scriptname = $server['SCRIPT_NAME']; 376 if (defined('SHAARLI_ROOT_URL') && null !== SHAARLI_ROOT_URL) {
377 return rtrim(SHAARLI_ROOT_URL, '/') . '/';
378 }
379
380 $scriptname = !empty($server['SCRIPT_NAME']) ? $server['SCRIPT_NAME'] : '/';
373 if (endsWith($scriptname, 'index.php')) { 381 if (endsWith($scriptname, 'index.php')) {
374 $scriptname = substr($scriptname, 0, -9); 382 $scriptname = substr($scriptname, 0, -9);
375 } 383 }
@@ -377,7 +385,7 @@ function index_url($server)
377} 385}
378 386
379/** 387/**
380 * Returns the absolute URL of the current script, with the query 388 * Returns the absolute URL of the current script, with current route and query
381 * 389 *
382 * If the resource is "index.php", then it is removed (for better-looking URLs) 390 * If the resource is "index.php", then it is removed (for better-looking URLs)
383 * 391 *
@@ -387,10 +395,17 @@ function index_url($server)
387 */ 395 */
function page_url($server)
{
    // Directory part of the running script: a trailing "index.php" is dropped.
    $scriptname = $server['SCRIPT_NAME'] ?? '';
    if (endsWith($scriptname, 'index.php')) {
        $scriptname = substr($scriptname, 0, -9);
    }

    // The script directory is literal text, not a pattern: quote it so that characters such as
    // '.', '+' or the '@' delimiter cannot change the match or break the regular expression.
    $route = preg_replace('@^' . preg_quote($scriptname, '@') . '@', '', $server['REQUEST_URI'] ?? '');

    // NOTE(review): REQUEST_URI usually carries the query string already; confirm that appending
    // QUERY_STRING below cannot duplicate it for real requests.
    if (! empty($server['QUERY_STRING'])) {
        return index_url($server) . $route . '?' . $server['QUERY_STRING'];
    }

    return index_url($server) . $route;
}
395 410
396/** 411/**
@@ -477,3 +492,132 @@ function is_https($server)
477 492
478 return ! empty($server['HTTPS']); 493 return ! empty($server['HTTPS']);
479} 494}
495
/**
 * Get cURL callback function for CURLOPT_HEADERFUNCTION
 *
 * The returned closure inspects the response while headers are being received:
 *   - redirect responses (301, 302, 303, 307, 308) are let through so that cURL can follow them;
 *   - any other non-200 response code stops the transfer;
 *   - a content type which is not 'text/html' stops the transfer;
 *   - $charset is filled from the content type when it is still empty.
 *
 * @param string          $charset     to extract from the response headers (reference)
 * @param callable|string $curlGetInfo Optionally overrides curl_getinfo function
 *
 * @return Closure function ($ch, $data): int|bool - length of $data, or false to stop the transfer
 */
function get_curl_header_callback(
    &$charset,
    $curlGetInfo = 'curl_getinfo'
) {
    $isRedirected = false;

    return function ($ch, $data) use ($curlGetInfo, &$charset, &$isRedirected) {
        $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
        $chunkLength = strlen($data);
        // Every redirect status has to be accepted here: returning false on 303/307/308
        // would abort the transfer before the redirection can be followed.
        if (!empty($responseCode) && in_array($responseCode, [301, 302, 303, 307, 308])) {
            $isRedirected = true;
            return $chunkLength;
        }
        if (!empty($responseCode) && $responseCode !== 200) {
            return false;
        }
        // After a redirection, the content type will keep the previous request value
        // until it finds the next content-type header.
        if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
            $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
        }
        // $contentType may be unset here (redirected, header not seen yet): empty() covers that case.
        if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
            return false;
        }
        if (!empty($contentType) && empty($charset)) {
            $charset = header_extract_charset($contentType);
        }

        return $chunkLength;
    };
}
535
/**
 * Get cURL callback function for CURLOPT_WRITEFUNCTION
 *
 * @param string $charset             to extract from the downloaded page (reference)
 * @param string $title               to extract from the downloaded page (reference)
 * @param string $description         to extract from the downloaded page (reference)
 * @param string $keywords            to extract from the downloaded page (reference)
 * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
 *
 * @return Closure
 */
function get_curl_download_callback(
    &$charset,
    &$title,
    &$description,
    &$keywords,
    $retrieveDescription
) {
    // Number of chunks received so far, and number of the last chunk in which something was found.
    $currentChunk = 0;
    $foundChunk = null;

    /**
     * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
     *
     * While downloading the remote page, we extract the charset, the title and optionally
     * the description and keywords, and stop the download when it's done.
     * The response code and content type are not checked here.
     *
     * @param resource $ch   cURL resource
     * @param string   $data chunk of data being downloaded
     *
     * @return int|bool length of $data or false if we need to stop the download
     */
    return function ($ch, $data) use (
        $retrieveDescription,
        &$charset,
        &$title,
        &$description,
        &$keywords,
        &$currentChunk,
        &$foundChunk
    ) {
        $chunkLength = strlen($data);
        $currentChunk++;

        if (empty($charset)) {
            $charset = html_extract_charset($data);
        }
        if (empty($title)) {
            $title = html_extract_title($data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
        // Fall back on a 'title' meta tag when no <title> was found.
        if (empty($title)) {
            $title = html_extract_tag('title', $data);
            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($description)) {
            $description = html_extract_tag('description', $data);
            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
        }
        if ($retrieveDescription && empty($keywords)) {
            $keywords = html_extract_tag('keywords', $data);
            if (! empty($keywords)) {
                $foundChunk = $currentChunk;
                // Keywords use the format tag1, tag2 multiple words, tag
                // So we format them to match Shaarli's separator and glue multiple words with '-'
                $keywords = implode(' ', array_map(function ($keyword) {
                    return implode('-', preg_split('/\s+/', trim($keyword)));
                }, explode(',', $keywords)));
            }
        }

        // We got everything we want, stop the download.
        // If we already found either the title, description or keywords,
        // it's highly unlikely that we'll find the other metas further than
        // in the same chunk of data or the next one. So we also stop the download after that.
        //
        // Only variables that exist in this closure are tested: the previous condition also
        // required a response code and a content type that are never set here, so it could
        // never be true and the page was always downloaded entirely.
        $foundEverything = !empty($title) && !empty($description) && !empty($keywords);
        if (
            !empty($charset)
            && $foundChunk !== null
            && (! $retrieveDescription || $foundChunk < $currentChunk || $foundEverything)
        ) {
            return false;
        }

        return $chunkLength;
    };
}
diff --git a/application/http/MetadataRetriever.php b/application/http/MetadataRetriever.php
new file mode 100644
index 00000000..ba9bd40c
--- /dev/null
+++ b/application/http/MetadataRetriever.php
@@ -0,0 +1,69 @@
1<?php
2
3declare(strict_types=1);
4
5namespace Shaarli\Http;
6
7use Shaarli\Config\ConfigManager;
8
9/**
10 * HTTP Tool used to extract metadata from external URL (title, description, etc.).
11 */
class MetadataRetriever
{
    /** @var ConfigManager */
    protected $conf;

    /** @var HttpAccess */
    protected $httpAccess;

    public function __construct(ConfigManager $conf, HttpAccess $httpAccess)
    {
        $this->conf = $conf;
        $this->httpAccess = $httpAccess;
    }

    /**
     * Retrieve metadata for given URL.
     *
     * The values are collected by the cURL callbacks while the page is downloaded; any value
     * the callbacks did not fill in is returned as it was initialized (null).
     *
     * @return array [
     *                  'title' => <remote title>,
     *                  'description' => <remote description>,
     *                  'tags' => <remote keywords>,
     *               ]
     */
    public function retrieve(string $url): array
    {
        $charset = null;
        $title = null;
        $description = null;
        $tags = null;
        $retrieveDescription = $this->conf->get('general.retrieve_description');

        // Timeout and maximum size come from the configuration.
        // The callbacks fill $charset, $title, $description and $tags (by reference)
        // with data from the downloaded page.
        $this->httpAccess->getHttpResponse(
            $url,
            $this->conf->get('general.download_timeout', 30),
            $this->conf->get('general.download_max_size', 4194304),
            $this->httpAccess->getCurlHeaderCallback($charset),
            $this->httpAccess->getCurlDownloadCallback(
                $charset,
                $title,
                $description,
                $tags,
                $retrieveDescription
            )
        );

        // Convert only when a source charset is known: with strict types, passing a null charset
        // to strtolower()/mb_convert_encoding() raises a TypeError.
        if (!empty($title) && !empty($charset) && strtolower($charset) !== 'utf-8') {
            try {
                $converted = mb_convert_encoding($title, 'utf-8', $charset);
                // A failed conversion must not replace the title with a non-string value.
                if (is_string($converted)) {
                    $title = $converted;
                }
            } catch (\ValueError $e) {
                // The remote page announced a charset unknown to mbstring (PHP 8+ throws):
                // keep the title as it was downloaded.
            }
        }

        return [
            'title' => $title,
            'description' => $description,
            'tags' => $tags,
        ];
    }
}