aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/http
diff options
context:
space:
mode:
Diffstat (limited to 'application/http')
-rw-r--r--application/http/HttpAccess.php39
-rw-r--r--application/http/HttpUtils.php106
2 files changed, 145 insertions, 0 deletions
diff --git a/application/http/HttpAccess.php b/application/http/HttpAccess.php
new file mode 100644
index 00000000..81d9e076
--- /dev/null
+++ b/application/http/HttpAccess.php
@@ -0,0 +1,39 @@
1<?php
2
3declare(strict_types=1);
4
5namespace Shaarli\Http;
6
7/**
8 * Class HttpAccess
9 *
10 * This is mostly an OOP wrapper for HTTP functions defined in `HttpUtils`.
11 * It is used as dependency injection in Shaarli's container.
12 *
13 * @package Shaarli\Http
14 */
15class HttpAccess
16{
17 public function getHttpResponse($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
18 {
19 return get_http_response($url, $timeout, $maxBytes, $curlWriteFunction);
20 }
21
22 public function getCurlDownloadCallback(
23 &$charset,
24 &$title,
25 &$description,
26 &$keywords,
27 $retrieveDescription,
28 $curlGetInfo = 'curl_getinfo'
29 ) {
30 return get_curl_download_callback(
31 $charset,
32 $title,
33 $description,
34 $keywords,
35 $retrieveDescription,
36 $curlGetInfo
37 );
38 }
39}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php
index f00c4336..4fc4e3dc 100644
--- a/application/http/HttpUtils.php
+++ b/application/http/HttpUtils.php
@@ -484,3 +484,109 @@ function is_https($server)
484 484
485 return ! empty($server['HTTPS']); 485 return ! empty($server['HTTPS']);
486} 486}
487
488/**
489 * Get cURL callback function for CURLOPT_WRITEFUNCTION
490 *
491 * @param string $charset to extract from the downloaded page (reference)
492 * @param string $title to extract from the downloaded page (reference)
493 * @param string $description to extract from the downloaded page (reference)
494 * @param string $keywords to extract from the downloaded page (reference)
495 * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
496 * @param string $curlGetInfo Optionally overrides curl_getinfo function
497 *
498 * @return Closure
499 */
500function get_curl_download_callback(
501 &$charset,
502 &$title,
503 &$description,
504 &$keywords,
505 $retrieveDescription,
506 $curlGetInfo = 'curl_getinfo'
507) {
508 $isRedirected = false;
509 $currentChunk = 0;
510 $foundChunk = null;
511
512 /**
513 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
514 *
515 * While downloading the remote page, we check that the HTTP code is 200 and content type is 'html/text'
516 * Then we extract the title and the charset and stop the download when it's done.
517 *
518 * @param resource $ch cURL resource
519 * @param string $data chunk of data being downloaded
520 *
521 * @return int|bool length of $data or false if we need to stop the download
522 */
523 return function (&$ch, $data) use (
524 $retrieveDescription,
525 $curlGetInfo,
526 &$charset,
527 &$title,
528 &$description,
529 &$keywords,
530 &$isRedirected,
531 &$currentChunk,
532 &$foundChunk
533 ) {
534 $currentChunk++;
535 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
536 if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
537 $isRedirected = true;
538 return strlen($data);
539 }
540 if (!empty($responseCode) && $responseCode !== 200) {
541 return false;
542 }
543 // After a redirection, the content type will keep the previous request value
544 // until it finds the next content-type header.
545 if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
546 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
547 }
548 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
549 return false;
550 }
551 if (!empty($contentType) && empty($charset)) {
552 $charset = header_extract_charset($contentType);
553 }
554 if (empty($charset)) {
555 $charset = html_extract_charset($data);
556 }
557 if (empty($title)) {
558 $title = html_extract_title($data);
559 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
560 }
561 if ($retrieveDescription && empty($description)) {
562 $description = html_extract_tag('description', $data);
563 $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
564 }
565 if ($retrieveDescription && empty($keywords)) {
566 $keywords = html_extract_tag('keywords', $data);
567 if (! empty($keywords)) {
568 $foundChunk = $currentChunk;
569 // Keywords use the format tag1, tag2 multiple words, tag
570 // So we format them to match Shaarli's separator and glue multiple words with '-'
571 $keywords = implode(' ', array_map(function($keyword) {
572 return implode('-', preg_split('/\s+/', trim($keyword)));
573 }, explode(',', $keywords)));
574 }
575 }
576
577 // We got everything we want, stop the download.
578 // If we already found either the title, description or keywords,
579 // it's highly unlikely that we'll found the other metas further than
580 // in the same chunk of data or the next one. So we also stop the download after that.
581 if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
582 && (! $retrieveDescription
583 || $foundChunk < $currentChunk
584 || (!empty($title) && !empty($description) && !empty($keywords))
585 )
586 ) {
587 return false;
588 }
589
590 return strlen($data);
591 };
592}