aboutsummaryrefslogtreecommitdiffhomepage
path: root/application
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2018-01-23 18:41:38 +0100
committerGitHub <noreply@github.com>2018-01-23 18:41:38 +0100
commitd449f79a0d7ca808b891baf73b9e25ce7f7e48fe (patch)
tree66e40b38bfce1475b745d6a1227f478f8e99ab75 /application
parent5f8c3f532ed16ad5b789f75e9ff745e5329271c3 (diff)
parentd65342e304f92643ba922200953cfebc51e1e482 (diff)
downloadShaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.gz
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.tar.zst
Shaarli-d449f79a0d7ca808b891baf73b9e25ce7f7e48fe.zip
Merge pull request #977 from ArthurHoaro/feature/dl-filter
Extract the title/charset during page download, and check content type
Diffstat (limited to 'application')
-rw-r--r--application/HttpUtils.php14
-rw-r--r--application/LinkUtils.php89
2 files changed, 65 insertions, 38 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index c9371b55..83a4c5e2 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -3,9 +3,11 @@
3 * GET an HTTP URL to retrieve its content 3 * GET an HTTP URL to retrieve its content
4 * Uses the cURL library or a fallback method 4 * Uses the cURL library or a fallback method
5 * 5 *
6 * @param string $url URL to get (http://...) 6 * @param string $url URL to get (http://...)
7 * @param int $timeout network timeout (in seconds) 7 * @param int $timeout network timeout (in seconds)
8 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB) 8 * @param int $maxBytes maximum downloaded bytes (default: 4 MiB)
9 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
10 * Can be used to add download conditions on the headers (response code, content type, etc.).
9 * 11 *
10 * @return array HTTP response headers, downloaded content 12 * @return array HTTP response headers, downloaded content
11 * 13 *
@@ -29,7 +31,7 @@
29 * @see http://stackoverflow.com/q/9183178 31 * @see http://stackoverflow.com/q/9183178
30 * @see http://stackoverflow.com/q/1462720 32 * @see http://stackoverflow.com/q/1462720
31 */ 33 */
32function get_http_response($url, $timeout = 30, $maxBytes = 4194304) 34function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
33{ 35{
34 $urlObj = new Url($url); 36 $urlObj = new Url($url);
35 $cleanUrl = $urlObj->idnToAscii(); 37 $cleanUrl = $urlObj->idnToAscii();
@@ -75,6 +77,10 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
75 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout); 77 curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
76 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); 78 curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
77 79
80 if (is_callable($curlWriteFunction)) {
81 curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
82 }
83
78 // Max download size management 84 // Max download size management
79 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16); 85 curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
80 curl_setopt($ch, CURLOPT_NOPROGRESS, false); 86 curl_setopt($ch, CURLOPT_NOPROGRESS, false);
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index e3d95d08..3705f7e9 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -1,60 +1,81 @@
1<?php 1<?php
2 2
3/** 3/**
4 * Extract title from an HTML document. 4 * Get cURL callback function for CURLOPT_WRITEFUNCTION
5 * 5 *
6 * @param string $html HTML content where to look for a title. 6 * @param string $charset to extract from the downloaded page (reference)
7 * @param string $title to extract from the downloaded page (reference)
8 * @param string $curlGetInfo Optionally overrides curl_getinfo function
7 * 9 *
8 * @return bool|string Extracted title if found, false otherwise. 10 * @return Closure
9 */ 11 */
10function html_extract_title($html) 12function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
11{ 13{
12 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { 14 /**
13 return trim(str_replace("\n", '', $matches[1])); 15 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
14 } 16 *
15 return false; 17 * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'
18 * Then we extract the title and the charset and stop the download when it's done.
19 *
20 * @param resource $ch cURL resource
21 * @param string $data chunk of data being downloaded
22 *
23 * @return int|bool length of $data or false if we need to stop the download
24 */
25 return function(&$ch, $data) use ($curlGetInfo, &$charset, &$title) {
26 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
27 if (!empty($responseCode) && $responseCode != 200) {
28 return false;
29 }
30 $contentType = $curlGetInfo($ch, CURLINFO_CONTENT_TYPE);
31 if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
32 return false;
33 }
34 if (empty($charset)) {
35 $charset = header_extract_charset($contentType);
36 }
37 if (empty($charset)) {
38 $charset = html_extract_charset($data);
39 }
40 if (empty($title)) {
41 $title = html_extract_title($data);
42 }
43 // We got everything we want, stop the download.
44 if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
45 return false;
46 }
47
48 return strlen($data);
49 };
16} 50}
17 51
18/** 52/**
19 * Determine charset from downloaded page. 53 * Extract title from an HTML document.
20 * Priority:
21 * 1. HTTP headers (Content type).
22 * 2. HTML content page (tag <meta charset>).
23 * 3. Use a default charset (default: UTF-8).
24 * 54 *
25 * @param array $headers HTTP headers array. 55 * @param string $html HTML content where to look for a title.
26 * @param string $htmlContent HTML content where to look for charset.
27 * @param string $defaultCharset Default charset to apply if other methods failed.
28 * 56 *
29 * @return string Determined charset. 57 * @return bool|string Extracted title if found, false otherwise.
30 */ 58 */
31function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8') 59function html_extract_title($html)
32{ 60{
33 if ($charset = headers_extract_charset($headers)) { 61 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
34 return $charset; 62 return trim(str_replace("\n", '', $matches[1]));
35 }
36
37 if ($charset = html_extract_charset($htmlContent)) {
38 return $charset;
39 } 63 }
40 64 return false;
41 return $defaultCharset;
42} 65}
43 66
44/** 67/**
45 * Extract charset from HTTP headers if it's defined. 68 * Extract charset from HTTP header if it's defined.
46 * 69 *
47 * @param array $headers HTTP headers array. 70 * @param string $header HTTP header Content-Type line.
48 * 71 *
49 * @return bool|string Charset string if found (lowercase), false otherwise. 72 * @return bool|string Charset string if found (lowercase), false otherwise.
50 */ 73 */
51function headers_extract_charset($headers) 74function header_extract_charset($header)
52{ 75{
53 if (! empty($headers['Content-Type']) && strpos($headers['Content-Type'], 'charset=') !== false) { 76 preg_match('/charset="?([^; ]+)/i', $header, $match);
54 preg_match('/charset="?([^; ]+)/i', $headers['Content-Type'], $match); 77 if (! empty($match[1])) {
55 if (! empty($match[1])) { 78 return strtolower(trim($match[1]));
56 return strtolower(trim($match[1]));
57 }
58 } 79 }
59 80
60 return false; 81 return false;