diff options
author | VirtualTam <virtualtam@flibidi.net> | 2015-09-01 21:45:06 +0200 |
---|---|---|
committer | VirtualTam <virtualtam@flibidi.net> | 2015-09-06 19:30:26 +0200 |
commit | 451314eb48c7d922264adc6eada8a0273b12344c (patch) | |
tree | fddc1a6d7a92728d0dfdcbb676f113a09325b1c2 | |
parent | f5d6b19b73cd026cb1d937aab16d48b43e412c77 (diff) | |
download | Shaarli-451314eb48c7d922264adc6eada8a0273b12344c.tar.gz Shaarli-451314eb48c7d922264adc6eada8a0273b12344c.tar.zst Shaarli-451314eb48c7d922264adc6eada8a0273b12344c.zip |
HTTP: move utils to a proper file, add tests
Relates to #333
Modifications:
- move HTTP utils to 'application/HttpUtils.php'
- simplify logic
- replace 'http_parse_headers_shaarli' by built-in 'get_headers()'
- remove superfluous '$status' parameter (provided by the HTTP headers)
- apply coding conventions
- add test coverage (unit tests only)
Signed-off-by: VirtualTam <virtualtam@flibidi.net>
-rw-r--r-- | application/HttpUtils.php | 52 | ||||
-rwxr-xr-x | index.php | 110 | ||||
-rw-r--r-- | tests/HttpUtilsTest.php | 38 |
3 files changed, 122 insertions, 78 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php new file mode 100644 index 00000000..175333ae --- /dev/null +++ b/application/HttpUtils.php | |||
@@ -0,0 +1,52 @@ | |||
1 | <?php | ||
/**
 * GET an HTTP URL to retrieve its content
 *
 * The response headers are taken from the same request that downloaded the
 * content (no second round-trip), and only the headers of the final response
 * are reported: when redirections are followed, $headers[0] is the status
 * line of the last response, not of the first 3xx one.
 *
 * @param string $url      URL to get (http://...)
 * @param int    $timeout  network timeout (in seconds)
 * @param int    $maxBytes maximum downloaded bytes (default: 4 MiB)
 *
 * @return array HTTP response headers, downloaded content
 *
 * Output format:
 *  [0] = associative array containing HTTP response headers
 *        ([0] holds the status line, or 'HTTP Error' on failure;
 *        a header sent several times is stored as an array of values)
 *  [1] = URL content (downloaded data); on failure, an empty string or the
 *        message of the exception raised while downloading
 *
 * Example:
 *  list($headers, $data) = get_http_url('http://sebauvage.net/');
 *  if (strpos($headers[0], '200 OK') !== false) {
 *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
 *  } else {
 *      echo 'There was an error: '.htmlspecialchars($headers[0]);
 *  }
 *
 * @see http://php.net/manual/en/function.file-get-contents.php
 * @see http://php.net/manual/en/function.stream-context-create.php
 * @see http://php.net/manual/en/reserved.variables.httpresponseheader.php
 */
function get_http_url($url, $timeout = 30, $maxBytes = 4194304)
{
    $options = array(
        'http' => array(
            'method' => 'GET',
            'timeout' => $timeout,
            'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)'
                .' Gecko/20100101 Firefox/23.0'
        )
    );

    $context = stream_context_create($options);

    try {
        // file_get_contents() only raises a warning by itself; the catch
        // block is reached when an error handler (e.g. a test runner)
        // turns that warning into an exception.
        // Offset 0 reads from the start of the stream: a negative offset
        // means "seek from the end" since PHP 7.1, which remote streams
        // do not support.
        $content = file_get_contents($url, false, $context, 0, $maxBytes);
    } catch (Exception $exc) {
        return array(array(0 => 'HTTP Error'), $exc->getMessage());
    }

    // Strict comparison: an empty or '0' body is a valid download.
    // $http_response_header is only populated by the HTTP(S) wrapper, so a
    // missing value means that no HTTP response was received at all.
    if ($content === false || empty($http_response_header)) {
        return array(array(0 => 'HTTP Error'), '');
    }

    return array(final_http_response_headers($http_response_header), $content);
}

/**
 * Build the associative header array of the last response in a header list
 *
 * @param array $rawHeaders raw header lines, as found in $http_response_header
 *                          (may span several responses when redirected)
 *
 * @return array [0] = status line of the last response ('HTTP Error' if the
 *               list contains none), then 'Header-Name' => value entries;
 *               repeated headers are gathered into an array of values
 */
function final_http_response_headers(array $rawHeaders)
{
    $headers = array();

    foreach ($rawHeaders as $line) {
        if (strncasecmp($line, 'HTTP/', 5) === 0) {
            // A status line starts a new response: drop what was collected
            // for the previous (redirecting) one.
            $headers = array(0 => $line);
            continue;
        }

        $separator = strpos($line, ':');
        if ($separator === false) {
            continue;
        }

        $name = trim(substr($line, 0, $separator));
        $value = trim(substr($line, $separator + 1));

        if (!isset($headers[$name])) {
            $headers[$name] = $value;
        } elseif (is_array($headers[$name])) {
            $headers[$name][] = $value;
        } else {
            $headers[$name] = array($headers[$name], $value);
        }
    }

    if (!isset($headers[0])) {
        // Callers always read $headers[0]; make sure it is defined
        $headers = array(0 => 'HTTP Error') + $headers;
    }

    return $headers;
}
@@ -59,6 +59,7 @@ if (is_file($GLOBALS['config']['CONFIG_FILE'])) { | |||
59 | // Shaarli library | 59 | // Shaarli library |
60 | require_once 'application/Cache.php'; | 60 | require_once 'application/Cache.php'; |
61 | require_once 'application/CachedPage.php'; | 61 | require_once 'application/CachedPage.php'; |
62 | require_once 'application/HttpUtils.php'; | ||
62 | require_once 'application/LinkDB.php'; | 63 | require_once 'application/LinkDB.php'; |
63 | require_once 'application/TimeZone.php'; | 64 | require_once 'application/TimeZone.php'; |
64 | require_once 'application/Url.php'; | 65 | require_once 'application/Url.php'; |
@@ -209,9 +210,11 @@ function checkUpdate() | |||
209 | // Get latest version number at most once a day. | 210 | // Get latest version number at most once a day. |
210 | if (!is_file($GLOBALS['config']['UPDATECHECK_FILENAME']) || (filemtime($GLOBALS['config']['UPDATECHECK_FILENAME'])<time()-($GLOBALS['config']['UPDATECHECK_INTERVAL']))) | 211 | if (!is_file($GLOBALS['config']['UPDATECHECK_FILENAME']) || (filemtime($GLOBALS['config']['UPDATECHECK_FILENAME'])<time()-($GLOBALS['config']['UPDATECHECK_INTERVAL']))) |
211 | { | 212 | { |
212 | $version=shaarli_version; | 213 | $version = shaarli_version; |
213 | list($httpstatus,$headers,$data) = getHTTP('https://raw.githubusercontent.com/shaarli/Shaarli/master/shaarli_version.php',2); | 214 | list($headers, $data) = get_http_url('https://raw.githubusercontent.com/shaarli/Shaarli/master/shaarli_version.php', 2); |
214 | if (strpos($httpstatus,'200 OK')!==false) $version=str_replace(' */ ?>','',str_replace('<?php /* ','',$data)); | 215 | if (strpos($headers[0], '200 OK') !== false) { |
216 | $version = str_replace(' */ ?>', '', str_replace('<?php /* ', '', $data)); | ||
217 | } | ||
215 | // If failed, never mind. We don't want to bother the user with that. | 218 | // If failed, never mind. We don't want to bother the user with that. |
216 | file_put_contents($GLOBALS['config']['UPDATECHECK_FILENAME'],$version); // touch file date | 219 | file_put_contents($GLOBALS['config']['UPDATECHECK_FILENAME'],$version); // touch file date |
217 | } | 220 | } |
@@ -535,53 +538,6 @@ function linkdate2iso8601($linkdate) | |||
535 | return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format. | 538 | return date('c',linkdate2timestamp($linkdate)); // 'c' is for ISO 8601 date format. |
536 | } | 539 | } |
537 | 540 | ||
538 | // Parse HTTP response headers and return an associative array. | ||
539 | function http_parse_headers_shaarli( $headers ) | ||
540 | { | ||
541 | $res=array(); | ||
542 | foreach($headers as $header) | ||
543 | { | ||
544 | $i = strpos($header,': '); | ||
545 | if ($i!==false) | ||
546 | { | ||
547 | $key=substr($header,0,$i); | ||
548 | $value=substr($header,$i+2,strlen($header)-$i-2); | ||
549 | $res[$key]=$value; | ||
550 | } | ||
551 | } | ||
552 | return $res; | ||
553 | } | ||
554 | |||
555 | /* GET an URL. | ||
556 | Input: $url : URL to get (http://...) | ||
557 | $timeout : Network timeout (will wait this many seconds for an anwser before giving up). | ||
558 | Output: An array. [0] = HTTP status message (e.g. "HTTP/1.1 200 OK") or error message | ||
559 | [1] = associative array containing HTTP response headers (e.g. echo getHTTP($url)[1]['Content-Type']) | ||
560 | [2] = data | ||
561 | Example: list($httpstatus,$headers,$data) = getHTTP('http://sebauvage.net/'); | ||
562 | if (strpos($httpstatus,'200 OK')!==false) | ||
563 | echo 'Data type: '.htmlspecialchars($headers['Content-Type']); | ||
564 | else | ||
565 | echo 'There was an error: '.htmlspecialchars($httpstatus) | ||
566 | */ | ||
567 | function getHTTP($url,$timeout=30) | ||
568 | { | ||
569 | try | ||
570 | { | ||
571 | $options = array('http'=>array('method'=>'GET','timeout' => $timeout, 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0) Gecko/20100101 Firefox/23.0')); // Force network timeout | ||
572 | $context = stream_context_create($options); | ||
573 | $data=file_get_contents($url,false,$context,-1, 4000000); // We download at most 4 Mb from source. | ||
574 | if (!$data) { return array('HTTP Error',array(),''); } | ||
575 | $httpStatus=$http_response_header[0]; // e.g. "HTTP/1.1 200 OK" | ||
576 | $responseHeaders=http_parse_headers_shaarli($http_response_header); | ||
577 | return array($httpStatus,$responseHeaders,$data); | ||
578 | } | ||
579 | catch (Exception $e) // getHTTP *can* fail silently (we don't care if the title cannot be fetched) | ||
580 | { | ||
581 | return array($e->getMessage(),'',''); | ||
582 | } | ||
583 | } | ||
584 | |||
585 | // Extract title from an HTML document. | 541 | // Extract title from an HTML document. |
586 | // (Returns an empty string if not found.) | 542 | // (Returns an empty string if not found.) |
587 | function html_extract_title($html) | 543 | function html_extract_title($html) |
@@ -1516,9 +1472,10 @@ function renderPage() | |||
1516 | $private = (!empty($_GET['private']) && $_GET['private'] === "1" ? 1 : 0); | 1472 | $private = (!empty($_GET['private']) && $_GET['private'] === "1" ? 1 : 0); |
1517 | // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) | 1473 | // If this is an HTTP(S) link, we try go get the page to extract the title (otherwise we will to straight to the edit form.) |
1518 | if (empty($title) && strpos($url->getScheme(), 'http') !== false) { | 1474 | if (empty($title) && strpos($url->getScheme(), 'http') !== false) { |
1519 | list($status,$headers,$data) = getHTTP($url,4); // Short timeout to keep the application responsive. | 1475 | // Short timeout to keep the application responsive |
1476 | list($headers, $data) = get_http_url($url, 4); | ||
1520 | // FIXME: Decode charset according to specified in either 1) HTTP response headers or 2) <head> in html | 1477 | // FIXME: Decode charset according to specified in either 1) HTTP response headers or 2) <head> in html |
1521 | if (strpos($status,'200 OK')!==false) { | 1478 | if (strpos($headers[0], '200 OK') !== false) { |
1522 | // Look for charset in html header. | 1479 | // Look for charset in html header. |
1523 | preg_match('#<meta .*charset=.*>#Usi', $data, $meta); | 1480 | preg_match('#<meta .*charset=.*>#Usi', $data, $meta); |
1524 | 1481 | ||
@@ -2186,8 +2143,9 @@ function genThumbnail() | |||
2186 | } | 2143 | } |
2187 | else // This is a flickr page (html) | 2144 | else // This is a flickr page (html) |
2188 | { | 2145 | { |
2189 | list($httpstatus,$headers,$data) = getHTTP($url,20); // Get the flickr html page. | 2146 | // Get the flickr html page. |
2190 | if (strpos($httpstatus,'200 OK')!==false) | 2147 | list($headers, $data) = get_http_url($url, 20); |
2148 | if (strpos($headers[0], '200 OK') !== false) | ||
2191 | { | 2149 | { |
2192 | // flickr now nicely provides the URL of the thumbnail in each flickr page. | 2150 | // flickr now nicely provides the URL of the thumbnail in each flickr page. |
2193 | preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!',$data,$matches); | 2151 | preg_match('!<link rel=\"image_src\" href=\"(.+?)\"!',$data,$matches); |
@@ -2206,9 +2164,9 @@ function genThumbnail() | |||
2206 | 2164 | ||
2207 | if ($imageurl!='') | 2165 | if ($imageurl!='') |
2208 | { // Let's download the image. | 2166 | { // Let's download the image. |
2209 | list($httpstatus,$headers,$data) = getHTTP($imageurl,10); // Image is 240x120, so 10 seconds to download should be enough. | 2167 | // Image is 240x120, so 10 seconds to download should be enough. |
2210 | if (strpos($httpstatus,'200 OK')!==false) | 2168 | list($headers, $data) = get_http_url($imageurl, 10); |
2211 | { | 2169 | if (strpos($headers[0], '200 OK') !== false) { |
2212 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. | 2170 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. |
2213 | header('Content-Type: image/jpeg'); | 2171 | header('Content-Type: image/jpeg'); |
2214 | echo $data; | 2172 | echo $data; |
@@ -2222,15 +2180,13 @@ function genThumbnail() | |||
2222 | // This is more complex: we have to perform a HTTP request, then parse the result. | 2180 | // This is more complex: we have to perform a HTTP request, then parse the result. |
2223 | // Maybe we should deport this to JavaScript ? Example: http://stackoverflow.com/questions/1361149/get-img-thumbnails-from-vimeo/4285098#4285098 | 2181 | // Maybe we should deport this to JavaScript ? Example: http://stackoverflow.com/questions/1361149/get-img-thumbnails-from-vimeo/4285098#4285098 |
2224 | $vid = substr(parse_url($url,PHP_URL_PATH),1); | 2182 | $vid = substr(parse_url($url,PHP_URL_PATH),1); |
2225 | list($httpstatus,$headers,$data) = getHTTP('https://vimeo.com/api/v2/video/'.escape($vid).'.php',5); | 2183 | list($headers, $data) = get_http_url('https://vimeo.com/api/v2/video/'.escape($vid).'.php', 5); |
2226 | if (strpos($httpstatus,'200 OK')!==false) | 2184 | if (strpos($headers[0], '200 OK') !== false) { |
2227 | { | ||
2228 | $t = unserialize($data); | 2185 | $t = unserialize($data); |
2229 | $imageurl = $t[0]['thumbnail_medium']; | 2186 | $imageurl = $t[0]['thumbnail_medium']; |
2230 | // Then we download the image and serve it to our client. | 2187 | // Then we download the image and serve it to our client. |
2231 | list($httpstatus,$headers,$data) = getHTTP($imageurl,10); | 2188 | list($headers, $data) = get_http_url($imageurl, 10); |
2232 | if (strpos($httpstatus,'200 OK')!==false) | 2189 | if (strpos($headers[0], '200 OK') !== false) { |
2233 | { | ||
2234 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. | 2190 | file_put_contents($GLOBALS['config']['CACHEDIR'].'/'.$thumbname,$data); // Save image to cache. |
2235 | header('Content-Type: image/jpeg'); | 2191 | header('Content-Type: image/jpeg'); |
2236 | echo $data; | 2192 | echo $data; |
@@ -2244,17 +2200,16 @@ function genThumbnail() | |||
2244 | // The thumbnail for TED talks is located in the <link rel="image_src" [...]> tag on that page | 2200 | // The thumbnail for TED talks is located in the <link rel="image_src" [...]> tag on that page |
2245 | // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html | 2201 | // http://www.ted.com/talks/mikko_hypponen_fighting_viruses_defending_the_net.html |
2246 | // <link rel="image_src" href="http://images.ted.com/images/ted/28bced335898ba54d4441809c5b1112ffaf36781_389x292.jpg" /> | 2202 | // <link rel="image_src" href="http://images.ted.com/images/ted/28bced335898ba54d4441809c5b1112ffaf36781_389x292.jpg" /> |
2247 | list($httpstatus,$headers,$data) = getHTTP($url,5); | 2203 | list($headers, $data) = get_http_url($url, 5); |
2248 | if (strpos($httpstatus,'200 OK')!==false) | 2204 | if (strpos($headers[0], '200 OK') !== false) { |
2249 | { | ||
2250 | // Extract the link to the thumbnail | 2205 | // Extract the link to the thumbnail |
2251 | preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!',$data,$matches); | 2206 | preg_match('!link rel="image_src" href="(http://images.ted.com/images/ted/.+_\d+x\d+\.jpg)"!',$data,$matches); |
2252 | if (!empty($matches[1])) | 2207 | if (!empty($matches[1])) |
2253 | { // Let's download the image. | 2208 | { // Let's download the image. |
2254 | $imageurl=$matches[1]; | 2209 | $imageurl=$matches[1]; |
2255 | list($httpstatus,$headers,$data) = getHTTP($imageurl,20); // No control on image size, so wait long enough. | 2210 | // No control on image size, so wait long enough |
2256 | if (strpos($httpstatus,'200 OK')!==false) | 2211 | list($headers, $data) = get_http_url($imageurl, 20); |
2257 | { | 2212 | if (strpos($headers[0], '200 OK') !== false) { |
2258 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2213 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2259 | file_put_contents($filepath,$data); // Save image to cache. | 2214 | file_put_contents($filepath,$data); // Save image to cache. |
2260 | if (resizeImage($filepath)) | 2215 | if (resizeImage($filepath)) |
@@ -2273,17 +2228,16 @@ function genThumbnail() | |||
2273 | // There is no thumbnail available for xkcd comics, so download the whole image and resize it. | 2228 | // There is no thumbnail available for xkcd comics, so download the whole image and resize it. |
2274 | // http://xkcd.com/327/ | 2229 | // http://xkcd.com/327/ |
2275 | // <img src="http://imgs.xkcd.com/comics/exploits_of_a_mom.png" title="<BLABLA>" alt="<BLABLA>" /> | 2230 | // <img src="http://imgs.xkcd.com/comics/exploits_of_a_mom.png" title="<BLABLA>" alt="<BLABLA>" /> |
2276 | list($httpstatus,$headers,$data) = getHTTP($url,5); | 2231 | list($headers, $data) = get_http_url($url, 5); |
2277 | if (strpos($httpstatus,'200 OK')!==false) | 2232 | if (strpos($headers[0], '200 OK') !== false) { |
2278 | { | ||
2279 | // Extract the link to the thumbnail | 2233 | // Extract the link to the thumbnail |
2280 | preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!',$data,$matches); | 2234 | preg_match('!<img src="(http://imgs.xkcd.com/comics/.*)" title="[^s]!',$data,$matches); |
2281 | if (!empty($matches[1])) | 2235 | if (!empty($matches[1])) |
2282 | { // Let's download the image. | 2236 | { // Let's download the image. |
2283 | $imageurl=$matches[1]; | 2237 | $imageurl=$matches[1]; |
2284 | list($httpstatus,$headers,$data) = getHTTP($imageurl,20); // No control on image size, so wait long enough. | 2238 | // No control on image size, so wait long enough |
2285 | if (strpos($httpstatus,'200 OK')!==false) | 2239 | list($headers, $data) = get_http_url($imageurl, 20); |
2286 | { | 2240 | if (strpos($headers[0], '200 OK') !== false) { |
2287 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2241 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2288 | file_put_contents($filepath,$data); // Save image to cache. | 2242 | file_put_contents($filepath,$data); // Save image to cache. |
2289 | if (resizeImage($filepath)) | 2243 | if (resizeImage($filepath)) |
@@ -2300,9 +2254,9 @@ function genThumbnail() | |||
2300 | else | 2254 | else |
2301 | { | 2255 | { |
2302 | // For all other domains, we try to download the image and make a thumbnail. | 2256 | // For all other domains, we try to download the image and make a thumbnail. |
2303 | list($httpstatus,$headers,$data) = getHTTP($url,30); // We allow 30 seconds max to download (and downloads are limited to 4 Mb) | 2257 | // We allow 30 seconds max to download (and downloads are limited to 4 Mb) |
2304 | if (strpos($httpstatus,'200 OK')!==false) | 2258 | list($headers, $data) = get_http_url($url, 30); |
2305 | { | 2259 | if (strpos($headers[0], '200 OK') !== false) { |
2306 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; | 2260 | $filepath=$GLOBALS['config']['CACHEDIR'].'/'.$thumbname; |
2307 | file_put_contents($filepath,$data); // Save image to cache. | 2261 | file_put_contents($filepath,$data); // Save image to cache. |
2308 | if (resizeImage($filepath)) | 2262 | if (resizeImage($filepath)) |
diff --git a/tests/HttpUtilsTest.php b/tests/HttpUtilsTest.php new file mode 100644 index 00000000..76092b80 --- /dev/null +++ b/tests/HttpUtilsTest.php | |||
@@ -0,0 +1,38 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * HttpUtils' tests | ||
4 | */ | ||
5 | |||
6 | require_once 'application/HttpUtils.php'; | ||
7 | |||
/**
 * Unit tests for get_http_url()
 *
 * Both tests inspect the error message returned as content. get_http_url()
 * only returns such a message when the download raises an exception, so
 * these assertions presumably rely on the test runner converting PHP
 * warnings to exceptions -- confirm against the PHPUnit configuration.
 */
class GetHttpUrlTest extends PHPUnit_Framework_TestCase
{
    /**
     * Get an invalid local URL
     */
    public function testGetInvalidLocalUrl()
    {
        list($headers, $content) = get_http_url('/non/existent', 1);
        $this->assertSame('HTTP Error', $headers[0]);
        // Case-insensitive: the capitalisation of this message differs
        // between PHP versions
        $this->assertRegExp(
            '/failed to open stream: No such file or directory/i',
            $content
        );
    }

    /**
     * Get an invalid remote URL
     */
    public function testGetInvalidRemoteUrl()
    {
        // '.invalid' is a reserved TLD that is guaranteed never to resolve
        list($headers, $content) = get_http_url('http://non.existent.invalid', 1);
        $this->assertSame('HTTP Error', $headers[0]);
        // Match the part of the message emitted by PHP itself; the trailing
        // resolver text ("Name or service not known", ...) depends on the
        // operating system
        $this->assertRegExp(
            '/php_network_getaddresses: getaddrinfo/',
            $content
        );
    }
}