aboutsummaryrefslogtreecommitdiffhomepage
path: root/application
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2016-01-04 10:45:54 +0100
committerArthurHoaro <arthur@hoa.ro>2016-01-11 21:19:31 +0100
commit1557cefbd76257ceb830f65806831b490faf0acc (patch)
tree787f6d8fdabe8ea2fc0c37b61d616e667cdfbda5 /application
parentc0a50f3663e207d5df007e0fa321219c1b32d6ea (diff)
downloadShaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.gz
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.tar.zst
Shaarli-1557cefbd76257ceb830f65806831b490faf0acc.zip
Fixes #410 - Retrieve title fails in multiple cases
* `get_http_url()` renamed to `get_http_response()`.
* Use the same HTTP context to retrieve response headers and content.
* Follow HTTP 301 and 302 redirections to retrieve the title (default max 3 redirections).
* Add `LinkUtils` to extract titles and charset.
* Try to retrieve charset from HTTP headers first (new), then HTML content.
* Use mbstring to re-encode title if necessary.
Diffstat (limited to 'application')
-rw-r--r--application/ApplicationUtils.php2
-rwxr-xr-x[-rw-r--r--]application/HttpUtils.php49
-rwxr-xr-xapplication/LinkUtils.php79
-rwxr-xr-x[-rw-r--r--]application/Url.php11
4 files changed, 132 insertions, 9 deletions
diff --git a/application/ApplicationUtils.php b/application/ApplicationUtils.php
index 274331e1..978fc9da 100644
--- a/application/ApplicationUtils.php
+++ b/application/ApplicationUtils.php
@@ -19,7 +19,7 @@ class ApplicationUtils
19 */ 19 */
20 public static function getLatestGitVersionCode($url, $timeout=2) 20 public static function getLatestGitVersionCode($url, $timeout=2)
21 { 21 {
22 list($headers, $data) = get_http_url($url, $timeout); 22 list($headers, $data) = get_http_response($url, $timeout);
23 23
24 if (strpos($headers[0], '200 OK') === false) { 24 if (strpos($headers[0], '200 OK') === false) {
25 error_log('Failed to retrieve ' . $url); 25 error_log('Failed to retrieve ' . $url);
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 499220c5..e2c1cb47 100644..100755
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -13,7 +13,7 @@
13 * [1] = URL content (downloaded data) 13 * [1] = URL content (downloaded data)
14 * 14 *
15 * Example: 15 * Example:
16 * list($headers, $data) = get_http_url('http://sebauvage.net/'); 16 * list($headers, $data) = get_http_response('http://sebauvage.net/');
17 * if (strpos($headers[0], '200 OK') !== false) { 17 * if (strpos($headers[0], '200 OK') !== false) {
18 * echo 'Data type: '.htmlspecialchars($headers['Content-Type']); 18 * echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
19 * } else { 19 * } else {
@@ -24,31 +24,66 @@
24 * @see http://php.net/manual/en/function.stream-context-create.php 24 * @see http://php.net/manual/en/function.stream-context-create.php
25 * @see http://php.net/manual/en/function.get-headers.php 25 * @see http://php.net/manual/en/function.get-headers.php
26 */ 26 */
27function get_http_url($url, $timeout = 30, $maxBytes = 4194304) 27function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
28{ 28{
29 $urlObj = new Url($url);
30 if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
31 return array(array(0 => 'Invalid HTTP Url'), false);
32 }
33
29 $options = array( 34 $options = array(
30 'http' => array( 35 'http' => array(
31 'method' => 'GET', 36 'method' => 'GET',
32 'timeout' => $timeout, 37 'timeout' => $timeout,
33 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' 38 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)'
34 .' Gecko/20100101 Firefox/23.0' 39 .' Gecko/20100101 Firefox/23.0',
40 'request_fulluri' => true,
35 ) 41 )
36 ); 42 );
37 43
38 $context = stream_context_create($options); 44 $context = stream_context_create($options);
45 stream_context_set_default($options);
46
47 list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup());
48 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 return array($headers, false);
50 }
39 51
40 try { 52 try {
41 // TODO: catch Exception in calling code (thumbnailer) 53 // TODO: catch Exception in calling code (thumbnailer)
42 $content = file_get_contents($url, false, $context, -1, $maxBytes); 54 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
43 } catch (Exception $exc) { 55 } catch (Exception $exc) {
44 return array(array(0 => 'HTTP Error'), $exc->getMessage()); 56 return array(array(0 => 'HTTP Error'), $exc->getMessage());
45 } 57 }
46 58
47 if (!$content) { 59 return array($headers, $content);
48 return array(array(0 => 'HTTP Error'), ''); 60}
61
62/**
63 * Retrieve HTTP headers, following n redirections (temporary and permanent).
64 *
65 * @param string $url initial URL to reach.
66 * @param int $redirectionLimit maximum number of redirections to follow.
67 *
68 * @return array
69 */
70function get_redirected_headers($url, $redirectionLimit = 3)
71{
72 $headers = get_headers($url, 1);
73
74 // Headers found, redirection found, and limit not reached.
75 if ($redirectionLimit-- > 0
76 && !empty($headers)
77 && (strpos($headers[0], '301') !== false || strpos($headers[0], '302') !== false)
78 && !empty($headers['Location'])) {
79
80 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
81 if ($redirection != $url) {
82 return get_redirected_headers($redirection, $redirectionLimit);
83 }
49 } 84 }
50 85
51 return array(get_headers($url, 1), $content); 86 return array($headers, $url);
52} 87}
53 88
54/** 89/**
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
new file mode 100755
index 00000000..26dd6b67
--- /dev/null
+++ b/application/LinkUtils.php
@@ -0,0 +1,79 @@
1<?php
2
/**
 * Extract the title from an HTML document.
 *
 * Only the first <title> element is considered. The match is non-greedy so
 * that later <title> elements (e.g. inside inline SVG) are not swallowed into
 * the result, and the opening tag may carry attributes.
 *
 * @param string $html HTML content where to look for a title.
 *
 * @return bool|string Extracted title if found (newlines replaced by spaces,
 *                     surrounding whitespace trimmed), false otherwise.
 */
function html_extract_title($html)
{
    // (?:\s[^>]*)? accepts attributes on the tag without matching e.g. <titlebar>.
    // (.*?) stops at the first closing tag instead of the last one in the page.
    if (preg_match('!<title(?:\s[^>]*)?>(.*?)</title>!is', $html, $matches)) {
        return trim(str_replace("\n", ' ', $matches[1]));
    }

    return false;
}
17
/**
 * Work out which charset a downloaded page uses.
 *
 * Sources are tried in this order, and the first one yielding a value wins:
 *   1. HTTP headers (Content-Type).
 *   2. HTML content (<meta> charset declaration).
 *   3. The supplied default charset.
 *
 * @param array  $headers        HTTP headers array.
 * @param string $htmlContent    HTML content where to look for a charset.
 * @param string $defaultCharset Charset returned when neither source provides one.
 *
 * @return string Determined charset.
 */
function get_charset($headers, $htmlContent, $defaultCharset = 'utf-8')
{
    $fromHeaders = headers_extract_charset($headers);
    if ($fromHeaders) {
        return $fromHeaders;
    }

    // The HTML is only inspected when the headers gave nothing usable.
    $fromHtml = html_extract_charset($htmlContent);

    return $fromHtml ? $fromHtml : $defaultCharset;
}
43
/**
 * Extract the charset from HTTP headers if it is defined.
 *
 * The Content-Type header is looked up case-insensitively. When the header
 * value is an array (the same header received several times, e.g. once per
 * hop of a redirection chain), the last value is used. Optional quotes
 * around the charset value are stripped.
 *
 * @param array $headers HTTP headers array.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function headers_extract_charset($headers)
{
    if (! is_array($headers)) {
        return false;
    }

    $contentType = null;
    foreach ($headers as $name => $value) {
        // Header names are case-insensitive; servers may send "Content-type".
        if (strcasecmp((string) $name, 'Content-Type') === 0) {
            // Repeated headers come as a list: the last one describes the final response.
            $contentType = is_array($value) ? end($value) : $value;
            break;
        }
    }

    if (! is_string($contentType) || $contentType === '') {
        return false;
    }

    // Optional quote is consumed outside the capture group; the value stops at
    // a quote, separator or whitespace so no stray characters are returned.
    if (preg_match('/charset\s*=\s*["\']?([^"\';,\s]+)/i', $contentType, $match)) {
        return strtolower($match[1]);
    }

    return false;
}
62
/**
 * Extract the charset declared in HTML content (<meta> tag).
 *
 * Handles both `<meta charset="...">` and the
 * `<meta http-equiv="Content-Type" content="...; charset=...">` form, with
 * double-quoted, single-quoted or unquoted values. The search is confined to
 * a single <meta> tag, and attributes following the charset are tolerated.
 *
 * @param string $html HTML content where to look for charset.
 *
 * @return bool|string Charset string if found (lowercase), false otherwise.
 */
function html_extract_charset($html)
{
    // [^>]*? keeps the match inside one <meta ...> tag; the capture stops at
    // quotes, whitespace, ';', '/' or '>' so delimiters never end up in the value.
    $pattern = '#<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([^"\'\s/>;]+)#i';

    if (preg_match($pattern, $html, $enc) && ! empty($enc[1])) {
        return strtolower($enc[1]);
    }

    return false;
}
diff --git a/application/Url.php b/application/Url.php
index d80c9c58..a4ac2e73 100644..100755
--- a/application/Url.php
+++ b/application/Url.php
@@ -118,7 +118,7 @@ class Url
118 */ 118 */
119 public function __construct($url) 119 public function __construct($url)
120 { 120 {
121 $this->parts = parse_url($url); 121 $this->parts = parse_url(trim($url));
122 122
123 if (!empty($url) && empty($this->parts['scheme'])) { 123 if (!empty($url) && empty($this->parts['scheme'])) {
124 $this->parts['scheme'] = 'http'; 124 $this->parts['scheme'] = 'http';
@@ -201,4 +201,13 @@ class Url
201 } 201 }
202 return $this->parts['scheme']; 202 return $this->parts['scheme'];
203 } 203 }
204
205 /**
206 * Test if the Url is an HTTP one.
207 *
208 * @return bool true if the URL is an HTTP one, false otherwise.
209 */
210 public function isHttp() {
211 return strpos(strtolower($this->parts['scheme']), 'http') !== false;
212 }
204} 213}