diff options
author | ArthurHoaro <arthur@hoa.ro> | 2016-04-06 22:00:52 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2016-05-03 19:51:29 +0200 |
commit | ce7b0b6480aa854ee6893f5c889277b0e3b13efc (patch) | |
tree | 8d8beb4ea5568d9989a5ebf52e2adc542e17f74e | |
parent | 11609d9fd8ba53f049e6c913d8e3affab6cfc9ce (diff) | |
download | Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.gz Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.zst Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.zip |
Fixes #531 - Title retrieval fails in multiple use cases
see https://github.com/shaarli/Shaarli/issues/531 for details
-rw-r--r-- | application/HttpUtils.php | 60 | ||||
-rw-r--r-- | application/LinkUtils.php | 6 | ||||
-rw-r--r-- | application/Url.php | 42 | ||||
-rw-r--r-- | index.php | 8 | ||||
-rw-r--r-- | tests/HttpUtils/GetHttpUrlTest.php | 27 | ||||
-rw-r--r-- | tests/Url/UrlTest.php | 15 |
6 files changed, 142 insertions, 16 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index af7cb371..0e1ce879 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -27,7 +27,9 @@ | |||
27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | 27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) |
28 | { | 28 | { |
29 | $urlObj = new Url($url); | 29 | $urlObj = new Url($url); |
30 | if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | 30 | $cleanUrl = $urlObj->indToAscii(); |
31 | |||
32 | if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | ||
31 | return array(array(0 => 'Invalid HTTP Url'), false); | 33 | return array(array(0 => 'Invalid HTTP Url'), false); |
32 | } | 34 | } |
33 | 35 | ||
@@ -35,22 +37,27 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | |||
35 | 'http' => array( | 37 | 'http' => array( |
36 | 'method' => 'GET', | 38 | 'method' => 'GET', |
37 | 'timeout' => $timeout, | 39 | 'timeout' => $timeout, |
38 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' | 40 | 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' |
39 | .' Gecko/20100101 Firefox/23.0', | 41 | .' Gecko/20100101 Firefox/45.0', |
40 | 'request_fulluri' => true, | 42 | 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3', |
41 | ) | 43 | ) |
42 | ); | 44 | ); |
43 | 45 | ||
44 | $context = stream_context_create($options); | ||
45 | stream_context_set_default($options); | 46 | stream_context_set_default($options); |
47 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | ||
49 | $options['http']['request_fulluri'] = true; | ||
50 | stream_context_set_default($options); | ||
51 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | ||
52 | } | ||
46 | 53 | ||
47 | list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup()); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | 54 | if (! $headers || strpos($headers[0], '200 OK') === false) { |
49 | return array($headers, false); | 55 | return array($headers, false); |
50 | } | 56 | } |
51 | 57 | ||
52 | try { | 58 | try { |
53 | // TODO: catch Exception in calling code (thumbnailer) | 59 | // TODO: catch Exception in calling code (thumbnailer) |
60 | $context = stream_context_create($options); | ||
54 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); | 61 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); |
55 | } catch (Exception $exc) { | 62 | } catch (Exception $exc) { |
56 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); | 63 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); |
@@ -60,16 +67,19 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | |||
60 | } | 67 | } |
61 | 68 | ||
62 | /** | 69 | /** |
63 | * Retrieve HTTP headers, following n redirections (temporary and permanent). | 70 | * Retrieve HTTP headers, following n redirections (temporary and permanent ones). |
64 | * | 71 | * |
65 | * @param string $url initial URL to reach. | 72 | * @param string $url initial URL to reach. |
66 | * @param int $redirectionLimit max redirection follow.. | 73 | * @param int $redirectionLimit max redirection follow.. |
67 | * | 74 | * |
68 | * @return array | 75 | * @return array HTTP headers, or false if it failed. |
69 | */ | 76 | */ |
70 | function get_redirected_headers($url, $redirectionLimit = 3) | 77 | function get_redirected_headers($url, $redirectionLimit = 3) |
71 | { | 78 | { |
72 | $headers = get_headers($url, 1); | 79 | $headers = get_headers($url, 1); |
80 | if (!empty($headers['location']) && empty($headers['Location'])) { | ||
81 | $headers['Location'] = $headers['location']; | ||
82 | } | ||
73 | 83 | ||
74 | // Headers found, redirection found, and limit not reached. | 84 | // Headers found, redirection found, and limit not reached. |
75 | if ($redirectionLimit-- > 0 | 85 | if ($redirectionLimit-- > 0 |
@@ -79,6 +89,7 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
79 | 89 | ||
80 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; | 90 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; |
81 | if ($redirection != $url) { | 91 | if ($redirection != $url) { |
92 | $redirection = getAbsoluteUrl($url, $redirection); | ||
82 | return get_redirected_headers($redirection, $redirectionLimit); | 93 | return get_redirected_headers($redirection, $redirectionLimit); |
83 | } | 94 | } |
84 | } | 95 | } |
@@ -87,6 +98,35 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
87 | } | 98 | } |
88 | 99 | ||
89 | /** | 100 | /** |
101 | * Get an absolute URL from a complete one, and another absolute/relative URL. | ||
102 | * | ||
103 | * @param string $originalUrl The original complete URL. | ||
104 | * @param string $newUrl The new one, absolute or relative. | ||
105 | * | ||
106 | * @return string Final URL: | ||
107 | * - $newUrl if it was already an absolute URL. | ||
108 | * - if it was relative, absolute URL from $originalUrl path. | ||
109 | */ | ||
110 | function getAbsoluteUrl($originalUrl, $newUrl) | ||
111 | { | ||
112 | $newScheme = parse_url($newUrl, PHP_URL_SCHEME); | ||
113 | // Already an absolute URL. | ||
114 | if (!empty($newScheme)) { | ||
115 | return $newUrl; | ||
116 | } | ||
117 | |||
118 | $parts = parse_url($originalUrl); | ||
119 | $final = $parts['scheme'] .'://'. $parts['host']; | ||
120 | $final .= (!empty($parts['port'])) ? $parts['port'] : ''; | ||
121 | $final .= '/'; | ||
122 | if ($newUrl[0] != '/') { | ||
123 | $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/')); | ||
124 | } | ||
125 | $final .= ltrim($newUrl, '/'); | ||
126 | return $final; | ||
127 | } | ||
128 | |||
129 | /** | ||
90 | * Returns the server's base URL: scheme://domain.tld[:port] | 130 | * Returns the server's base URL: scheme://domain.tld[:port] |
91 | * | 131 | * |
92 | * @param array $server the $_SERVER array | 132 | * @param array $server the $_SERVER array |
diff --git a/application/LinkUtils.php b/application/LinkUtils.php index d8dc8b5e..2df76ba8 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php | |||
@@ -9,8 +9,8 @@ | |||
9 | */ | 9 | */ |
10 | function html_extract_title($html) | 10 | function html_extract_title($html) |
11 | { | 11 | { |
12 | if (preg_match('!<title>(.*?)</title>!is', $html, $matches)) { | 12 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { |
13 | return trim(str_replace("\n", ' ', $matches[1])); | 13 | return trim(str_replace("\n", '', $matches[1])); |
14 | } | 14 | } |
15 | return false; | 15 | return false; |
16 | } | 16 | } |
@@ -70,7 +70,7 @@ function headers_extract_charset($headers) | |||
70 | function html_extract_charset($html) | 70 | function html_extract_charset($html) |
71 | { | 71 | { |
72 | // Get encoding specified in HTML header. | 72 | // Get encoding specified in HTML header. |
73 | preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc); | 73 | preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc); |
74 | if (!empty($enc[1])) { | 74 | if (!empty($enc[1])) { |
75 | return strtolower($enc[1]); | 75 | return strtolower($enc[1]); |
76 | } | 76 | } |
diff --git a/application/Url.php b/application/Url.php index af38c4d9..61a30a78 100644 --- a/application/Url.php +++ b/application/Url.php | |||
@@ -62,7 +62,21 @@ function add_trailing_slash($url) | |||
62 | { | 62 | { |
63 | return $url . (!endsWith($url, '/') ? '/' : ''); | 63 | return $url . (!endsWith($url, '/') ? '/' : ''); |
64 | } | 64 | } |
65 | /** | ||
66 | * Converts an URL with an IDN host to a ASCII one. | ||
67 | * | ||
68 | * @param string $url Input URL. | ||
69 | * | ||
70 | * @return string converted URL. | ||
71 | */ | ||
72 | function url_with_idn_to_ascii($url) | ||
73 | { | ||
74 | $parts = parse_url($url); | ||
75 | $parts['host'] = idn_to_ascii($parts['host']); | ||
65 | 76 | ||
77 | $httpUrl = new \http\Url($parts); | ||
78 | return $httpUrl->toString(); | ||
79 | } | ||
66 | /** | 80 | /** |
67 | * URL representation and cleanup utilities | 81 | * URL representation and cleanup utilities |
68 | * | 82 | * |
@@ -221,6 +235,22 @@ class Url | |||
221 | } | 235 | } |
222 | 236 | ||
223 | /** | 237 | /** |
238 | * Converts an URL with an International Domain Name host to a ASCII one. | ||
239 | * This requires PHP-intl. If it's not available, just returns this->cleanup(). | ||
240 | * | ||
241 | * @return string converted cleaned up URL. | ||
242 | */ | ||
243 | public function indToAscii() | ||
244 | { | ||
245 | $out = $this->cleanup(); | ||
246 | if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) { | ||
247 | return $out; | ||
248 | } | ||
249 | $asciiHost = idn_to_ascii($this->parts['host']); | ||
250 | return str_replace($this->parts['host'], $asciiHost, $out); | ||
251 | } | ||
252 | |||
253 | /** | ||
224 | * Get URL scheme. | 254 | * Get URL scheme. |
225 | * | 255 | * |
226 | * @return string the URL scheme or false if none is provided. | 256 | * @return string the URL scheme or false if none is provided. |
@@ -233,6 +263,18 @@ class Url | |||
233 | } | 263 | } |
234 | 264 | ||
235 | /** | 265 | /** |
266 | * Get URL host. | ||
267 | * | ||
268 | * @return string the URL host or false if none is provided. | ||
269 | */ | ||
270 | public function getHost() { | ||
271 | if (empty($this->parts['host'])) { | ||
272 | return false; | ||
273 | } | ||
274 | return $this->parts['host']; | ||
275 | } | ||
276 | |||
277 | /** | ||
236 | * Test if the Url is an HTTP one. | 278 | * Test if the Url is an HTTP one. |
237 | * | 279 | * |
238 | * @return true is HTTP, false otherwise. | 280 | * @return true is HTTP, false otherwise. |
@@ -1516,7 +1516,7 @@ function renderPage() | |||
1516 | 1516 | ||
1517 | // -------- User want to post a new link: Display link edit form. | 1517 | // -------- User want to post a new link: Display link edit form. |
1518 | if (isset($_GET['post'])) { | 1518 | if (isset($_GET['post'])) { |
1519 | $url = cleanup_url(escape($_GET['post'])); | 1519 | $url = cleanup_url($_GET['post']); |
1520 | 1520 | ||
1521 | $link_is_new = false; | 1521 | $link_is_new = false; |
1522 | // Check if URL is not already in database (in this case, we will edit the existing link) | 1522 | // Check if URL is not already in database (in this case, we will edit the existing link) |
@@ -1541,8 +1541,8 @@ function renderPage() | |||
1541 | // Extract title. | 1541 | // Extract title. |
1542 | $title = html_extract_title($content); | 1542 | $title = html_extract_title($content); |
1543 | // Re-encode title in utf-8 if necessary. | 1543 | // Re-encode title in utf-8 if necessary. |
1544 | if (! empty($title) && $charset != 'utf-8') { | 1544 | if (! empty($title) && strtolower($charset) != 'utf-8') { |
1545 | $title = mb_convert_encoding($title, $charset, 'utf-8'); | 1545 | $title = mb_convert_encoding($title, 'utf-8', $charset); |
1546 | } | 1546 | } |
1547 | } | 1547 | } |
1548 | } | 1548 | } |
@@ -1551,6 +1551,8 @@ function renderPage() | |||
1551 | $url = '?' . smallHash($linkdate); | 1551 | $url = '?' . smallHash($linkdate); |
1552 | $title = 'Note: '; | 1552 | $title = 'Note: '; |
1553 | } | 1553 | } |
1554 | $url = escape($url); | ||
1555 | $title = escape($title); | ||
1554 | 1556 | ||
1555 | $link = array( | 1557 | $link = array( |
1556 | 'linkdate' => $linkdate, | 1558 | 'linkdate' => $linkdate, |
diff --git a/tests/HttpUtils/GetHttpUrlTest.php b/tests/HttpUtils/GetHttpUrlTest.php index fd293505..ea53de5f 100644 --- a/tests/HttpUtils/GetHttpUrlTest.php +++ b/tests/HttpUtils/GetHttpUrlTest.php | |||
@@ -35,4 +35,31 @@ class GetHttpUrlTest extends PHPUnit_Framework_TestCase | |||
35 | $this->assertFalse($headers); | 35 | $this->assertFalse($headers); |
36 | $this->assertFalse($content); | 36 | $this->assertFalse($content); |
37 | } | 37 | } |
38 | |||
39 | /** | ||
40 | * Test getAbsoluteUrl with relative target URL. | ||
41 | */ | ||
42 | public function testGetAbsoluteUrlWithRelative() | ||
43 | { | ||
44 | $origin = 'http://non.existent/blabla/?test'; | ||
45 | $target = '/stuff.php'; | ||
46 | |||
47 | $expected = 'http://non.existent/stuff.php'; | ||
48 | $this->assertEquals($expected, getAbsoluteUrl($origin, $target)); | ||
49 | |||
50 | $target = 'stuff.php'; | ||
51 | $expected = 'http://non.existent/blabla/stuff.php'; | ||
52 | $this->assertEquals($expected, getAbsoluteUrl($origin, $target)); | ||
53 | } | ||
54 | |||
55 | /** | ||
56 | * Test getAbsoluteUrl with absolute target URL. | ||
57 | */ | ||
58 | public function testGetAbsoluteUrlWithAbsolute() | ||
59 | { | ||
60 | $origin = 'http://non.existent/blabla/?test'; | ||
61 | $target = 'http://other.url/stuff.php'; | ||
62 | |||
63 | $this->assertEquals($target, getAbsoluteUrl($origin, $target)); | ||
64 | } | ||
38 | } | 65 | } |
diff --git a/tests/Url/UrlTest.php b/tests/Url/UrlTest.php index a64a73ea..5fdc8617 100644 --- a/tests/Url/UrlTest.php +++ b/tests/Url/UrlTest.php | |||
@@ -181,4 +181,19 @@ class UrlTest extends PHPUnit_Framework_TestCase | |||
181 | $url = new Url('ftp://save.tld/mysave'); | 181 | $url = new Url('ftp://save.tld/mysave'); |
182 | $this->assertFalse($url->isHttp()); | 182 | $this->assertFalse($url->isHttp()); |
183 | } | 183 | } |
184 | |||
185 | /** | ||
186 | * Test IndToAscii. | ||
187 | */ | ||
188 | function testIndToAscii() | ||
189 | { | ||
190 | $ind = 'http://www.académie-française.fr/'; | ||
191 | $expected = 'http://www.xn--acadmie-franaise-npb1a.fr/'; | ||
192 | $url = new Url($ind); | ||
193 | $this->assertEquals($expected, $url->indToAscii()); | ||
194 | |||
195 | $notInd = 'http://www.academie-francaise.fr/'; | ||
196 | $url = new Url($notInd); | ||
197 | $this->assertEquals($notInd, $url->indToAscii()); | ||
198 | } | ||
184 | } | 199 | } |