aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2016-04-06 22:00:52 +0200
committerArthurHoaro <arthur@hoa.ro>2016-05-03 19:51:29 +0200
commitce7b0b6480aa854ee6893f5c889277b0e3b13efc (patch)
tree8d8beb4ea5568d9989a5ebf52e2adc542e17f74e
parent11609d9fd8ba53f049e6c913d8e3affab6cfc9ce (diff)
downloadShaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.gz
Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.zst
Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.zip
Fixes #531 - Title retrieving is failing with multiple use case
see https://github.com/shaarli/Shaarli/issues/531 for details
-rw-r--r--application/HttpUtils.php60
-rw-r--r--application/LinkUtils.php6
-rw-r--r--application/Url.php42
-rw-r--r--index.php8
-rw-r--r--tests/HttpUtils/GetHttpUrlTest.php27
-rw-r--r--tests/Url/UrlTest.php15
6 files changed, 142 insertions, 16 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index af7cb371..0e1ce879 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -27,7 +27,9 @@
27function get_http_response($url, $timeout = 30, $maxBytes = 4194304) 27function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
28{ 28{
29 $urlObj = new Url($url); 29 $urlObj = new Url($url);
30 if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { 30 $cleanUrl = $urlObj->indToAscii();
31
32 if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
31 return array(array(0 => 'Invalid HTTP Url'), false); 33 return array(array(0 => 'Invalid HTTP Url'), false);
32 } 34 }
33 35
@@ -35,22 +37,27 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
35 'http' => array( 37 'http' => array(
36 'method' => 'GET', 38 'method' => 'GET',
37 'timeout' => $timeout, 39 'timeout' => $timeout,
38 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' 40 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
39 .' Gecko/20100101 Firefox/23.0', 41 .' Gecko/20100101 Firefox/45.0',
40 'request_fulluri' => true, 42 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3',
41 ) 43 )
42 ); 44 );
43 45
44 $context = stream_context_create($options);
45 stream_context_set_default($options); 46 stream_context_set_default($options);
47 list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
48 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 $options['http']['request_fulluri'] = true;
50 stream_context_set_default($options);
51 list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
52 }
46 53
47 list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup());
48 if (! $headers || strpos($headers[0], '200 OK') === false) { 54 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 return array($headers, false); 55 return array($headers, false);
50 } 56 }
51 57
52 try { 58 try {
53 // TODO: catch Exception in calling code (thumbnailer) 59 // TODO: catch Exception in calling code (thumbnailer)
60 $context = stream_context_create($options);
54 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); 61 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
55 } catch (Exception $exc) { 62 } catch (Exception $exc) {
56 return array(array(0 => 'HTTP Error'), $exc->getMessage()); 63 return array(array(0 => 'HTTP Error'), $exc->getMessage());
@@ -60,16 +67,19 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
60} 67}
61 68
62/** 69/**
63 * Retrieve HTTP headers, following n redirections (temporary and permanent). 70 * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
64 * 71 *
65 * @param string $url initial URL to reach. 72 * @param string $url initial URL to reach.
66 * @param int $redirectionLimit max redirection follow.. 73 * @param int $redirectionLimit max redirection follow..
67 * 74 *
68 * @return array 75 * @return array HTTP headers, or false if it failed.
69 */ 76 */
70function get_redirected_headers($url, $redirectionLimit = 3) 77function get_redirected_headers($url, $redirectionLimit = 3)
71{ 78{
72 $headers = get_headers($url, 1); 79 $headers = get_headers($url, 1);
80 if (!empty($headers['location']) && empty($headers['Location'])) {
81 $headers['Location'] = $headers['location'];
82 }
73 83
74 // Headers found, redirection found, and limit not reached. 84 // Headers found, redirection found, and limit not reached.
75 if ($redirectionLimit-- > 0 85 if ($redirectionLimit-- > 0
@@ -79,6 +89,7 @@ function get_redirected_headers($url, $redirectionLimit = 3)
79 89
80 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; 90 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
81 if ($redirection != $url) { 91 if ($redirection != $url) {
92 $redirection = getAbsoluteUrl($url, $redirection);
82 return get_redirected_headers($redirection, $redirectionLimit); 93 return get_redirected_headers($redirection, $redirectionLimit);
83 } 94 }
84 } 95 }
@@ -87,6 +98,35 @@ function get_redirected_headers($url, $redirectionLimit = 3)
87} 98}
88 99
89/** 100/**
101 * Get an absolute URL from a complete one, and another absolute/relative URL.
102 *
103 * @param string $originalUrl The original complete URL.
104 * @param string $newUrl The new one, absolute or relative.
105 *
106 * @return string Final URL:
107 * - $newUrl if it was already an absolute URL.
108 * - if it was relative, absolute URL from $originalUrl path.
109 */
function getAbsoluteUrl($originalUrl, $newUrl)
{
    $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
    // Already an absolute URL.
    if (!empty($newScheme)) {
        return $newUrl;
    }

    $parts = parse_url($originalUrl);

    // Scheme-relative target ("//host/path"): it names its own host, so only
    // the scheme is inherited from the original URL.
    if (strncmp($newUrl, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $newUrl;
    }

    $final = $parts['scheme'] .'://'. $parts['host'];
    // The port needs its ':' separator, otherwise it gets glued to the host name.
    $final .= (!empty($parts['port'])) ? ':' . $parts['port'] : '';
    $final .= '/';

    // Path-relative target (or empty one): resolve it against the directory of
    // the original URL, i.e. everything up to and including its last '/'.
    if ($newUrl === '' || $newUrl[0] != '/') {
        // The original URL may have no path at all (e.g. "http://host").
        $path = isset($parts['path']) ? $parts['path'] : '';
        $lastSlash = strrpos($path, '/');
        if ($lastSlash !== false) {
            $final .= ltrim(substr($path, 0, $lastSlash + 1), '/');
        }
    }

    $final .= ltrim($newUrl, '/');
    return $final;
}
128
129/**
90 * Returns the server's base URL: scheme://domain.tld[:port] 130 * Returns the server's base URL: scheme://domain.tld[:port]
91 * 131 *
92 * @param array $server the $_SERVER array 132 * @param array $server the $_SERVER array
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index d8dc8b5e..2df76ba8 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -9,8 +9,8 @@
9 */ 9 */
10function html_extract_title($html) 10function html_extract_title($html)
11{ 11{
12 if (preg_match('!<title>(.*?)</title>!is', $html, $matches)) { 12 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
13 return trim(str_replace("\n", ' ', $matches[1])); 13 return trim(str_replace("\n", '', $matches[1]));
14 } 14 }
15 return false; 15 return false;
16} 16}
@@ -70,7 +70,7 @@ function headers_extract_charset($headers)
70function html_extract_charset($html) 70function html_extract_charset($html)
71{ 71{
72 // Get encoding specified in HTML header. 72 // Get encoding specified in HTML header.
73 preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc); 73 preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc);
74 if (!empty($enc[1])) { 74 if (!empty($enc[1])) {
75 return strtolower($enc[1]); 75 return strtolower($enc[1]);
76 } 76 }
diff --git a/application/Url.php b/application/Url.php
index af38c4d9..61a30a78 100644
--- a/application/Url.php
+++ b/application/Url.php
@@ -62,7 +62,21 @@ function add_trailing_slash($url)
62{ 62{
63 return $url . (!endsWith($url, '/') ? '/' : ''); 63 return $url . (!endsWith($url, '/') ? '/' : '');
64} 64}
/**
 * Converts a URL with an IDN (Internationalized Domain Name) host to an ASCII one.
 *
 * The URL is returned unchanged when it cannot be converted: unparsable URL,
 * no host component, missing PHP-intl / pecl_http support, or a host that
 * idn_to_ascii() rejects.
 *
 * @param string $url Input URL.
 *
 * @return string converted URL, or $url itself if no conversion was possible.
 */
function url_with_idn_to_ascii($url)
{
    $parts = parse_url($url);

    // parse_url() returns false on seriously malformed URLs, and omits the
    // 'host' key for relative ones: in both cases there is nothing to convert.
    if (empty($parts['host'])) {
        return $url;
    }

    // idn_to_ascii() is provided by PHP-intl and \http\Url by pecl_http;
    // neither is guaranteed to be loaded.
    if (! function_exists('idn_to_ascii') || ! class_exists('\http\Url')) {
        return $url;
    }

    $asciiHost = idn_to_ascii($parts['host']);
    // idn_to_ascii() returns false on failure; do not build a URL from it.
    if ($asciiHost === false) {
        return $url;
    }
    $parts['host'] = $asciiHost;

    $httpUrl = new \http\Url($parts);
    return $httpUrl->toString();
}
66/** 80/**
67 * URL representation and cleanup utilities 81 * URL representation and cleanup utilities
68 * 82 *
@@ -221,6 +235,22 @@ class Url
221 } 235 }
222 236
223 /** 237 /**
238 * Converts an URL with an International Domain Name host to a ASCII one.
239 * This requires PHP-intl. If it's not available, just returns this->cleanup().
240 *
241 * @return string converted cleaned up URL.
242 */
public function indToAscii()
{
    $out = $this->cleanup();
    // Without PHP-intl or without a host there is nothing to convert.
    if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) {
        return $out;
    }

    $host = $this->parts['host'];
    $asciiHost = idn_to_ascii($host);
    // idn_to_ascii() returns false on failure; substituting it would erase
    // the host from the URL. An already-ASCII host needs no rewrite either.
    if ($asciiHost === false || $asciiHost === $host) {
        return $out;
    }

    // Rewrite only the host component: look for it after the scheme separator
    // and replace its first occurrence, so that the same string appearing
    // later in the path or query is left untouched.
    $schemeEnd = strpos($out, '://');
    $searchFrom = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
    $hostPos = strpos($out, $host, $searchFrom);
    if ($hostPos === false) {
        return $out;
    }
    return substr_replace($out, $asciiHost, $hostPos, strlen($host));
}
252
253 /**
224 * Get URL scheme. 254 * Get URL scheme.
225 * 255 *
226 * @return string the URL scheme or false if none is provided. 256 * @return string the URL scheme or false if none is provided.
@@ -233,6 +263,18 @@ class Url
233 } 263 }
234 264
235 /** 265 /**
266 * Get URL host.
267 *
268 * @return string the URL host or false if none is provided.
269 */
public function getHost()
{
    // A missing or empty host component is reported as false.
    $hasHost = ! empty($this->parts['host']);

    return $hasHost ? $this->parts['host'] : false;
}
276
277 /**
236 * Test if the Url is an HTTP one. 278 * Test if the Url is an HTTP one.
237 * 279 *
238 * @return true is HTTP, false otherwise. 280 * @return true is HTTP, false otherwise.
diff --git a/index.php b/index.php
index dfc00fbd..41a42cf6 100644
--- a/index.php
+++ b/index.php
@@ -1516,7 +1516,7 @@ function renderPage()
1516 1516
1517 // -------- User want to post a new link: Display link edit form. 1517 // -------- User want to post a new link: Display link edit form.
1518 if (isset($_GET['post'])) { 1518 if (isset($_GET['post'])) {
1519 $url = cleanup_url(escape($_GET['post'])); 1519 $url = cleanup_url($_GET['post']);
1520 1520
1521 $link_is_new = false; 1521 $link_is_new = false;
1522 // Check if URL is not already in database (in this case, we will edit the existing link) 1522 // Check if URL is not already in database (in this case, we will edit the existing link)
@@ -1541,8 +1541,8 @@ function renderPage()
1541 // Extract title. 1541 // Extract title.
1542 $title = html_extract_title($content); 1542 $title = html_extract_title($content);
1543 // Re-encode title in utf-8 if necessary. 1543 // Re-encode title in utf-8 if necessary.
1544 if (! empty($title) && $charset != 'utf-8') { 1544 if (! empty($title) && strtolower($charset) != 'utf-8') {
1545 $title = mb_convert_encoding($title, $charset, 'utf-8'); 1545 $title = mb_convert_encoding($title, 'utf-8', $charset);
1546 } 1546 }
1547 } 1547 }
1548 } 1548 }
@@ -1551,6 +1551,8 @@ function renderPage()
1551 $url = '?' . smallHash($linkdate); 1551 $url = '?' . smallHash($linkdate);
1552 $title = 'Note: '; 1552 $title = 'Note: ';
1553 } 1553 }
1554 $url = escape($url);
1555 $title = escape($title);
1554 1556
1555 $link = array( 1557 $link = array(
1556 'linkdate' => $linkdate, 1558 'linkdate' => $linkdate,
diff --git a/tests/HttpUtils/GetHttpUrlTest.php b/tests/HttpUtils/GetHttpUrlTest.php
index fd293505..ea53de5f 100644
--- a/tests/HttpUtils/GetHttpUrlTest.php
+++ b/tests/HttpUtils/GetHttpUrlTest.php
@@ -35,4 +35,31 @@ class GetHttpUrlTest extends PHPUnit_Framework_TestCase
35 $this->assertFalse($headers); 35 $this->assertFalse($headers);
36 $this->assertFalse($content); 36 $this->assertFalse($content);
37 } 37 }
38
/**
 * getAbsoluteUrl() must resolve a relative target against the origin URL.
 */
public function testGetAbsoluteUrlWithRelative()
{
    $origin = 'http://non.existent/blabla/?test';

    // Root-relative target: only scheme and host come from the origin.
    $this->assertEquals(
        'http://non.existent/stuff.php',
        getAbsoluteUrl($origin, '/stuff.php')
    );

    // Path-relative target: resolved inside the origin's directory.
    $this->assertEquals(
        'http://non.existent/blabla/stuff.php',
        getAbsoluteUrl($origin, 'stuff.php')
    );
}
54
/**
 * getAbsoluteUrl() must return an already absolute target untouched.
 */
public function testGetAbsoluteUrlWithAbsolute()
{
    $absoluteTarget = 'http://other.url/stuff.php';

    $resolved = getAbsoluteUrl('http://non.existent/blabla/?test', $absoluteTarget);

    $this->assertEquals($absoluteTarget, $resolved);
}
38} 65}
diff --git a/tests/Url/UrlTest.php b/tests/Url/UrlTest.php
index a64a73ea..5fdc8617 100644
--- a/tests/Url/UrlTest.php
+++ b/tests/Url/UrlTest.php
@@ -181,4 +181,19 @@ class UrlTest extends PHPUnit_Framework_TestCase
181 $url = new Url('ftp://save.tld/mysave'); 181 $url = new Url('ftp://save.tld/mysave');
182 $this->assertFalse($url->isHttp()); 182 $this->assertFalse($url->isHttp());
183 } 183 }
184
/**
 * Test indToAscii(): an ASCII host is left as is, an IDN host is converted.
 */
public function testIndToAscii()
{
    // A plain ASCII host must come back unchanged, with or without PHP-intl.
    $notInd = 'http://www.academie-francaise.fr/';
    $url = new Url($notInd);
    $this->assertEquals($notInd, $url->indToAscii());

    // The actual conversion relies on PHP-intl: without it indToAscii()
    // returns the URL unconverted, so the assertion below cannot hold.
    if (! function_exists('idn_to_ascii')) {
        $this->markTestSkipped('PHP-intl (idn_to_ascii) is required to convert IDN hosts.');
    }

    $ind = 'http://www.académie-française.fr/';
    $expected = 'http://www.xn--acadmie-franaise-npb1a.fr/';
    $url = new Url($ind);
    $this->assertEquals($expected, $url->indToAscii());
}
184} 199}