aboutsummaryrefslogtreecommitdiffhomepage
path: root/application
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2016-04-06 22:00:52 +0200
committerArthurHoaro <arthur@hoa.ro>2016-05-03 19:51:29 +0200
commitce7b0b6480aa854ee6893f5c889277b0e3b13efc (patch)
tree8d8beb4ea5568d9989a5ebf52e2adc542e17f74e /application
parent11609d9fd8ba53f049e6c913d8e3affab6cfc9ce (diff)
downloadShaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.gz
Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.tar.zst
Shaarli-ce7b0b6480aa854ee6893f5c889277b0e3b13efc.zip
Fixes #531 - Title retrieving is failing with multiple use case
see https://github.com/shaarli/Shaarli/issues/531 for details
Diffstat (limited to 'application')
-rw-r--r--application/HttpUtils.php60
-rw-r--r--application/LinkUtils.php6
-rw-r--r--application/Url.php42
3 files changed, 95 insertions, 13 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index af7cb371..0e1ce879 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -27,7 +27,9 @@
27function get_http_response($url, $timeout = 30, $maxBytes = 4194304) 27function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
28{ 28{
29 $urlObj = new Url($url); 29 $urlObj = new Url($url);
30 if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { 30 $cleanUrl = $urlObj->indToAscii();
31
32 if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) {
31 return array(array(0 => 'Invalid HTTP Url'), false); 33 return array(array(0 => 'Invalid HTTP Url'), false);
32 } 34 }
33 35
@@ -35,22 +37,27 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
35 'http' => array( 37 'http' => array(
36 'method' => 'GET', 38 'method' => 'GET',
37 'timeout' => $timeout, 39 'timeout' => $timeout,
38 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' 40 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
39 .' Gecko/20100101 Firefox/23.0', 41 .' Gecko/20100101 Firefox/45.0',
40 'request_fulluri' => true, 42 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3',
41 ) 43 )
42 ); 44 );
43 45
44 $context = stream_context_create($options);
45 stream_context_set_default($options); 46 stream_context_set_default($options);
47 list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
48 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 $options['http']['request_fulluri'] = true;
50 stream_context_set_default($options);
51 list($headers, $finalUrl) = get_redirected_headers($cleanUrl);
52 }
46 53
47 list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup());
48 if (! $headers || strpos($headers[0], '200 OK') === false) { 54 if (! $headers || strpos($headers[0], '200 OK') === false) {
49 return array($headers, false); 55 return array($headers, false);
50 } 56 }
51 57
52 try { 58 try {
53 // TODO: catch Exception in calling code (thumbnailer) 59 // TODO: catch Exception in calling code (thumbnailer)
60 $context = stream_context_create($options);
54 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); 61 $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes);
55 } catch (Exception $exc) { 62 } catch (Exception $exc) {
56 return array(array(0 => 'HTTP Error'), $exc->getMessage()); 63 return array(array(0 => 'HTTP Error'), $exc->getMessage());
@@ -60,16 +67,19 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304)
60} 67}
61 68
62/** 69/**
63 * Retrieve HTTP headers, following n redirections (temporary and permanent). 70 * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
64 * 71 *
65 * @param string $url initial URL to reach. 72 * @param string $url initial URL to reach.
66 * @param int $redirectionLimit max redirection follow.. 73 * @param int $redirectionLimit max redirection follow..
67 * 74 *
68 * @return array 75 * @return array HTTP headers, or false if it failed.
69 */ 76 */
70function get_redirected_headers($url, $redirectionLimit = 3) 77function get_redirected_headers($url, $redirectionLimit = 3)
71{ 78{
72 $headers = get_headers($url, 1); 79 $headers = get_headers($url, 1);
80 if (!empty($headers['location']) && empty($headers['Location'])) {
81 $headers['Location'] = $headers['location'];
82 }
73 83
74 // Headers found, redirection found, and limit not reached. 84 // Headers found, redirection found, and limit not reached.
75 if ($redirectionLimit-- > 0 85 if ($redirectionLimit-- > 0
@@ -79,6 +89,7 @@ function get_redirected_headers($url, $redirectionLimit = 3)
79 89
80 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; 90 $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
81 if ($redirection != $url) { 91 if ($redirection != $url) {
92 $redirection = getAbsoluteUrl($url, $redirection);
82 return get_redirected_headers($redirection, $redirectionLimit); 93 return get_redirected_headers($redirection, $redirectionLimit);
83 } 94 }
84 } 95 }
@@ -87,6 +98,35 @@ function get_redirected_headers($url, $redirectionLimit = 3)
87} 98}
88 99
89/** 100/**
101 * Get an absolute URL from a complete one, and another absolute/relative URL.
102 *
103 * @param string $originalUrl The original complete URL.
104 * @param string $newUrl The new one, absolute or relative.
105 *
106 * @return string Final URL:
107 * - $newUrl if it was already an absolute URL.
108 * - if it was relative, absolute URL from $originalUrl path.
109 */
110function getAbsoluteUrl($originalUrl, $newUrl)
111{
112 $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
113 // Already an absolute URL.
114 if (!empty($newScheme)) {
115 return $newUrl;
116 }
117
118 $parts = parse_url($originalUrl);
119 $final = $parts['scheme'] .'://'. $parts['host'];
120 $final .= (!empty($parts['port'])) ? $parts['port'] : '';
121 $final .= '/';
122 if ($newUrl[0] != '/') {
123 $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/'));
124 }
125 $final .= ltrim($newUrl, '/');
126 return $final;
127}
128
129/**
90 * Returns the server's base URL: scheme://domain.tld[:port] 130 * Returns the server's base URL: scheme://domain.tld[:port]
91 * 131 *
92 * @param array $server the $_SERVER array 132 * @param array $server the $_SERVER array
diff --git a/application/LinkUtils.php b/application/LinkUtils.php
index d8dc8b5e..2df76ba8 100644
--- a/application/LinkUtils.php
+++ b/application/LinkUtils.php
@@ -9,8 +9,8 @@
9 */ 9 */
10function html_extract_title($html) 10function html_extract_title($html)
11{ 11{
12 if (preg_match('!<title>(.*?)</title>!is', $html, $matches)) { 12 if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) {
13 return trim(str_replace("\n", ' ', $matches[1])); 13 return trim(str_replace("\n", '', $matches[1]));
14 } 14 }
15 return false; 15 return false;
16} 16}
@@ -70,7 +70,7 @@ function headers_extract_charset($headers)
70function html_extract_charset($html) 70function html_extract_charset($html)
71{ 71{
72 // Get encoding specified in HTML header. 72 // Get encoding specified in HTML header.
73 preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc); 73 preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc);
74 if (!empty($enc[1])) { 74 if (!empty($enc[1])) {
75 return strtolower($enc[1]); 75 return strtolower($enc[1]);
76 } 76 }
diff --git a/application/Url.php b/application/Url.php
index af38c4d9..61a30a78 100644
--- a/application/Url.php
+++ b/application/Url.php
@@ -62,7 +62,21 @@ function add_trailing_slash($url)
62{ 62{
63 return $url . (!endsWith($url, '/') ? '/' : ''); 63 return $url . (!endsWith($url, '/') ? '/' : '');
64} 64}
65/**
66 * Converts an URL with an IDN host to a ASCII one.
67 *
68 * @param string $url Input URL.
69 *
70 * @return string converted URL.
71 */
72function url_with_idn_to_ascii($url)
73{
74 $parts = parse_url($url);
75 $parts['host'] = idn_to_ascii($parts['host']);
65 76
77 $httpUrl = new \http\Url($parts);
78 return $httpUrl->toString();
79}
66/** 80/**
67 * URL representation and cleanup utilities 81 * URL representation and cleanup utilities
68 * 82 *
@@ -221,6 +235,22 @@ class Url
221 } 235 }
222 236
223 /** 237 /**
238 * Converts an URL with an International Domain Name host to a ASCII one.
239 * This requires PHP-intl. If it's not available, just returns this->cleanup().
240 *
241 * @return string converted cleaned up URL.
242 */
243 public function indToAscii()
244 {
245 $out = $this->cleanup();
246 if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) {
247 return $out;
248 }
249 $asciiHost = idn_to_ascii($this->parts['host']);
250 return str_replace($this->parts['host'], $asciiHost, $out);
251 }
252
253 /**
224 * Get URL scheme. 254 * Get URL scheme.
225 * 255 *
226 * @return string the URL scheme or false if none is provided. 256 * @return string the URL scheme or false if none is provided.
@@ -233,6 +263,18 @@ class Url
233 } 263 }
234 264
235 /** 265 /**
266 * Get URL host.
267 *
268 * @return string the URL host or false if none is provided.
269 */
270 public function getHost() {
271 if (empty($this->parts['host'])) {
272 return false;
273 }
274 return $this->parts['host'];
275 }
276
277 /**
236 * Test if the Url is an HTTP one. 278 * Test if the Url is an HTTP one.
237 * 279 *
238 * @return true is HTTP, false otherwise. 280 * @return true is HTTP, false otherwise.