diff options
author | Arthur <arthur@hoa.ro> | 2016-05-03 19:53:57 +0200 |
---|---|---|
committer | Arthur <arthur@hoa.ro> | 2016-05-03 19:53:57 +0200 |
commit | 47be06098396b5eef35234b88227d64ab81bd988 (patch) | |
tree | bb42c742179e75c7f15c4126ddf79838ceb94331 /application | |
parent | 5a63c34f3a68ce2f53ee9164e2a35a0904676399 (diff) | |
parent | ce7b0b6480aa854ee6893f5c889277b0e3b13efc (diff) | |
download | Shaarli-47be06098396b5eef35234b88227d64ab81bd988.tar.gz Shaarli-47be06098396b5eef35234b88227d64ab81bd988.tar.zst Shaarli-47be06098396b5eef35234b88227d64ab81bd988.zip |
Merge pull request #532 from ArthurHoaro/hotfix/title-retrieve-the-return
Fixes #531 - Title retrieval fails in multiple use cases
Diffstat (limited to 'application')
-rw-r--r-- | application/HttpUtils.php | 60 | ||||
-rw-r--r-- | application/LinkUtils.php | 6 | ||||
-rw-r--r-- | application/Url.php | 42 |
3 files changed, 95 insertions, 13 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index af7cb371..0e1ce879 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -27,7 +27,9 @@ | |||
27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | 27 | function get_http_response($url, $timeout = 30, $maxBytes = 4194304) |
28 | { | 28 | { |
29 | $urlObj = new Url($url); | 29 | $urlObj = new Url($url); |
30 | if (! filter_var($url, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | 30 | $cleanUrl = $urlObj->indToAscii(); |
31 | |||
32 | if (! filter_var($cleanUrl, FILTER_VALIDATE_URL) || ! $urlObj->isHttp()) { | ||
31 | return array(array(0 => 'Invalid HTTP Url'), false); | 33 | return array(array(0 => 'Invalid HTTP Url'), false); |
32 | } | 34 | } |
33 | 35 | ||
@@ -35,22 +37,27 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | |||
35 | 'http' => array( | 37 | 'http' => array( |
36 | 'method' => 'GET', | 38 | 'method' => 'GET', |
37 | 'timeout' => $timeout, | 39 | 'timeout' => $timeout, |
38 | 'user_agent' => 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:23.0)' | 40 | 'user_agent' => 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)' |
39 | .' Gecko/20100101 Firefox/23.0', | 41 | .' Gecko/20100101 Firefox/45.0', |
40 | 'request_fulluri' => true, | 42 | 'accept_language' => substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3', |
41 | ) | 43 | ) |
42 | ); | 44 | ); |
43 | 45 | ||
44 | $context = stream_context_create($options); | ||
45 | stream_context_set_default($options); | 46 | stream_context_set_default($options); |
47 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | ||
49 | $options['http']['request_fulluri'] = true; | ||
50 | stream_context_set_default($options); | ||
51 | list($headers, $finalUrl) = get_redirected_headers($cleanUrl); | ||
52 | } | ||
46 | 53 | ||
47 | list($headers, $finalUrl) = get_redirected_headers($urlObj->cleanup()); | ||
48 | if (! $headers || strpos($headers[0], '200 OK') === false) { | 54 | if (! $headers || strpos($headers[0], '200 OK') === false) { |
49 | return array($headers, false); | 55 | return array($headers, false); |
50 | } | 56 | } |
51 | 57 | ||
52 | try { | 58 | try { |
53 | // TODO: catch Exception in calling code (thumbnailer) | 59 | // TODO: catch Exception in calling code (thumbnailer) |
60 | $context = stream_context_create($options); | ||
54 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); | 61 | $content = file_get_contents($finalUrl, false, $context, -1, $maxBytes); |
55 | } catch (Exception $exc) { | 62 | } catch (Exception $exc) { |
56 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); | 63 | return array(array(0 => 'HTTP Error'), $exc->getMessage()); |
@@ -60,16 +67,19 @@ function get_http_response($url, $timeout = 30, $maxBytes = 4194304) | |||
60 | } | 67 | } |
61 | 68 | ||
62 | /** | 69 | /** |
63 | * Retrieve HTTP headers, following n redirections (temporary and permanent). | 70 | * Retrieve HTTP headers, following n redirections (temporary and permanent ones). |
64 | * | 71 | * |
65 | * @param string $url initial URL to reach. | 72 | * @param string $url initial URL to reach. |
65 | * @param int $redirectionLimit maximum number of redirections to follow. | 73 | * @param int $redirectionLimit maximum number of redirections to follow. |
67 | * | 74 | * |
68 | * @return array | 75 | * @return array HTTP headers, or false if it failed. |
69 | */ | 76 | */ |
70 | function get_redirected_headers($url, $redirectionLimit = 3) | 77 | function get_redirected_headers($url, $redirectionLimit = 3) |
71 | { | 78 | { |
72 | $headers = get_headers($url, 1); | 79 | $headers = get_headers($url, 1); |
80 | if (!empty($headers['location']) && empty($headers['Location'])) { | ||
81 | $headers['Location'] = $headers['location']; | ||
82 | } | ||
73 | 83 | ||
74 | // Headers found, redirection found, and limit not reached. | 84 | // Headers found, redirection found, and limit not reached. |
75 | if ($redirectionLimit-- > 0 | 85 | if ($redirectionLimit-- > 0 |
@@ -79,6 +89,7 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
79 | 89 | ||
80 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; | 90 | $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location']; |
81 | if ($redirection != $url) { | 91 | if ($redirection != $url) { |
92 | $redirection = getAbsoluteUrl($url, $redirection); | ||
82 | return get_redirected_headers($redirection, $redirectionLimit); | 93 | return get_redirected_headers($redirection, $redirectionLimit); |
83 | } | 94 | } |
84 | } | 95 | } |
@@ -87,6 +98,35 @@ function get_redirected_headers($url, $redirectionLimit = 3) | |||
87 | } | 98 | } |
88 | 99 | ||
89 | /** | 100 | /** |
101 | * Get an absolute URL from a complete one, and another absolute/relative URL. | ||
102 | * | ||
103 | * @param string $originalUrl The original complete URL. | ||
104 | * @param string $newUrl The new one, absolute or relative. | ||
105 | * | ||
106 | * @return string Final URL: | ||
107 | * - $newUrl if it was already an absolute URL. | ||
108 | * - if it was relative, absolute URL from $originalUrl path. | ||
109 | */ | ||
110 | function getAbsoluteUrl($originalUrl, $newUrl) | ||
111 | { | ||
112 | $newScheme = parse_url($newUrl, PHP_URL_SCHEME); | ||
113 | // Already an absolute URL. | ||
114 | if (!empty($newScheme)) { | ||
115 | return $newUrl; | ||
116 | } | ||
117 | |||
118 | $parts = parse_url($originalUrl); | ||
119 | $final = $parts['scheme'] .'://'. $parts['host']; | ||
120 | $final .= (!empty($parts['port'])) ? $parts['port'] : ''; | ||
121 | $final .= '/'; | ||
122 | if ($newUrl[0] != '/') { | ||
123 | $final .= substr(ltrim($parts['path'], '/'), 0, strrpos($parts['path'], '/')); | ||
124 | } | ||
125 | $final .= ltrim($newUrl, '/'); | ||
126 | return $final; | ||
127 | } | ||
128 | |||
129 | /** | ||
90 | * Returns the server's base URL: scheme://domain.tld[:port] | 130 | * Returns the server's base URL: scheme://domain.tld[:port] |
91 | * | 131 | * |
92 | * @param array $server the $_SERVER array | 132 | * @param array $server the $_SERVER array |
diff --git a/application/LinkUtils.php b/application/LinkUtils.php index d8dc8b5e..2df76ba8 100644 --- a/application/LinkUtils.php +++ b/application/LinkUtils.php | |||
@@ -9,8 +9,8 @@ | |||
9 | */ | 9 | */ |
10 | function html_extract_title($html) | 10 | function html_extract_title($html) |
11 | { | 11 | { |
12 | if (preg_match('!<title>(.*?)</title>!is', $html, $matches)) { | 12 | if (preg_match('!<title.*?>(.*?)</title>!is', $html, $matches)) { |
13 | return trim(str_replace("\n", ' ', $matches[1])); | 13 | return trim(str_replace("\n", '', $matches[1])); |
14 | } | 14 | } |
15 | return false; | 15 | return false; |
16 | } | 16 | } |
@@ -70,7 +70,7 @@ function headers_extract_charset($headers) | |||
70 | function html_extract_charset($html) | 70 | function html_extract_charset($html) |
71 | { | 71 | { |
72 | // Get encoding specified in HTML header. | 72 | // Get encoding specified in HTML header. |
73 | preg_match('#<meta .*charset="?([^">/]+)"? */?>#Usi', $html, $enc); | 73 | preg_match('#<meta .*charset=["\']?([^";\'>/]+)["\']? */?>#Usi', $html, $enc); |
74 | if (!empty($enc[1])) { | 74 | if (!empty($enc[1])) { |
75 | return strtolower($enc[1]); | 75 | return strtolower($enc[1]); |
76 | } | 76 | } |
diff --git a/application/Url.php b/application/Url.php index af38c4d9..61a30a78 100644 --- a/application/Url.php +++ b/application/Url.php | |||
@@ -62,7 +62,21 @@ function add_trailing_slash($url) | |||
62 | { | 62 | { |
63 | return $url . (!endsWith($url, '/') ? '/' : ''); | 63 | return $url . (!endsWith($url, '/') ? '/' : ''); |
64 | } | 64 | } |
65 | /** | ||
66 | * Converts a URL with an IDN host to an ASCII one. | ||
67 | * | ||
68 | * @param string $url Input URL. | ||
69 | * | ||
70 | * @return string converted URL. | ||
71 | */ | ||
72 | function url_with_idn_to_ascii($url) | ||
73 | { | ||
74 | $parts = parse_url($url); | ||
75 | $parts['host'] = idn_to_ascii($parts['host']); | ||
65 | 76 | ||
77 | $httpUrl = new \http\Url($parts); | ||
78 | return $httpUrl->toString(); | ||
79 | } | ||
66 | /** | 80 | /** |
67 | * URL representation and cleanup utilities | 81 | * URL representation and cleanup utilities |
68 | * | 82 | * |
@@ -221,6 +235,22 @@ class Url | |||
221 | } | 235 | } |
222 | 236 | ||
223 | /** | 237 | /** |
238 | * Converts a URL with an International Domain Name host to an ASCII one. | ||
239 | * This requires PHP-intl. If it's not available, just returns this->cleanup(). | ||
240 | * | ||
241 | * @return string converted cleaned up URL. | ||
242 | */ | ||
243 | public function indToAscii() | ||
244 | { | ||
245 | $out = $this->cleanup(); | ||
246 | if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) { | ||
247 | return $out; | ||
248 | } | ||
249 | $asciiHost = idn_to_ascii($this->parts['host']); | ||
250 | return str_replace($this->parts['host'], $asciiHost, $out); | ||
251 | } | ||
252 | |||
253 | /** | ||
224 | * Get URL scheme. | 254 | * Get URL scheme. |
225 | * | 255 | * |
226 | * @return string the URL scheme or false if none is provided. | 256 | * @return string the URL scheme or false if none is provided. |
@@ -233,6 +263,18 @@ class Url | |||
233 | } | 263 | } |
234 | 264 | ||
235 | /** | 265 | /** |
266 | * Get URL host. | ||
267 | * | ||
268 | * @return string the URL host or false if none is provided. | ||
269 | */ | ||
270 | public function getHost() { | ||
271 | if (empty($this->parts['host'])) { | ||
272 | return false; | ||
273 | } | ||
274 | return $this->parts['host']; | ||
275 | } | ||
276 | |||
277 | /** | ||
236 | * Test if the Url is an HTTP one. | 278 | * Test if the Url is an HTTP one. |
237 | * | 279 | * |
238 | * @return true is HTTP, false otherwise. | 280 | * @return true is HTTP, false otherwise. |