diff options
Diffstat (limited to 'application/http')
-rw-r--r-- | application/http/Base64Url.php | 35 | ||||
-rw-r--r-- | application/http/HttpUtils.php | 479 | ||||
-rw-r--r-- | application/http/Url.php | 217 | ||||
-rw-r--r-- | application/http/UrlUtils.php | 88 |
4 files changed, 819 insertions, 0 deletions
diff --git a/application/http/Base64Url.php b/application/http/Base64Url.php new file mode 100644 index 00000000..33fa7c1f --- /dev/null +++ b/application/http/Base64Url.php | |||
@@ -0,0 +1,35 @@ | |||
1 | <?php | ||
2 | |||
3 | namespace Shaarli\Http; | ||
4 | |||
/**
 * URL-safe Base64 operations
 *
 * The URL-safe alphabet replaces '+' with '-' and '/' with '_', and drops the
 * trailing '=' padding.
 *
 * @see https://en.wikipedia.org/wiki/Base64#URL_applications
 */
class Base64Url
{
    /**
     * Base64Url-encodes data
     *
     * @param string $data Data to encode
     *
     * @return string Base64Url-encoded data (no '=' padding)
     */
    public static function encode($data)
    {
        return rtrim(strtr(base64_encode($data), '+/', '-_'), '=');
    }

    /**
     * Decodes Base64Url-encoded data
     *
     * @param string $data Data to decode
     *
     * @return string Decoded data
     */
    public static function decode($data)
    {
        $base64 = strtr($data, '-_', '+/');

        // Restore the padding stripped by encode(): a Base64 string length
        // must be a multiple of 4.
        $remainder = strlen($base64) % 4;
        if ($remainder > 0) {
            $base64 .= str_repeat('=', 4 - $remainder);
        }

        return base64_decode($base64);
    }
}
diff --git a/application/http/HttpUtils.php b/application/http/HttpUtils.php new file mode 100644 index 00000000..2ea9195d --- /dev/null +++ b/application/http/HttpUtils.php | |||
@@ -0,0 +1,479 @@ | |||
1 | <?php | ||
2 | |||
3 | use Shaarli\Http\Url; | ||
4 | |||
/**
 * GET an HTTP URL to retrieve its content
 * Uses the cURL library or a fallback method
 *
 * @param string          $url               URL to get (http://...)
 * @param int             $timeout           network timeout (in seconds)
 * @param int             $maxBytes          maximum downloaded bytes (default: 4 MiB)
 * @param callable|string $curlWriteFunction Optional callback called during the download (cURL CURLOPT_WRITEFUNCTION).
 *                                           Can be used to add download conditions on the
 *                                           headers (response code, content type, etc.).
 *                                           Ignored by the fallback method.
 *
 * @return array HTTP response headers, downloaded content
 *
 * Output format:
 *  [0] = associative array containing HTTP response headers
 *        (false when the host could not be resolved)
 *  [1] = URL content (downloaded data), false on error
 *
 * Example:
 *  list($headers, $data) = get_http_response('http://sebauvage.net/');
 *  if (strpos($headers[0], '200 OK') !== false) {
 *      echo 'Data type: '.htmlspecialchars($headers['Content-Type']);
 *  } else {
 *      echo 'There was an error: '.htmlspecialchars($headers[0]);
 *  }
 *
 * @see https://secure.php.net/manual/en/ref.curl.php
 * @see https://secure.php.net/manual/en/functions.anonymous.php
 * @see https://secure.php.net/manual/en/function.preg-split.php
 * @see https://secure.php.net/manual/en/function.explode.php
 * @see http://stackoverflow.com/q/17641073
 * @see http://stackoverflow.com/q/9183178
 * @see http://stackoverflow.com/q/1462720
 */
function get_http_response($url, $timeout = 30, $maxBytes = 4194304, $curlWriteFunction = null)
{
    $urlObj = new Url($url);
    $cleanUrl = $urlObj->idnToAscii();

    if (!filter_var($cleanUrl, FILTER_VALIDATE_URL) || !$urlObj->isHttp()) {
        return array(array(0 => 'Invalid HTTP UrlUtils'), false);
    }

    $userAgent =
        'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:45.0)'
        . ' Gecko/20100101 Firefox/45.0';
    $acceptLanguage =
        substr(setlocale(LC_COLLATE, 0), 0, 2) . ',en-US;q=0.7,en;q=0.3';
    $maxRedirs = 3;

    if (!function_exists('curl_init')) {
        return get_http_response_fallback(
            $cleanUrl,
            $timeout,
            $maxBytes,
            $userAgent,
            $acceptLanguage,
            $maxRedirs
        );
    }

    $ch = curl_init($cleanUrl);
    if ($ch === false) {
        return array(array(0 => 'curl_init() error'), false);
    }

    // General cURL settings
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt(
        $ch,
        CURLOPT_HTTPHEADER,
        array('Accept-Language: ' . $acceptLanguage)
    );
    curl_setopt($ch, CURLOPT_MAXREDIRS, $maxRedirs);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);

    if (is_callable($curlWriteFunction)) {
        curl_setopt($ch, CURLOPT_WRITEFUNCTION, $curlWriteFunction);
    }

    // Max download size management
    curl_setopt($ch, CURLOPT_BUFFERSIZE, 1024*16);
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);

    // The progress callback receives 4 arguments before PHP 5.5 and 5 from
    // PHP 5.5 on, which shifts the position of the "downloaded bytes" value.
    // The running PHP version cannot change during the transfer, so it is
    // checked once here rather than on every callback invocation.
    $legacyProgressSignature = version_compare(phpversion(), '5.5', '<');
    curl_setopt(
        $ch,
        CURLOPT_PROGRESSFUNCTION,
        function ($arg0, $arg1, $arg2, $arg3, $arg4 = 0) use ($maxBytes, $legacyProgressSignature) {
            $downloaded = $legacyProgressSignature ? $arg1 : $arg2;
            // Non-zero return stops downloading
            return ($downloaded > $maxBytes) ? 1 : 0;
        }
    );

    $response = curl_exec($ch);
    $errorNo = curl_errno($ch);
    $errorStr = curl_error($ch);
    $headSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    if ($response === false) {
        if ($errorNo == CURLE_COULDNT_RESOLVE_HOST) {
            /*
             * Workaround to match fallback method behaviour
             * Removing this would require updating
             * GetHttpUrlTest::testGetInvalidRemoteUrl()
             */
            return array(false, false);
        }
        return array(array(0 => 'curl_exec() error: ' . $errorStr), false);
    }

    // Formatting output like the fallback method
    $rawHeaders = substr($response, 0, $headSize);

    // Keep only headers from latest redirection
    $rawHeadersArrayRedirs = explode("\r\n\r\n", trim($rawHeaders));
    $rawHeadersLastRedir = end($rawHeadersArrayRedirs);

    $content = substr($response, $headSize);
    $headers = array();
    foreach (preg_split('~[\r\n]+~', $rawHeadersLastRedir) as $line) {
        if (empty($line) || ctype_space($line)) {
            continue;
        }
        $splitLine = explode(': ', $line, 2);
        if (count($splitLine) > 1) {
            $key = $splitLine[0];
            $value = $splitLine[1];
            if (array_key_exists($key, $headers)) {
                // Repeated header: collect every value in a list
                if (!is_array($headers[$key])) {
                    $headers[$key] = array(0 => $headers[$key]);
                }
                $headers[$key][] = $value;
            } else {
                $headers[$key] = $value;
            }
        } else {
            // Lines without "name: value" (e.g. the status line) get a numeric key
            $headers[] = $splitLine[0];
        }
    }

    return array($headers, $content);
}
158 | |||
/**
 * GET an HTTP URL to retrieve its content (fallback method)
 *
 * Relies on PHP stream wrappers (get_headers() / file_get_contents()).
 * Note that it changes the default stream context of the running script.
 *
 * @param string $cleanUrl       URL to get (http://... valid and in ASCII form)
 * @param int    $timeout        network timeout (in seconds)
 * @param int    $maxBytes       maximum downloaded bytes
 * @param string $userAgent      "User-Agent" header
 * @param string $acceptLanguage "Accept-Language" header
 * @param int    $maxRedr        maximum amount of redirections followed
 *
 * @return array HTTP response headers, downloaded content
 *
 * Output format:
 *  [0] = associative array containing HTTP response headers
 *        (false if they could not be retrieved)
 *  [1] = URL content (downloaded data), false on failure
 *
 * @see http://php.net/manual/en/function.file-get-contents.php
 * @see http://php.net/manual/en/function.stream-context-create.php
 * @see http://php.net/manual/en/function.get-headers.php
 */
function get_http_response_fallback(
    $cleanUrl,
    $timeout,
    $maxBytes,
    $userAgent,
    $acceptLanguage,
    $maxRedr
) {
    $options = array(
        'http' => array(
            'method' => 'GET',
            'timeout' => $timeout,
            'user_agent' => $userAgent,
            'header' => "Accept: */*\r\n"
                . 'Accept-Language: ' . $acceptLanguage
        )
    );

    // get_headers() is called without a context argument, so the request
    // options are installed as the default stream context.
    stream_context_set_default($options);
    list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
    if (! $headers || strpos($headers[0], '200 OK') === false) {
        // Second attempt, sending the full URI in the request line
        $options['http']['request_fulluri'] = true;
        stream_context_set_default($options);
        list($headers, $finalUrl) = get_redirected_headers($cleanUrl, $maxRedr);
    }

    if (! $headers) {
        return array($headers, false);
    }

    try {
        // TODO: catch Exception in calling code (thumbnailer)
        $context = stream_context_create($options);
        // Offset must be 0: since PHP 7.1 a negative offset asks for a seek
        // from the end of the stream, which remote streams do not support.
        $content = file_get_contents($finalUrl, false, $context, 0, $maxBytes);
    } catch (Exception $exc) {
        return array(array(0 => 'HTTP Error'), $exc->getMessage());
    }

    return array($headers, $content);
}
219 | |||
/**
 * Retrieve HTTP headers, following n redirections (temporary and permanent ones).
 *
 * @param string $url              initial URL to reach.
 * @param int    $redirectionLimit max redirection follow.
 *
 * @return array [0] = HTTP headers as returned by get_headers() (false if it failed),
 *               [1] = URL the returned headers were requested from.
 */
function get_redirected_headers($url, $redirectionLimit = 3)
{
    $headers = get_headers($url, 1);
    // Normalise a lower-case "location" header name
    if (!empty($headers['location']) && empty($headers['Location'])) {
        $headers['Location'] = $headers['location'];
    }

    // Headers found, redirection found, and limit not reached.
    // 303, 307 and 308 are redirections too, not only 301 and 302; the code
    // is read from the status line rather than searched anywhere in it.
    $isRedirection = !empty($headers)
        && isset($headers[0])
        && preg_match('~^HTTP/\S+\s+30[12378]\b~', $headers[0]) === 1
        && !empty($headers['Location']);

    if ($redirectionLimit > 0 && $isRedirection) {
        // With several Location headers, the last one is the most recent hop
        $redirection = is_array($headers['Location']) ? end($headers['Location']) : $headers['Location'];
        if ($redirection != $url) {
            $redirection = getAbsoluteUrl($url, $redirection);
            return get_redirected_headers($redirection, $redirectionLimit - 1);
        }
    }

    return array($headers, $url);
}
249 | |||
/**
 * Get an absolute URL from a complete one, and another absolute/relative URL.
 *
 * @param string $originalUrl The original complete URL (scheme and host are required).
 * @param string $newUrl      The new one, absolute or relative.
 *
 * @return string Final URL:
 *                  - $newUrl if it was already an absolute URL.
 *                  - $newUrl prefixed by the original scheme if it is protocol-relative (//host/...).
 *                  - if it was relative, absolute URL from $originalUrl path.
 */
function getAbsoluteUrl($originalUrl, $newUrl)
{
    $newScheme = parse_url($newUrl, PHP_URL_SCHEME);
    // Already an absolute URL.
    if (!empty($newScheme)) {
        return $newUrl;
    }

    $parts = parse_url($originalUrl);

    // Protocol-relative URL: only the scheme is inherited.
    if (strncmp($newUrl, '//', 2) === 0) {
        return $parts['scheme'] . ':' . $newUrl;
    }

    $final = $parts['scheme'] . '://' . $parts['host'];
    // The port must be separated from the host by a colon.
    if (!empty($parts['port'])) {
        $final .= ':' . $parts['port'];
    }
    $final .= '/';

    // Path-relative URL: resolve against the directory of the original path.
    if ($newUrl === '' || $newUrl[0] != '/') {
        $path = isset($parts['path']) ? $parts['path'] : '/';
        $lastSlash = strrpos($path, '/');
        $directory = ($lastSlash === false) ? '' : substr($path, 0, $lastSlash + 1);
        $final .= ltrim($directory, '/');
    }

    $final .= ltrim($newUrl, '/');
    return $final;
}
278 | |||
/**
 * Returns the server's base URL: scheme://domain.tld[:port]
 *
 * When the X-Forwarded-Proto header is present, the forwarded scheme, port
 * and host are used; with comma-separated values, the first one is kept.
 * Otherwise the URL is built from HTTPS, SERVER_PORT and SERVER_NAME.
 *
 * @param array $server the $_SERVER array
 *
 * @return string the server's base URL
 *
 * @see http://www.ietf.org/rfc/rfc7239.txt
 * @see http://www.ietf.org/rfc/rfc6648.txt
 * @see http://stackoverflow.com/a/3561399
 * @see http://stackoverflow.com/q/452375
 */
function server_url($server)
{
    $scheme = 'http';
    $port = '';

    // First entry of a comma-separated header value; a value without comma
    // is returned untouched.
    $firstValue = function ($headerValue) {
        if (strpos($headerValue, ',') === false) {
            return $headerValue;
        }
        $values = explode(',', $headerValue);
        return trim($values[0]);
    };

    // Shaarli is served behind a proxy
    if (isset($server['HTTP_X_FORWARDED_PROTO'])) {
        // Keep forwarded scheme
        $scheme = $firstValue($server['HTTP_X_FORWARDED_PROTO']);

        if (isset($server['HTTP_X_FORWARDED_PORT'])) {
            // Keep forwarded port
            $port = $firstValue($server['HTTP_X_FORWARDED_PORT']);

            // This is a workaround for proxies that don't forward the scheme properly.
            // Connecting over port 443 has to be in HTTPS.
            // See https://github.com/shaarli/Shaarli/issues/1022
            if ($port == '443') {
                $scheme = 'https';
            }

            // Do not append standard port values
            if (($scheme == 'http' && $port != '80')
                || ($scheme == 'https' && $port != '443')
            ) {
                $port = ':' . $port;
            } else {
                $port = '';
            }
        }

        // Keep forwarded host, if any
        if (isset($server['HTTP_X_FORWARDED_HOST'])) {
            $host = $firstValue($server['HTTP_X_FORWARDED_HOST']);
        } else {
            $host = $server['SERVER_NAME'];
        }

        return $scheme . '://' . $host . $port;
    }

    // SERVER_PORT may be missing: in that case no port is appended instead
    // of producing a dangling ':'.
    $serverPort = isset($server['SERVER_PORT']) ? $server['SERVER_PORT'] : null;
    $hasPort = ($serverPort !== null && $serverPort !== '');

    // SSL detection
    if ((! empty($server['HTTPS']) && strtolower($server['HTTPS']) == 'on')
        || ($hasPort && $serverPort == '443')) {
        $scheme = 'https';
    }

    // Do not append standard port values
    if ($hasPort
        && (($scheme == 'http' && $serverPort != '80')
            || ($scheme == 'https' && $serverPort != '443'))
    ) {
        $port = ':' . $serverPort;
    }

    return $scheme . '://' . $server['SERVER_NAME'] . $port;
}
360 | |||
/**
 * Builds the absolute URL of the running script, query string excluded.
 *
 * A trailing "index.php" is stripped from the script name so that the
 * resulting URL looks nicer.
 *
 * @param array $server the $_SERVER array
 *
 * @return string the absolute URL of the current script, without the query
 */
function index_url($server)
{
    $script = $server['SCRIPT_NAME'];

    // 9 is the length of "index.php"
    $path = endsWith($script, 'index.php') ? substr($script, 0, -9) : $script;

    return server_url($server) . $path;
}
378 | |||
/**
 * Builds the absolute URL of the running script, query string included.
 *
 * The script part is produced by index_url(), so a trailing "index.php"
 * is removed; a non-empty QUERY_STRING is appended after a '?'.
 *
 * @param array $server the $_SERVER array
 *
 * @return string the absolute URL of the current script, with the query
 */
function page_url($server)
{
    if (empty($server['QUERY_STRING'])) {
        return index_url($server);
    }

    return index_url($server) . '?' . $server['QUERY_STRING'];
}
395 | |||
/**
 * Extract the client IP forwarded by the reverse proxy.
 *
 * The X-Forwarded-For header is split on commas, trusted addresses are
 * removed, and the last remaining address is returned.
 *
 * Inspired from: https://github.com/zendframework/zend-http/blob/master/src/PhpEnvironment/RemoteAddress.php
 *
 * @param array $server     $_SERVER array which contains HTTP headers.
 * @param array $trustedIps List of trusted IP from the configuration.
 *
 * @return string|bool The forwarded IP, or false if none could be extracted.
 */
function getIpAddressFromProxy($server, $trustedIps)
{
    $headerName = 'HTTP_X_FORWARDED_FOR';
    if (empty($server[$headerName])) {
        return false;
    }

    // Drop every address that belongs to a trusted proxy
    $candidates = array_diff(
        preg_split('/\s*,\s*/', $server[$headerName]),
        $trustedIps
    );

    return empty($candidates) ? false : array_pop($candidates);
}
421 | |||
422 | |||
/**
 * Build an identifier from the advertised client IP address(es)
 *
 * REMOTE_ADDR is always used; the X-Forwarded-For and Client-IP header
 * values are appended (in that order, separated by '_') when they are set.
 * This aims at preventing session hijacking from users behind the same proxy
 * by relying on HTTP headers.
 *
 * See:
 * - https://secure.php.net/manual/en/reserved.variables.server.php
 * - https://stackoverflow.com/questions/3003145/how-to-get-the-client-ip-address-in-php
 * - https://stackoverflow.com/questions/12233406/preventing-session-hijacking
 * - https://stackoverflow.com/questions/21354859/trusting-x-forwarded-for-to-identify-a-visitor
 *
 * @param array $server The $_SERVER array
 *
 * @return string An identifier based on client IP address information
 */
function client_ip_id($server)
{
    $identifier = $server['REMOTE_ADDR'];

    foreach (array('HTTP_X_FORWARDED_FOR', 'HTTP_CLIENT_IP') as $header) {
        if (isset($server[$header])) {
            $identifier .= '_' . $server[$header];
        }
    }

    return $identifier;
}
451 | |||
452 | |||
/**
 * Returns true if Shaarli's currently browsed in HTTPS.
 * Supports reverse proxies (if the headers are correctly set): a forwarded
 * port of 443 is considered HTTPS.
 *
 * @param array $server $_SERVER.
 *
 * @return bool true if HTTPS, false otherwise.
 */
function is_https($server)
{
    if (isset($server['HTTP_X_FORWARDED_PORT'])) {
        // Keep forwarded port (first one of a comma-separated list)
        if (strpos($server['HTTP_X_FORWARDED_PORT'], ',') !== false) {
            $ports = explode(',', $server['HTTP_X_FORWARDED_PORT']);
            $port = trim($ports[0]);
        } else {
            $port = $server['HTTP_X_FORWARDED_PORT'];
        }

        if ($port == '443') {
            return true;
        }
    }

    // Some servers (ISAPI with IIS) set HTTPS to "off" for plain HTTP
    // requests, so a non-empty value is not enough.
    return ! empty($server['HTTPS']) && strtolower($server['HTTPS']) !== 'off';
}
diff --git a/application/http/Url.php b/application/http/Url.php new file mode 100644 index 00000000..90444a2f --- /dev/null +++ b/application/http/Url.php | |||
@@ -0,0 +1,217 @@ | |||
1 | <?php | ||
2 | |||
3 | namespace Shaarli\Http; | ||
4 | |||
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /*
     * Prefixes of the query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(

        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /*
     * Prefixes of the fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /*
     * URL parts represented as an array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * A non-empty URL without scheme defaults to the "http" scheme.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * @param string $input url
     *
     * @return string cleaned url (URL-decoded when the prefix was removed)
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut the exact prefix: ltrim() would treat it as a set of
            // characters and also strip the beginning of the wrapped URL.
            return urldecode(substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Returns a string representation of this URL
     *
     * @return string the URL rebuilt from its parts
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Removes undesired query parameters
     *
     * Every parameter starting with one of the $annoyingQueryParams prefixes
     * is dropped; the query part is removed if no parameter is left.
     */
    protected function cleanupQuery()
    {
        if (!isset($this->parts['query'])) {
            return;
        }

        $keptParams = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            $isAnnoying = false;
            foreach (self::$annoyingQueryParams as $annoying) {
                if (startsWith($param, $annoying)) {
                    $isAnnoying = true;
                    break;
                }
            }
            if (!$isAnnoying) {
                $keptParams[] = $param;
            }
        }

        if (count($keptParams) == 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $keptParams);
    }

    /**
     * Removes undesired fragments
     *
     * The fragment is dropped when it starts with one of the
     * $annoyingFragments prefixes.
     */
    protected function cleanupFragment()
    {
        if (!isset($this->parts['fragment'])) {
            return;
        }

        foreach (self::$annoyingFragments as $annoying) {
            if (startsWith($this->parts['fragment'], $annoying)) {
                unset($this->parts['fragment']);
                break;
            }
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts an URL with an International Domain Name host to a ASCII one.
     * This requires PHP-intl. If it's not available, just returns this->cleanup().
     *
     * The stored URL parts keep the original host; only the returned string
     * is converted.
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) {
            return $out;
        }

        $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46);
        if ($asciiHost === false) {
            // Conversion failed: keep the host unchanged rather than erasing it
            return $out;
        }

        // Replace the host component only: the same text may also appear
        // in the path or the query and must be left alone there.
        $asciiParts = $this->parts;
        $asciiParts['host'] = $asciiHost;
        return unparse_url($asciiParts);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the URL is an HTTP(S) one.
     *
     * @return bool true if the scheme is http or https (case-insensitive),
     *              false otherwise or if there is no scheme.
     */
    public function isHttp()
    {
        if (empty($this->parts['scheme'])) {
            return false;
        }
        return in_array(strtolower($this->parts['scheme']), array('http', 'https'), true);
    }
}
diff --git a/application/http/UrlUtils.php b/application/http/UrlUtils.php new file mode 100644 index 00000000..4bc84b82 --- /dev/null +++ b/application/http/UrlUtils.php | |||
@@ -0,0 +1,88 @@ | |||
1 | <?php | ||
/**
 * Rebuilds a URL string from its parse_url() components
 *
 * Components missing from the array are simply left out of the result.
 *
 * Source: http://php.net/manual/en/function.parse-url.php#106731
 *
 * @see http://php.net/manual/en/function.parse-url.php
 *
 * @param array $parsedUrl an array-represented URL
 *
 * @return string the string representation of the URL
 */
function unparse_url($parsedUrl)
{
    $user = isset($parsedUrl['user']) ? $parsedUrl['user'] : '';
    $password = isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '';

    $url = '';
    if (isset($parsedUrl['scheme'])) {
        $url .= $parsedUrl['scheme'] . '://';
    }

    // Credentials: the '@' separator is only written when there is a
    // user or a password to separate from the host
    $url .= $user;
    if ($user || $password) {
        $url .= $password . '@';
    }

    if (isset($parsedUrl['host'])) {
        $url .= $parsedUrl['host'];
    }
    if (isset($parsedUrl['port'])) {
        $url .= ':' . $parsedUrl['port'];
    }
    if (isset($parsedUrl['path'])) {
        $url .= $parsedUrl['path'];
    }
    if (isset($parsedUrl['query'])) {
        $url .= '?' . $parsedUrl['query'];
    }
    if (isset($parsedUrl['fragment'])) {
        $url .= '#' . $parsedUrl['fragment'];
    }

    return $url;
}
27 | |||
/**
 * Strips undesired query parameters and fragments from a URL
 *
 * Thin wrapper around \Shaarli\Http\Url::cleanup().
 *
 * @param string $url URL to be cleaned
 *
 * @return string the string representation of this URL after cleanup
 */
function cleanup_url($url)
{
    return (new \Shaarli\Http\Url($url))->cleanup();
}
40 | |||
/**
 * Get the scheme of a URL.
 *
 * Thin wrapper around \Shaarli\Http\Url::getScheme().
 *
 * @param string $url URL for which the scheme is requested
 *
 * @return mixed the URL scheme or false if none is provided.
 */
function get_url_scheme($url)
{
    return (new \Shaarli\Http\Url($url))->getScheme();
}
53 | |||
/**
 * Makes sure a URL ends with a slash.
 *
 * @param string $url URL to check/edit.
 *
 * @return string $url unchanged if it already ends with '/', otherwise $url followed by '/'.
 */
function add_trailing_slash($url)
{
    if (endsWith($url, '/')) {
        return $url;
    }

    return $url . '/';
}
65 | |||
/**
 * Replace not whitelisted protocols by 'http://' from given URL.
 *
 * URLs starting with '?' or '/' are returned untouched. A URL without any
 * protocol gets 'http://' prepended.
 *
 * @param string $url       URL to clean
 * @param array  $protocols List of allowed protocols (aside from http(s)).
 *
 * @return string URL with allowed protocol
 */
function whitelist_protocols($url, $protocols)
{
    if (startsWith($url, '?') || startsWith($url, '/')) {
        return $url;
    }
    $protocols = array_merge(['http', 'https'], $protocols);
    $protocol = preg_match('#^(\w+):/?/?#', $url, $match);
    // Protocol not allowed: we remove it and replace it with http
    if ($protocol === 1 && ! in_array($match[1], $protocols, true)) {
        // Only the leading protocol is replaced: the same text may legitimately
        // appear later in the URL (e.g. inside a query parameter).
        $url = 'http://' . substr($url, strlen($match[0]));
    } elseif ($protocol !== 1) {
        $url = 'http://' . $url;
    }
    return $url;
}