]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
/**
 * Converts an array-represented URL to a string.
 *
 * This is the inverse of parse_url(): every component present in the array
 * is emitted together with its delimiter, missing components are skipped.
 *
 * Source: http://php.net/manual/en/function.parse-url.php#106731
 *
 * @see http://php.net/manual/en/function.parse-url.php
 *
 * @param array $parsedUrl an array-represented URL, as returned by parse_url()
 *
 * @return string the string representation of the URL
 */
function unparse_url($parsedUrl)
{
    $scheme   = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : '';
    $host     = isset($parsedUrl['host']) ? $parsedUrl['host'] : '';
    $port     = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : '';
    $user     = isset($parsedUrl['user']) ? (string) $parsedUrl['user'] : '';
    $pass     = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass'] : '';
    // Compare against '' instead of relying on truthiness: the string "0" is
    // falsy in PHP, so a user named "0" would otherwise lose its '@' separator.
    $userInfo = ($user !== '' || $pass !== '') ? "$user$pass@" : '';
    $path     = isset($parsedUrl['path']) ? $parsedUrl['path'] : '';
    $query    = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : '';
    $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : '';

    return "$scheme$userInfo$host$port$path$query$fragment";
}
27 | ||
/**
 * Strips undesired query parameters and fragments from a URL.
 *
 * Convenience wrapper: builds a Url object from the string and returns the
 * result of its cleanup() method.
 *
 * @param string $url URL to be cleaned
 *
 * @return string the string representation of this URL after cleanup
 */
function cleanup_url($url)
{
    return (new Url($url))->cleanup();
}
40 | ||
/**
 * Returns the scheme of a URL.
 *
 * Convenience wrapper: builds a Url object from the string and returns the
 * result of its getScheme() method.
 *
 * @param string $url URL for which the scheme is requested
 *
 * @return mixed the URL scheme or false if none is provided.
 */
function get_url_scheme($url)
{
    return (new Url($url))->getScheme();
}
53 | ||
/**
 * Makes sure a URL ends with a slash, appending one only when it is missing.
 *
 * @param string $url URL to check/edit.
 *
 * @return string the URL, terminated by a trailing slash.
 */
function add_trailing_slash($url)
{
    // Nothing to append when the slash is already there
    $suffix = endsWith($url, '/') ? '' : '/';

    return $url . $suffix;
}
65 | ||
/**
 * Replace not whitelisted protocols by 'http://' from given URL.
 *
 * Relative URLs (starting with '?' or '/') are returned untouched.
 * URLs without any protocol get 'http://' prepended.
 *
 * @param string $url       URL to clean
 * @param array  $protocols List of allowed protocols (aside from http(s)).
 *
 * @return string URL with allowed protocol
 */
function whitelist_protocols($url, $protocols)
{
    if (startsWith($url, '?') || startsWith($url, '/')) {
        return $url;
    }

    $protocols = array_merge(['http', 'https'], $protocols);
    $hasProtocol = preg_match('#^(\w+):/?/?#', $url, $match) === 1;

    // No protocol at all: assume http
    if (! $hasProtocol) {
        return 'http://' . $url;
    }

    // Protocol not allowed: we remove it and replace it with http.
    // Strict comparison avoids loose string/int matches in the whitelist.
    if (! in_array($match[1], $protocols, true)) {
        // Only swap the leading protocol: str_replace() would also rewrite
        // any later occurrence of the same text inside the path or query.
        return 'http://' . substr($url, strlen($match[0]));
    }

    return $url;
}
89 | ||
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * @var array Prefixes of query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(
        // Facebook
        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /**
     * @var array Prefixes of fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * URL parts represented as an array
     *
     * @var array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * A non-empty URL without scheme is given the 'http' scheme.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));
        $parts = parse_url($url);
        // parse_url() returns false on seriously malformed URLs: always keep
        // an array so the offsets below are read and written safely.
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * The remainder of the string is URL-decoded.
     *
     * @param string $input url
     *
     * @return string cleaned url
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut the prefix by length: ltrim() treats its second argument as
            // a set of characters and would also eat leading characters of
            // the wrapped URL (e.g. the 'e' of 'example.com').
            return urldecode((string) substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Tells whether a string starts with one of the given prefixes.
     *
     * @param string $value    string to test
     * @param array  $prefixes list of prefixes
     *
     * @return bool true if at least one prefix matches.
     */
    private static function hasAnnoyingPrefix($value, array $prefixes)
    {
        foreach ($prefixes as $prefix) {
            if (startsWith($value, $prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns a string representation of this URL
     *
     * @return string the URL rebuilt from its parts.
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Removes undesired query parameters
     *
     * Parameters starting with one of the annoying prefixes are dropped; the
     * remaining ones keep their order. The query part is removed entirely
     * when no parameter is left.
     */
    protected function cleanupQuery()
    {
        if (! isset($this->parts['query'])) {
            return;
        }

        $keptParams = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (! self::hasAnnoyingPrefix($param, self::$annoyingQueryParams)) {
                $keptParams[] = $param;
            }
        }

        if (count($keptParams) == 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $keptParams);
    }

    /**
     * Removes undesired fragments
     *
     * The fragment is dropped when it starts with one of the annoying prefixes.
     */
    protected function cleanupFragment()
    {
        if (! isset($this->parts['fragment'])) {
            return;
        }

        if (self::hasAnnoyingPrefix($this->parts['fragment'], self::$annoyingFragments)) {
            unset($this->parts['fragment']);
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts an URL with an International Domain Name host to a ASCII one.
     * This requires PHP-intl. If it's not available, just returns this->cleanup().
     *
     * The cleaned up URL is also returned unchanged when there is no host or
     * when the host can not be converted.
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) {
            return $out;
        }

        // The default IDNA 2003 variant is deprecated since PHP 7.2: ask for
        // UTS #46 explicitly whenever the intl build provides it.
        $asciiHost = defined('INTL_IDNA_VARIANT_UTS46')
            ? idn_to_ascii($this->parts['host'], IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46)
            : idn_to_ascii($this->parts['host']);

        // Conversion failed: keep the original host rather than blanking it
        if ($asciiHost === false || $asciiHost === '') {
            return $out;
        }

        // Swap the host component only: a plain str_replace() on the whole
        // URL would also alter the host name where it appears in the path,
        // query or fragment.
        $asciiParts = $this->parts;
        $asciiParts['host'] = $asciiHost;
        return unparse_url($asciiParts);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the Url is an HTTP one.
     *
     * The check is case-insensitive and succeeds when the scheme contains
     * 'http' (e.g. http, https).
     *
     * @return bool true is HTTP, false otherwise (including when no scheme is set).
     */
    public function isHttp()
    {
        // An empty URL has no scheme at all
        if (empty($this->parts['scheme'])) {
            return false;
        }
        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
    }
}