diff options
Diffstat (limited to 'application')
-rw-r--r-- | application/HttpUtils.php | 3 | ||||
-rw-r--r-- | application/Url.php | 218 | ||||
-rw-r--r-- | application/http/Url.php | 217 |
3 files changed, 222 insertions, 216 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 9c438160..51af5d0d 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -1,4 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | |||
3 | use Shaarli\Http\Url; | ||
4 | |||
2 | /** | 5 | /** |
3 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
4 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
diff --git a/application/Url.php b/application/Url.php index 3b7f19c2..81f72fb0 100644 --- a/application/Url.php +++ b/application/Url.php | |||
@@ -34,7 +34,7 @@ function unparse_url($parsedUrl) | |||
34 | */ | 34 | */ |
function cleanup_url($url)
{
    // Wrap the raw string in a Url object and hand back its cleaned-up form.
    return (new \Shaarli\Http\Url($url))->cleanup();
}
40 | 40 | ||
@@ -47,7 +47,7 @@ function cleanup_url($url) | |||
47 | */ | 47 | */ |
function get_url_scheme($url)
{
    // Wrap the raw string in a Url object and report the scheme it exposes.
    return (new \Shaarli\Http\Url($url))->getScheme();
}
53 | 53 | ||
@@ -86,217 +86,3 @@ function whitelist_protocols($url, $protocols) | |||
86 | } | 86 | } |
87 | return $url; | 87 | return $url; |
88 | } | 88 | } |
89 | |||
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * @var string[] Prefixes of the query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(

        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /**
     * @var string[] Prefixes of the fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * URL parts represented as an array
     *
     * @var array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * The string is trimmed and pre-cleaned before being parsed. A non-empty
     * URL without a scheme gets 'http' as its scheme.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs: fall back to
        // an empty array so the array accesses below stay well-defined.
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * When the prefix is found, it is cut off and the remainder is URL-decoded.
     * Relies on the startsWith() helper, which is defined outside this class.
     *
     * @param string $input url
     *
     * @return string cleaned url
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut exactly the prefix. ltrim() would treat it as a set of
            // characters and also strip leading characters of the wrapped URL
            // (e.g. 'ldap://...' would become 'p://...').
            return urldecode((string) substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Returns a string representation of this URL
     *
     * The string is built by unparse_url(), which is defined outside this class.
     *
     * @return string the URL rebuilt from its parts
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Tells whether a query parameter starts with one of the unwanted prefixes.
     *
     * @param string $param a single 'name=value' chunk of the query string
     *
     * @return bool true if the parameter must be dropped
     */
    private static function isAnnoyingQueryParam($param)
    {
        foreach (self::$annoyingQueryParams as $annoying) {
            if (startsWith($param, $annoying)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes undesired query parameters
     *
     * The remaining parameters keep their original order. The query part is
     * dropped altogether when nothing is left.
     */
    protected function cleanupQuery()
    {
        if (!isset($this->parts['query'])) {
            return;
        }

        $kept = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (!self::isAnnoyingQueryParam($param)) {
                $kept[] = $param;
            }
        }

        if (count($kept) === 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $kept);
    }

    /**
     * Removes undesired fragments
     *
     * The whole fragment is dropped as soon as it starts with one of the
     * unwanted prefixes.
     */
    protected function cleanupFragment()
    {
        if (!isset($this->parts['fragment'])) {
            return;
        }

        foreach (self::$annoyingFragments as $annoying) {
            if (startsWith($this->parts['fragment'], $annoying)) {
                unset($this->parts['fragment']);
                break;
            }
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts an URL with an International Domain Name host to a ASCII one.
     * This requires PHP-intl. If it's not available, if the URL has no host,
     * or if the conversion fails, just returns this->cleanup().
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) {
            return $out;
        }

        $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46);
        if ($asciiHost === false) {
            // Conversion failed: substituting false would erase the host from
            // the URL, so keep the cleaned URL untouched instead.
            return $out;
        }

        return str_replace($this->parts['host'], $asciiHost, $out);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the Url is an HTTP one.
     *
     * The check is case-insensitive and looks for 'http' inside the scheme,
     * so 'https' matches as well.
     *
     * @return bool true is HTTP, false otherwise (including when no scheme is set).
     */
    public function isHttp()
    {
        // The scheme is not set when the object was built from an empty string.
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
    }
}
diff --git a/application/http/Url.php b/application/http/Url.php new file mode 100644 index 00000000..260231c6 --- /dev/null +++ b/application/http/Url.php | |||
@@ -0,0 +1,217 @@ | |||
1 | <?php | ||
2 | |||
3 | namespace Shaarli\Http; | ||
4 | |||
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * @var string[] Prefixes of the query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(

        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /**
     * @var string[] Prefixes of the fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * URL parts represented as an array
     *
     * @var array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * The string is trimmed and pre-cleaned before being parsed. A non-empty
     * URL without a scheme gets 'http' as its scheme.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs: fall back to
        // an empty array so the array accesses below stay well-defined.
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * When the prefix is found, it is cut off and the remainder is URL-decoded.
     * Relies on the startsWith() helper, which is defined outside this class.
     *
     * @param string $input url
     *
     * @return string cleaned url
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut exactly the prefix. ltrim() would treat it as a set of
            // characters and also strip leading characters of the wrapped URL
            // (e.g. 'ldap://...' would become 'p://...').
            return urldecode((string) substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Returns a string representation of this URL
     *
     * The string is built by unparse_url(), which is defined outside this class.
     *
     * @return string the URL rebuilt from its parts
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Tells whether a query parameter starts with one of the unwanted prefixes.
     *
     * @param string $param a single 'name=value' chunk of the query string
     *
     * @return bool true if the parameter must be dropped
     */
    private static function isAnnoyingQueryParam($param)
    {
        foreach (self::$annoyingQueryParams as $annoying) {
            if (startsWith($param, $annoying)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes undesired query parameters
     *
     * The remaining parameters keep their original order. The query part is
     * dropped altogether when nothing is left.
     */
    protected function cleanupQuery()
    {
        if (!isset($this->parts['query'])) {
            return;
        }

        $kept = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (!self::isAnnoyingQueryParam($param)) {
                $kept[] = $param;
            }
        }

        if (count($kept) === 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $kept);
    }

    /**
     * Removes undesired fragments
     *
     * The whole fragment is dropped as soon as it starts with one of the
     * unwanted prefixes.
     */
    protected function cleanupFragment()
    {
        if (!isset($this->parts['fragment'])) {
            return;
        }

        foreach (self::$annoyingFragments as $annoying) {
            if (startsWith($this->parts['fragment'], $annoying)) {
                unset($this->parts['fragment']);
                break;
            }
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts an URL with an International Domain Name host to a ASCII one.
     * This requires PHP-intl. If it's not available, if the URL has no host,
     * or if the conversion fails, just returns this->cleanup().
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) {
            return $out;
        }

        $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46);
        if ($asciiHost === false) {
            // Conversion failed: substituting false would erase the host from
            // the URL, so keep the cleaned URL untouched instead.
            return $out;
        }

        return str_replace($this->parts['host'], $asciiHost, $out);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the Url is an HTTP one.
     *
     * The check is case-insensitive and looks for 'http' inside the scheme,
     * so 'https' matches as well.
     *
     * @return bool true is HTTP, false otherwise (including when no scheme is set).
     */
    public function isHttp()
    {
        // The scheme is not set when the object was built from an empty string.
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
    }
}