diff options
-rw-r--r-- | application/HttpUtils.php | 3 | ||||
-rw-r--r-- | application/Url.php | 218 | ||||
-rw-r--r-- | application/http/Url.php | 217 | ||||
-rw-r--r-- | tests/http/UrlTest.php (renamed from tests/Url/UrlTest.php) | 23 |
4 files changed, 234 insertions, 227 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php index 9c438160..51af5d0d 100644 --- a/application/HttpUtils.php +++ b/application/HttpUtils.php | |||
@@ -1,4 +1,7 @@ | |||
1 | <?php | 1 | <?php |
2 | |||
3 | use Shaarli\Http\Url; | ||
4 | |||
2 | /** | 5 | /** |
3 | * GET an HTTP URL to retrieve its content | 6 | * GET an HTTP URL to retrieve its content |
4 | * Uses the cURL library or a fallback method | 7 | * Uses the cURL library or a fallback method |
diff --git a/application/Url.php b/application/Url.php index 3b7f19c2..81f72fb0 100644 --- a/application/Url.php +++ b/application/Url.php | |||
@@ -34,7 +34,7 @@ function unparse_url($parsedUrl) | |||
34 | */ | 34 | */ |
35 | function cleanup_url($url) | 35 | function cleanup_url($url) |
36 | { | 36 | { |
37 | $obj_url = new Url($url); | 37 | $obj_url = new \Shaarli\Http\Url($url); |
38 | return $obj_url->cleanup(); | 38 | return $obj_url->cleanup(); |
39 | } | 39 | } |
40 | 40 | ||
@@ -47,7 +47,7 @@ function cleanup_url($url) | |||
47 | */ | 47 | */ |
48 | function get_url_scheme($url) | 48 | function get_url_scheme($url) |
49 | { | 49 | { |
50 | $obj_url = new Url($url); | 50 | $obj_url = new \Shaarli\Http\Url($url); |
51 | return $obj_url->getScheme(); | 51 | return $obj_url->getScheme(); |
52 | } | 52 | } |
53 | 53 | ||
@@ -86,217 +86,3 @@ function whitelist_protocols($url, $protocols) | |||
86 | } | 86 | } |
87 | return $url; | 87 | return $url; |
88 | } | 88 | } |
89 | |||
90 | /** | ||
91 | * URL representation and cleanup utilities | ||
92 | * | ||
93 | * Form | ||
94 | * scheme://[username:password@]host[:port][/path][?query][#fragment] | ||
95 | * | ||
96 | * Examples | ||
97 | * http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor | ||
98 | * https://host.name.tld | ||
99 | * https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer | ||
100 | * | ||
101 | * @see http://www.faqs.org/rfcs/rfc3986.html | ||
102 | */ | ||
103 | class Url | ||
104 | { | ||
105 | private static $annoyingQueryParams = array( | ||
106 | |||
107 | 'action_object_map=', | ||
108 | 'action_ref_map=', | ||
109 | 'action_type_map=', | ||
110 | 'fb_', | ||
111 | 'fb=', | ||
112 | 'PHPSESSID=', | ||
113 | |||
114 | // Scoop.it | ||
115 | '__scoop', | ||
116 | |||
117 | // Google Analytics & FeedProxy | ||
118 | 'utm_', | ||
119 | |||
120 | // ATInternet | ||
121 | 'xtor=', | ||
122 | |||
123 | // Other | ||
124 | 'campaign_' | ||
125 | ); | ||
126 | |||
127 | private static $annoyingFragments = array( | ||
128 | // ATInternet | ||
129 | 'xtor=RSS-', | ||
130 | |||
131 | // Misc. | ||
132 | 'tk.rss_all' | ||
133 | ); | ||
134 | |||
135 | /* | ||
136 | * URL parts represented as an array | ||
137 | * | ||
138 | * @see http://php.net/parse_url | ||
139 | */ | ||
140 | protected $parts; | ||
141 | |||
142 | /** | ||
143 | * Parses a string containing a URL | ||
144 | * | ||
145 | * @param string $url a string containing a URL | ||
146 | */ | ||
147 | public function __construct($url) | ||
148 | { | ||
149 | $url = self::cleanupUnparsedUrl(trim($url)); | ||
150 | $this->parts = parse_url($url); | ||
151 | |||
152 | if (!empty($url) && empty($this->parts['scheme'])) { | ||
153 | $this->parts['scheme'] = 'http'; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * Clean up URL before it's parsed. | ||
159 | * ie. handle urlencode, url prefixes, etc. | ||
160 | * | ||
161 | * @param string $url URL to clean. | ||
162 | * | ||
163 | * @return string cleaned URL. | ||
164 | */ | ||
165 | protected static function cleanupUnparsedUrl($url) | ||
166 | { | ||
167 | return self::removeFirefoxAboutReader($url); | ||
168 | } | ||
169 | |||
170 | /** | ||
171 | * Remove Firefox Reader prefix if it's present. | ||
172 | * | ||
173 | * @param string $input url | ||
174 | * | ||
175 | * @return string cleaned url | ||
176 | */ | ||
177 | protected static function removeFirefoxAboutReader($input) | ||
178 | { | ||
179 | $firefoxPrefix = 'about://reader?url='; | ||
180 | if (startsWith($input, $firefoxPrefix)) { | ||
181 | return urldecode(ltrim($input, $firefoxPrefix)); | ||
182 | } | ||
183 | return $input; | ||
184 | } | ||
185 | |||
186 | /** | ||
187 | * Returns a string representation of this URL | ||
188 | */ | ||
189 | public function toString() | ||
190 | { | ||
191 | return unparse_url($this->parts); | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * Removes undesired query parameters | ||
196 | */ | ||
197 | protected function cleanupQuery() | ||
198 | { | ||
199 | if (! isset($this->parts['query'])) { | ||
200 | return; | ||
201 | } | ||
202 | |||
203 | $queryParams = explode('&', $this->parts['query']); | ||
204 | |||
205 | foreach (self::$annoyingQueryParams as $annoying) { | ||
206 | foreach ($queryParams as $param) { | ||
207 | if (startsWith($param, $annoying)) { | ||
208 | $queryParams = array_diff($queryParams, array($param)); | ||
209 | continue; | ||
210 | } | ||
211 | } | ||
212 | } | ||
213 | |||
214 | if (count($queryParams) == 0) { | ||
215 | unset($this->parts['query']); | ||
216 | return; | ||
217 | } | ||
218 | |||
219 | $this->parts['query'] = implode('&', $queryParams); | ||
220 | } | ||
221 | |||
222 | /** | ||
223 | * Removes undesired fragments | ||
224 | */ | ||
225 | protected function cleanupFragment() | ||
226 | { | ||
227 | if (! isset($this->parts['fragment'])) { | ||
228 | return; | ||
229 | } | ||
230 | |||
231 | foreach (self::$annoyingFragments as $annoying) { | ||
232 | if (startsWith($this->parts['fragment'], $annoying)) { | ||
233 | unset($this->parts['fragment']); | ||
234 | break; | ||
235 | } | ||
236 | } | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | * Removes undesired query parameters and fragments | ||
241 | * | ||
242 | * @return string the string representation of this URL after cleanup | ||
243 | */ | ||
244 | public function cleanup() | ||
245 | { | ||
246 | $this->cleanupQuery(); | ||
247 | $this->cleanupFragment(); | ||
248 | return $this->toString(); | ||
249 | } | ||
250 | |||
251 | /** | ||
252 | * Converts an URL with an International Domain Name host to a ASCII one. | ||
253 | * This requires PHP-intl. If it's not available, just returns this->cleanup(). | ||
254 | * | ||
255 | * @return string converted cleaned up URL. | ||
256 | */ | ||
257 | public function idnToAscii() | ||
258 | { | ||
259 | $out = $this->cleanup(); | ||
260 | if (! function_exists('idn_to_ascii') || ! isset($this->parts['host'])) { | ||
261 | return $out; | ||
262 | } | ||
263 | $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46); | ||
264 | return str_replace($this->parts['host'], $asciiHost, $out); | ||
265 | } | ||
266 | |||
267 | /** | ||
268 | * Get URL scheme. | ||
269 | * | ||
270 | * @return string the URL scheme or false if none is provided. | ||
271 | */ | ||
272 | public function getScheme() | ||
273 | { | ||
274 | if (!isset($this->parts['scheme'])) { | ||
275 | return false; | ||
276 | } | ||
277 | return $this->parts['scheme']; | ||
278 | } | ||
279 | |||
280 | /** | ||
281 | * Get URL host. | ||
282 | * | ||
283 | * @return string the URL host or false if none is provided. | ||
284 | */ | ||
285 | public function getHost() | ||
286 | { | ||
287 | if (empty($this->parts['host'])) { | ||
288 | return false; | ||
289 | } | ||
290 | return $this->parts['host']; | ||
291 | } | ||
292 | |||
293 | /** | ||
294 | * Test if the Url is an HTTP one. | ||
295 | * | ||
296 | * @return true is HTTP, false otherwise. | ||
297 | */ | ||
298 | public function isHttp() | ||
299 | { | ||
300 | return strpos(strtolower($this->parts['scheme']), 'http') !== false; | ||
301 | } | ||
302 | } | ||
diff --git a/application/http/Url.php b/application/http/Url.php new file mode 100644 index 00000000..260231c6 --- /dev/null +++ b/application/http/Url.php | |||
@@ -0,0 +1,217 @@ | |||
1 | <?php | ||
2 | |||
3 | namespace Shaarli\Http; | ||
4 | |||
5 | /** | ||
6 | * URL representation and cleanup utilities | ||
7 | * | ||
8 | * Form | ||
9 | * scheme://[username:password@]host[:port][/path][?query][#fragment] | ||
10 | * | ||
11 | * Examples | ||
12 | * http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor | ||
13 | * https://host.name.tld | ||
14 | * https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer | ||
15 | * | ||
16 | * @see http://www.faqs.org/rfcs/rfc3986.html | ||
17 | */ | ||
18 | class Url | ||
19 | { | ||
20 | private static $annoyingQueryParams = array( | ||
21 | |||
22 | 'action_object_map=', | ||
23 | 'action_ref_map=', | ||
24 | 'action_type_map=', | ||
25 | 'fb_', | ||
26 | 'fb=', | ||
27 | 'PHPSESSID=', | ||
28 | |||
29 | // Scoop.it | ||
30 | '__scoop', | ||
31 | |||
32 | // Google Analytics & FeedProxy | ||
33 | 'utm_', | ||
34 | |||
35 | // ATInternet | ||
36 | 'xtor=', | ||
37 | |||
38 | // Other | ||
39 | 'campaign_' | ||
40 | ); | ||
41 | |||
42 | private static $annoyingFragments = array( | ||
43 | // ATInternet | ||
44 | 'xtor=RSS-', | ||
45 | |||
46 | // Misc. | ||
47 | 'tk.rss_all' | ||
48 | ); | ||
49 | |||
50 | /* | ||
51 | * URL parts represented as an array | ||
52 | * | ||
53 | * @see http://php.net/parse_url | ||
54 | */ | ||
55 | protected $parts; | ||
56 | |||
57 | /** | ||
58 | * Parses a string containing a URL | ||
59 | * | ||
60 | * @param string $url a string containing a URL | ||
61 | */ | ||
62 | public function __construct($url) | ||
63 | { | ||
64 | $url = self::cleanupUnparsedUrl(trim($url)); | ||
65 | $this->parts = parse_url($url); | ||
66 | |||
67 | if (!empty($url) && empty($this->parts['scheme'])) { | ||
68 | $this->parts['scheme'] = 'http'; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | /** | ||
73 | * Clean up URL before it's parsed. | ||
74 | * ie. handle urlencode, url prefixes, etc. | ||
75 | * | ||
76 | * @param string $url URL to clean. | ||
77 | * | ||
78 | * @return string cleaned URL. | ||
79 | */ | ||
80 | protected static function cleanupUnparsedUrl($url) | ||
81 | { | ||
82 | return self::removeFirefoxAboutReader($url); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * Remove Firefox Reader prefix if it's present. | ||
87 | * | ||
88 | * @param string $input url | ||
89 | * | ||
90 | * @return string cleaned url | ||
91 | */ | ||
92 | protected static function removeFirefoxAboutReader($input) | ||
93 | { | ||
94 | $firefoxPrefix = 'about://reader?url='; | ||
95 | if (startsWith($input, $firefoxPrefix)) { | ||
96 | return urldecode(ltrim($input, $firefoxPrefix)); | ||
97 | } | ||
98 | return $input; | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * Returns a string representation of this URL | ||
103 | */ | ||
104 | public function toString() | ||
105 | { | ||
106 | return unparse_url($this->parts); | ||
107 | } | ||
108 | |||
109 | /** | ||
110 | * Removes undesired query parameters | ||
111 | */ | ||
112 | protected function cleanupQuery() | ||
113 | { | ||
114 | if (!isset($this->parts['query'])) { | ||
115 | return; | ||
116 | } | ||
117 | |||
118 | $queryParams = explode('&', $this->parts['query']); | ||
119 | |||
120 | foreach (self::$annoyingQueryParams as $annoying) { | ||
121 | foreach ($queryParams as $param) { | ||
122 | if (startsWith($param, $annoying)) { | ||
123 | $queryParams = array_diff($queryParams, array($param)); | ||
124 | continue; | ||
125 | } | ||
126 | } | ||
127 | } | ||
128 | |||
129 | if (count($queryParams) == 0) { | ||
130 | unset($this->parts['query']); | ||
131 | return; | ||
132 | } | ||
133 | |||
134 | $this->parts['query'] = implode('&', $queryParams); | ||
135 | } | ||
136 | |||
137 | /** | ||
138 | * Removes undesired fragments | ||
139 | */ | ||
140 | protected function cleanupFragment() | ||
141 | { | ||
142 | if (!isset($this->parts['fragment'])) { | ||
143 | return; | ||
144 | } | ||
145 | |||
146 | foreach (self::$annoyingFragments as $annoying) { | ||
147 | if (startsWith($this->parts['fragment'], $annoying)) { | ||
148 | unset($this->parts['fragment']); | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | } | ||
153 | |||
154 | /** | ||
155 | * Removes undesired query parameters and fragments | ||
156 | * | ||
157 | * @return string the string representation of this URL after cleanup | ||
158 | */ | ||
159 | public function cleanup() | ||
160 | { | ||
161 | $this->cleanupQuery(); | ||
162 | $this->cleanupFragment(); | ||
163 | return $this->toString(); | ||
164 | } | ||
165 | |||
166 | /** | ||
167 | * Converts an URL with an International Domain Name host to a ASCII one. | ||
168 | * This requires PHP-intl. If it's not available, just returns this->cleanup(). | ||
169 | * | ||
170 | * @return string converted cleaned up URL. | ||
171 | */ | ||
172 | public function idnToAscii() | ||
173 | { | ||
174 | $out = $this->cleanup(); | ||
175 | if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) { | ||
176 | return $out; | ||
177 | } | ||
178 | $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46); | ||
179 | return str_replace($this->parts['host'], $asciiHost, $out); | ||
180 | } | ||
181 | |||
182 | /** | ||
183 | * Get URL scheme. | ||
184 | * | ||
185 | * @return string the URL scheme or false if none is provided. | ||
186 | */ | ||
187 | public function getScheme() | ||
188 | { | ||
189 | if (!isset($this->parts['scheme'])) { | ||
190 | return false; | ||
191 | } | ||
192 | return $this->parts['scheme']; | ||
193 | } | ||
194 | |||
195 | /** | ||
196 | * Get URL host. | ||
197 | * | ||
198 | * @return string the URL host or false if none is provided. | ||
199 | */ | ||
200 | public function getHost() | ||
201 | { | ||
202 | if (empty($this->parts['host'])) { | ||
203 | return false; | ||
204 | } | ||
205 | return $this->parts['host']; | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * Test if the Url is an HTTP one. | ||
210 | * | ||
211 | * @return true is HTTP, false otherwise. | ||
212 | */ | ||
213 | public function isHttp() | ||
214 | { | ||
215 | return strpos(strtolower($this->parts['scheme']), 'http') !== false; | ||
216 | } | ||
217 | } | ||
diff --git a/tests/Url/UrlTest.php b/tests/http/UrlTest.php index db229ce0..011b416d 100644 --- a/tests/Url/UrlTest.php +++ b/tests/http/UrlTest.php | |||
@@ -3,12 +3,13 @@ | |||
3 | * Url's tests | 3 | * Url's tests |
4 | */ | 4 | */ |
5 | 5 | ||
6 | require_once 'application/Url.php'; | 6 | namespace Shaarli\Http; |
7 | |||
7 | 8 | ||
8 | /** | 9 | /** |
9 | * Unitary tests for URL utilities | 10 | * Unitary tests for URL utilities |
10 | */ | 11 | */ |
11 | class UrlTest extends PHPUnit_Framework_TestCase | 12 | class UrlTest extends \PHPUnit\Framework\TestCase |
12 | { | 13 | { |
13 | // base URL for tests | 14 | // base URL for tests |
14 | protected static $baseUrl = 'http://domain.tld:3000'; | 15 | protected static $baseUrl = 'http://domain.tld:3000'; |
@@ -18,7 +19,7 @@ class UrlTest extends PHPUnit_Framework_TestCase | |||
18 | */ | 19 | */ |
19 | private function assertUrlIsCleaned($query = '', $fragment = '') | 20 | private function assertUrlIsCleaned($query = '', $fragment = '') |
20 | { | 21 | { |
21 | $url = new Url(self::$baseUrl.$query.$fragment); | 22 | $url = new Url(self::$baseUrl . $query . $fragment); |
22 | $url->cleanup(); | 23 | $url->cleanup(); |
23 | $this->assertEquals(self::$baseUrl, $url->toString()); | 24 | $this->assertEquals(self::$baseUrl, $url->toString()); |
24 | } | 25 | } |
@@ -38,7 +39,7 @@ class UrlTest extends PHPUnit_Framework_TestCase | |||
38 | public function testConstruct() | 39 | public function testConstruct() |
39 | { | 40 | { |
40 | $ref = 'http://username:password@hostname:9090/path' | 41 | $ref = 'http://username:password@hostname:9090/path' |
41 | .'?arg1=value1&arg2=value2#anchor'; | 42 | . '?arg1=value1&arg2=value2#anchor'; |
42 | $url = new Url($ref); | 43 | $url = new Url($ref); |
43 | $this->assertEquals($ref, $url->toString()); | 44 | $this->assertEquals($ref, $url->toString()); |
44 | } | 45 | } |
@@ -52,7 +53,7 @@ class UrlTest extends PHPUnit_Framework_TestCase | |||
52 | $this->assertUrlIsCleaned(); | 53 | $this->assertUrlIsCleaned(); |
53 | 54 | ||
54 | // URL with no annoying elements | 55 | // URL with no annoying elements |
55 | $ref = self::$baseUrl.'?p1=val1&p2=1234#edit'; | 56 | $ref = self::$baseUrl . '?p1=val1&p2=1234#edit'; |
56 | $url = new Url($ref); | 57 | $url = new Url($ref); |
57 | $this->assertEquals($ref, $url->cleanup()); | 58 | $this->assertEquals($ref, $url->cleanup()); |
58 | } | 59 | } |
@@ -115,26 +116,26 @@ class UrlTest extends PHPUnit_Framework_TestCase | |||
115 | // ditch annoying query params and fragment, keep useful params | 116 | // ditch annoying query params and fragment, keep useful params |
116 | $url = new Url( | 117 | $url = new Url( |
117 | self::$baseUrl | 118 | self::$baseUrl |
118 | .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all' | 119 | . '?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all' |
119 | ); | 120 | ); |
120 | $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup()); | 121 | $this->assertEquals(self::$baseUrl . '?my=stuff&is=kept', $url->cleanup()); |
121 | 122 | ||
122 | 123 | ||
123 | // ditch annoying query params, keep useful params and fragment | 124 | // ditch annoying query params, keep useful params and fragment |
124 | $url = new Url( | 125 | $url = new Url( |
125 | self::$baseUrl | 126 | self::$baseUrl |
126 | .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again' | 127 | . '?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again' |
127 | ); | 128 | ); |
128 | $this->assertEquals( | 129 | $this->assertEquals( |
129 | self::$baseUrl.'?my=stuff&is=kept#again', | 130 | self::$baseUrl . '?my=stuff&is=kept#again', |
130 | $url->cleanup() | 131 | $url->cleanup() |
131 | ); | 132 | ); |
132 | 133 | ||
133 | // test firefox reader url | 134 | // test firefox reader url |
134 | $url = new Url( | 135 | $url = new Url( |
135 | 'about://reader?url=' . urlencode(self::$baseUrl .'?my=stuff&is=kept') | 136 | 'about://reader?url=' . urlencode(self::$baseUrl . '?my=stuff&is=kept') |
136 | ); | 137 | ); |
137 | $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup()); | 138 | $this->assertEquals(self::$baseUrl . '?my=stuff&is=kept', $url->cleanup()); |
138 | } | 139 | } |
139 | 140 | ||
140 | /** | 141 | /** |