aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--application/HttpUtils.php3
-rw-r--r--application/Url.php218
-rw-r--r--application/http/Url.php217
-rw-r--r--tests/http/UrlTest.php (renamed from tests/Url/UrlTest.php)23
4 files changed, 234 insertions, 227 deletions
diff --git a/application/HttpUtils.php b/application/HttpUtils.php
index 9c438160..51af5d0d 100644
--- a/application/HttpUtils.php
+++ b/application/HttpUtils.php
@@ -1,4 +1,7 @@
1<?php 1<?php
2
3use Shaarli\Http\Url;
4
2/** 5/**
3 * GET an HTTP URL to retrieve its content 6 * GET an HTTP URL to retrieve its content
4 * Uses the cURL library or a fallback method 7 * Uses the cURL library or a fallback method
diff --git a/application/Url.php b/application/Url.php
index 3b7f19c2..81f72fb0 100644
--- a/application/Url.php
+++ b/application/Url.php
@@ -34,7 +34,7 @@ function unparse_url($parsedUrl)
34 */ 34 */
function cleanup_url($url)
{
    // Delegate to the namespaced Url object, which drops the undesired
    // query parameters and fragments and returns the resulting string.
    return (new \Shaarli\Http\Url($url))->cleanup();
}
40 40
@@ -47,7 +47,7 @@ function cleanup_url($url)
47 */ 47 */
function get_url_scheme($url)
{
    // Parse the URL with the namespaced Url object and expose its scheme
    // (false when the URL carries none).
    $parsedUrl = new \Shaarli\Http\Url($url);
    $scheme = $parsedUrl->getScheme();

    return $scheme;
}
53 53
@@ -86,217 +86,3 @@ function whitelist_protocols($url, $protocols)
86 } 86 }
87 return $url; 87 return $url;
88} 88}
89
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * @var string[] Prefixes of query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(
        // Facebook
        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /**
     * @var string[] Prefixes of fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * @var array URL parts represented as an array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * The 'http' scheme is assumed when a non-empty URL does not provide one.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));

        // parse_url() returns false on seriously malformed URLs:
        // always keep an array so the accessors below stay safe.
        $parts = parse_url($url);
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * @param string $input url
     *
     * @return string cleaned url (URL-decoded when the prefix was removed)
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut the prefix by length. ltrim() must not be used here: its
            // second argument is a character mask, so it would also strip
            // leading characters of the wrapped URL ('dav://' -> 'v://').
            return urldecode((string) substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Tells whether a string starts with at least one of the given prefixes.
     *
     * @param string   $value    string to test
     * @param string[] $prefixes candidate prefixes
     *
     * @return bool true if one of the prefixes matches
     */
    private static function startsWithAny($value, array $prefixes)
    {
        foreach ($prefixes as $prefix) {
            if (startsWith($value, $prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns a string representation of this URL
     *
     * @return string the URL rebuilt from its parts
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Removes undesired query parameters
     *
     * The remaining parameters keep their original order; the query part is
     * dropped altogether when nothing is left.
     */
    protected function cleanupQuery()
    {
        if (!isset($this->parts['query'])) {
            return;
        }

        $keptParams = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (!self::startsWithAny($param, self::$annoyingQueryParams)) {
                $keptParams[] = $param;
            }
        }

        if (count($keptParams) === 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $keptParams);
    }

    /**
     * Removes undesired fragments
     */
    protected function cleanupFragment()
    {
        if (!isset($this->parts['fragment'])) {
            return;
        }

        if (self::startsWithAny($this->parts['fragment'], self::$annoyingFragments)) {
            unset($this->parts['fragment']);
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts a URL with an International Domain Name host to an ASCII one.
     * This requires PHP-intl. If it's not available, or if the host can not
     * be converted, just returns this->cleanup().
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) {
            return $out;
        }

        $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46);
        if ($asciiHost === false) {
            // Conversion failed: keep the original host rather than blanking it.
            return $out;
        }

        // Rebuild the URL from a copy of the parts so that only the host
        // component changes, not every occurrence of the host string.
        $asciiParts = $this->parts;
        $asciiParts['host'] = $asciiHost;
        return unparse_url($asciiParts);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the Url is an HTTP one.
     *
     * @return bool true if the scheme contains 'http' (case-insensitive),
     *              false otherwise or when there is no scheme.
     */
    public function isHttp()
    {
        // An empty URL has no scheme at all: answer false without reading
        // an undefined array key.
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
    }
}
diff --git a/application/http/Url.php b/application/http/Url.php
new file mode 100644
index 00000000..260231c6
--- /dev/null
+++ b/application/http/Url.php
@@ -0,0 +1,217 @@
1<?php
2
3namespace Shaarli\Http;
4
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * @var string[] Prefixes of query parameters removed by cleanupQuery()
     */
    private static $annoyingQueryParams = array(
        // Facebook
        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',
        'PHPSESSID=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor=',

        // Other
        'campaign_'
    );

    /**
     * @var string[] Prefixes of fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * @var array URL parts represented as an array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * The 'http' scheme is assumed when a non-empty URL does not provide one.
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $url = self::cleanupUnparsedUrl(trim($url));

        // parse_url() returns false on seriously malformed URLs:
        // always keep an array so the accessors below stay safe.
        $parts = parse_url($url);
        $this->parts = is_array($parts) ? $parts : array();

        if (!empty($url) && empty($this->parts['scheme'])) {
            $this->parts['scheme'] = 'http';
        }
    }

    /**
     * Clean up URL before it's parsed.
     * ie. handle urlencode, url prefixes, etc.
     *
     * @param string $url URL to clean.
     *
     * @return string cleaned URL.
     */
    protected static function cleanupUnparsedUrl($url)
    {
        return self::removeFirefoxAboutReader($url);
    }

    /**
     * Remove Firefox Reader prefix if it's present.
     *
     * @param string $input url
     *
     * @return string cleaned url (URL-decoded when the prefix was removed)
     */
    protected static function removeFirefoxAboutReader($input)
    {
        $firefoxPrefix = 'about://reader?url=';
        if (startsWith($input, $firefoxPrefix)) {
            // Cut the prefix by length. ltrim() must not be used here: its
            // second argument is a character mask, so it would also strip
            // leading characters of the wrapped URL ('dav://' -> 'v://').
            return urldecode((string) substr($input, strlen($firefoxPrefix)));
        }
        return $input;
    }

    /**
     * Tells whether a string starts with at least one of the given prefixes.
     *
     * @param string   $value    string to test
     * @param string[] $prefixes candidate prefixes
     *
     * @return bool true if one of the prefixes matches
     */
    private static function startsWithAny($value, array $prefixes)
    {
        foreach ($prefixes as $prefix) {
            if (startsWith($value, $prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns a string representation of this URL
     *
     * @return string the URL rebuilt from its parts
     */
    public function toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Removes undesired query parameters
     *
     * The remaining parameters keep their original order; the query part is
     * dropped altogether when nothing is left.
     */
    protected function cleanupQuery()
    {
        if (!isset($this->parts['query'])) {
            return;
        }

        $keptParams = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (!self::startsWithAny($param, self::$annoyingQueryParams)) {
                $keptParams[] = $param;
            }
        }

        if (count($keptParams) === 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $keptParams);
    }

    /**
     * Removes undesired fragments
     */
    protected function cleanupFragment()
    {
        if (!isset($this->parts['fragment'])) {
            return;
        }

        if (self::startsWithAny($this->parts['fragment'], self::$annoyingFragments)) {
            unset($this->parts['fragment']);
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->toString();
    }

    /**
     * Converts a URL with an International Domain Name host to an ASCII one.
     * This requires PHP-intl. If it's not available, or if the host can not
     * be converted, just returns this->cleanup().
     *
     * @return string converted cleaned up URL.
     */
    public function idnToAscii()
    {
        $out = $this->cleanup();
        if (!function_exists('idn_to_ascii') || !isset($this->parts['host'])) {
            return $out;
        }

        $asciiHost = idn_to_ascii($this->parts['host'], 0, INTL_IDNA_VARIANT_UTS46);
        if ($asciiHost === false) {
            // Conversion failed: keep the original host rather than blanking it.
            return $out;
        }

        // Rebuild the URL from a copy of the parts so that only the host
        // component changes, not every occurrence of the host string.
        $asciiParts = $this->parts;
        $asciiParts['host'] = $asciiHost;
        return unparse_url($asciiParts);
    }

    /**
     * Get URL scheme.
     *
     * @return string|bool the URL scheme or false if none is provided.
     */
    public function getScheme()
    {
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return $this->parts['scheme'];
    }

    /**
     * Get URL host.
     *
     * @return string|bool the URL host or false if none is provided.
     */
    public function getHost()
    {
        if (empty($this->parts['host'])) {
            return false;
        }
        return $this->parts['host'];
    }

    /**
     * Test if the Url is an HTTP one.
     *
     * @return bool true if the scheme contains 'http' (case-insensitive),
     *              false otherwise or when there is no scheme.
     */
    public function isHttp()
    {
        // An empty URL has no scheme at all: answer false without reading
        // an undefined array key.
        if (!isset($this->parts['scheme'])) {
            return false;
        }
        return strpos(strtolower($this->parts['scheme']), 'http') !== false;
    }
}
diff --git a/tests/Url/UrlTest.php b/tests/http/UrlTest.php
index db229ce0..011b416d 100644
--- a/tests/Url/UrlTest.php
+++ b/tests/http/UrlTest.php
@@ -3,12 +3,13 @@
3 * Url's tests 3 * Url's tests
4 */ 4 */
5 5
6require_once 'application/Url.php'; 6namespace Shaarli\Http;
7
7 8
8/** 9/**
9 * Unitary tests for URL utilities 10 * Unitary tests for URL utilities
10 */ 11 */
11class UrlTest extends PHPUnit_Framework_TestCase 12class UrlTest extends \PHPUnit\Framework\TestCase
12{ 13{
13 // base URL for tests 14 // base URL for tests
14 protected static $baseUrl = 'http://domain.tld:3000'; 15 protected static $baseUrl = 'http://domain.tld:3000';
@@ -18,7 +19,7 @@ class UrlTest extends PHPUnit_Framework_TestCase
18 */ 19 */
private function assertUrlIsCleaned($query = '', $fragment = '')
{
    // Build the URL under test from the shared base plus the given suffixes
    $dirtyUrl = self::$baseUrl . $query . $fragment;
    $subject = new Url($dirtyUrl);
    $subject->cleanup();

    // Once cleaned, only the base URL must remain
    $this->assertEquals(self::$baseUrl, $subject->toString());
}
@@ -38,7 +39,7 @@ class UrlTest extends PHPUnit_Framework_TestCase
public function testConstruct()
{
    // A URL using every component must survive a parse/unparse round trip
    $expected = 'http://username:password@hostname:9090/path'
        . '?arg1=value1&arg2=value2#anchor';
    $parsed = new Url($expected);
    $actual = $parsed->toString();

    $this->assertEquals($expected, $actual);
}
@@ -52,7 +53,7 @@ class UrlTest extends PHPUnit_Framework_TestCase
52 $this->assertUrlIsCleaned(); 53 $this->assertUrlIsCleaned();
53 54
54 // URL with no annoying elements 55 // URL with no annoying elements
55 $ref = self::$baseUrl.'?p1=val1&p2=1234#edit'; 56 $ref = self::$baseUrl . '?p1=val1&p2=1234#edit';
56 $url = new Url($ref); 57 $url = new Url($ref);
57 $this->assertEquals($ref, $url->cleanup()); 58 $this->assertEquals($ref, $url->cleanup());
58 } 59 }
@@ -115,26 +116,26 @@ class UrlTest extends PHPUnit_Framework_TestCase
115 // ditch annoying query params and fragment, keep useful params 116 // ditch annoying query params and fragment, keep useful params
116 $url = new Url( 117 $url = new Url(
117 self::$baseUrl 118 self::$baseUrl
118 .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all' 119 . '?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all'
119 ); 120 );
120 $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup()); 121 $this->assertEquals(self::$baseUrl . '?my=stuff&is=kept', $url->cleanup());
121 122
122 123
123 // ditch annoying query params, keep useful params and fragment 124 // ditch annoying query params, keep useful params and fragment
124 $url = new Url( 125 $url = new Url(
125 self::$baseUrl 126 self::$baseUrl
126 .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again' 127 . '?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again'
127 ); 128 );
128 $this->assertEquals( 129 $this->assertEquals(
129 self::$baseUrl.'?my=stuff&is=kept#again', 130 self::$baseUrl . '?my=stuff&is=kept#again',
130 $url->cleanup() 131 $url->cleanup()
131 ); 132 );
132 133
133 // test firefox reader url 134 // test firefox reader url
134 $url = new Url( 135 $url = new Url(
135 'about://reader?url=' . urlencode(self::$baseUrl .'?my=stuff&is=kept') 136 'about://reader?url=' . urlencode(self::$baseUrl . '?my=stuff&is=kept')
136 ); 137 );
137 $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup()); 138 $this->assertEquals(self::$baseUrl . '?my=stuff&is=kept', $url->cleanup());
138 } 139 }
139 140
140 /** 141 /**