diff options
author | VirtualTam <virtualtam@flibidi.net> | 2015-08-16 23:01:54 +0200 |
---|---|---|
committer | VirtualTam <virtualtam@flibidi.net> | 2015-08-16 23:01:54 +0200 |
commit | f8bf8d8e595a9128ebdf00091a648d186ba9a628 (patch) | |
tree | 8a53db5956e81bb8856333caa711ae202b84937e | |
parent | c622d32820685566c7c0228ae9cdc6c26f10fa29 (diff) | |
parent | d9d776af19fd0a191f82525991dafbb56e1bcfcb (diff) | |
download | Shaarli-f8bf8d8e595a9128ebdf00091a648d186ba9a628.tar.gz Shaarli-f8bf8d8e595a9128ebdf00091a648d186ba9a628.tar.zst Shaarli-f8bf8d8e595a9128ebdf00091a648d186ba9a628.zip |
Merge pull request #314 from shaarli/clean-utm_term
clean utm_term url parameter
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | application/Url.php | 150 | ||||
-rwxr-xr-x | index.php | 27 | ||||
-rw-r--r-- | tests/UrlTest.php | 154 |
4 files changed, 309 insertions, 23 deletions
@@ -19,4 +19,5 @@ composer.lock | |||
19 | # Ignore test data & output | 19 | # Ignore test data & output |
20 | coverage | 20 | coverage |
21 | tests/datastore.php | 21 | tests/datastore.php |
22 | tests/dummycache/ | ||
22 | phpmd.html | 23 | phpmd.html |
diff --git a/application/Url.php b/application/Url.php new file mode 100644 index 00000000..23356f39 --- /dev/null +++ b/application/Url.php | |||
@@ -0,0 +1,150 @@ | |||
1 | <?php | ||
/**
 * Converts an array-represented URL to a string
 *
 * This is the inverse of parse_url(): every component present in the array
 * is emitted with its delimiter, missing components are skipped.
 *
 * Source: http://php.net/manual/en/function.parse-url.php#106731
 *
 * @see http://php.net/manual/en/function.parse-url.php
 *
 * @param array $parsedUrl an array-represented URL, as returned by parse_url()
 *
 * @return string the string representation of the URL
 */
function unparse_url($parsedUrl)
{
    $scheme   = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : '';
    $host     = isset($parsedUrl['host']) ? $parsedUrl['host'] : '';
    $port     = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : '';
    $user     = isset($parsedUrl['user']) ? $parsedUrl['user'] : '';
    $pass     = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass'] : '';
    $path     = isset($parsedUrl['path']) ? $parsedUrl['path'] : '';
    $query    = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : '';
    $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : '';

    // Compare against the empty string instead of relying on truthiness:
    // "0" is a valid user name but evaluates to false, which would drop the
    // "@" separator and glue the user name to the host.
    $credentials = ($user !== '' || $pass !== '') ? $user.$pass.'@' : '';

    return $scheme.$credentials.$host.$port.$path.$query.$fragment;
}
27 | |||
/**
 * URL representation and cleanup utilities
 *
 * Form
 *   scheme://[username:password@]host[:port][/path][?query][#fragment]
 *
 * Examples
 *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
 *   https://host.name.tld
 *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
 *
 * @see http://www.faqs.org/rfcs/rfc3986.html
 */
class Url
{
    /**
     * Prefixes of the query parameters removed by cleanupQuery()
     *
     * An entry ending with "=" targets one exact parameter name; the other
     * entries match every parameter whose name starts with the prefix.
     */
    private static $annoyingQueryParams = array(
        // Facebook
        'action_object_map=',
        'action_ref_map=',
        'action_type_map=',
        'fb_',
        'fb=',

        // Scoop.it
        '__scoop',

        // Google Analytics & FeedProxy
        'utm_',

        // ATInternet
        'xtor='
    );

    /**
     * Prefixes of the fragments removed by cleanupFragment()
     */
    private static $annoyingFragments = array(
        // ATInternet
        'xtor=RSS-',

        // Misc.
        'tk.rss_all'
    );

    /**
     * URL parts represented as an array
     *
     * @see http://php.net/parse_url
     */
    protected $parts;

    /**
     * Parses a string containing a URL
     *
     * @param string $url a string containing a URL
     */
    public function __construct($url)
    {
        $parts = parse_url($url);

        // parse_url() returns false on seriously malformed URLs: keep
        // $this->parts an array so every method can rely on its type
        $this->parts = is_array($parts) ? $parts : array();
    }

    /**
     * Returns a string representation of this URL
     *
     * @return string the URL rebuilt from its parts
     */
    public function __toString()
    {
        return unparse_url($this->parts);
    }

    /**
     * Tells whether a string starts with one of the given prefixes
     *
     * The comparison is case-sensitive. It is implemented locally so this
     * file does not depend on helpers declared in other files.
     *
     * @param string $value    the string to check
     * @param array  $prefixes the candidate prefixes
     *
     * @return bool true if $value starts with at least one prefix
     */
    private static function hasPrefix($value, array $prefixes)
    {
        foreach ($prefixes as $prefix) {
            if (strncmp($value, $prefix, strlen($prefix)) === 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes undesired query parameters
     *
     * The remaining parameters keep their original order; the query part is
     * dropped altogether when nothing is left.
     */
    protected function cleanupQuery()
    {
        if (! isset($this->parts['query'])) {
            return;
        }

        $kept = array();
        foreach (explode('&', $this->parts['query']) as $param) {
            if (! self::hasPrefix($param, self::$annoyingQueryParams)) {
                $kept[] = $param;
            }
        }

        if (count($kept) == 0) {
            unset($this->parts['query']);
            return;
        }

        $this->parts['query'] = implode('&', $kept);
    }

    /**
     * Removes undesired fragments
     *
     * The whole fragment is dropped as soon as it starts with one of the
     * annoying prefixes.
     */
    protected function cleanupFragment()
    {
        if (! isset($this->parts['fragment'])) {
            return;
        }

        if (self::hasPrefix($this->parts['fragment'], self::$annoyingFragments)) {
            unset($this->parts['fragment']);
        }
    }

    /**
     * Removes undesired query parameters and fragments
     *
     * @return string the string representation of this URL after cleanup
     */
    public function cleanup()
    {
        $this->cleanupQuery();
        $this->cleanupFragment();
        return $this->__toString();
    }
}
@@ -74,6 +74,7 @@ require_once 'application/Cache.php'; | |||
74 | require_once 'application/CachedPage.php'; | 74 | require_once 'application/CachedPage.php'; |
75 | require_once 'application/LinkDB.php'; | 75 | require_once 'application/LinkDB.php'; |
76 | require_once 'application/TimeZone.php'; | 76 | require_once 'application/TimeZone.php'; |
77 | require_once 'application/Url.php'; | ||
77 | require_once 'application/Utils.php'; | 78 | require_once 'application/Utils.php'; |
78 | require_once 'application/Config.php'; | 79 | require_once 'application/Config.php'; |
79 | 80 | ||
@@ -1479,29 +1480,9 @@ function renderPage() | |||
1479 | } | 1480 | } |
1480 | 1481 | ||
1481 | // -------- User want to post a new link: Display link edit form. | 1482 | // -------- User want to post a new link: Display link edit form. |
1482 | if (isset($_GET['post'])) | 1483 | if (isset($_GET['post'])) { |
1483 | { | 1484 | $url = new Url($_GET['post']); |
1484 | $url=$_GET['post']; | 1485 | $url->cleanup(); |
1485 | |||
1486 | // We remove the annoying parameters added by FeedBurner, GoogleFeedProxy, Facebook... | ||
1487 | $annoyingpatterns = array('/[\?&]utm_source=[^&]*/', | ||
1488 | '/[\?&]utm_campaign=[^&]*/', | ||
1489 | '/[\?&]utm_medium=[^&]*/', | ||
1490 | '/#xtor=RSS-[^&]*/', | ||
1491 | '/[\?&]fb_[^&]*/', | ||
1492 | '/[\?&]__scoop[^&]*/', | ||
1493 | '/#tk\.rss_all\?/', | ||
1494 | '/[\?&]action_ref_map=[^&]*/', | ||
1495 | '/[\?&]action_type_map=[^&]*/', | ||
1496 | '/[\?&]action_object_map=[^&]*/', | ||
1497 | '/[\?&]utm_content=[^&]*/', | ||
1498 | '/[\?&]fb=[^&]*/', | ||
1499 | '/[\?&]xtor=[^&]*/' | ||
1500 | ); | ||
1501 | foreach($annoyingpatterns as $pattern) | ||
1502 | { | ||
1503 | $url = preg_replace($pattern, "", $url); | ||
1504 | } | ||
1505 | 1486 | ||
1506 | $link_is_new = false; | 1487 | $link_is_new = false; |
1507 | $link = $LINKSDB->getLinkFromUrl($url); // Check if URL is not already in database (in this case, we will edit the existing link) | 1488 | $link = $LINKSDB->getLinkFromUrl($url); // Check if URL is not already in database (in this case, we will edit the existing link) |
diff --git a/tests/UrlTest.php b/tests/UrlTest.php new file mode 100644 index 00000000..a39630f1 --- /dev/null +++ b/tests/UrlTest.php | |||
@@ -0,0 +1,154 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Url's tests | ||
4 | */ | ||
5 | |||
6 | require_once 'application/Url.php'; | ||
7 | |||
/**
 * Unit tests for unparse_url()
 */
class UnparseUrlTest extends PHPUnit_Framework_TestCase
{
    /**
     * An array without any URL part is rebuilt as an empty string
     */
    public function testUnparseEmptyArray()
    {
        $noParts = array();
        $rebuilt = unparse_url($noParts);

        $this->assertEquals('', $rebuilt);
    }

    /**
     * A URL using every component survives a parse/unparse round trip
     */
    public function testUnparseFull()
    {
        $original = 'http://username:password@hostname:9090/path';
        $original .= '?arg1=value1&arg2=value2#anchor';

        $rebuilt = unparse_url(parse_url($original));

        $this->assertEquals($original, $rebuilt);
    }
}
31 | |||
/**
 * Unit tests for URL utilities
 */
class UrlTest extends PHPUnit_Framework_TestCase
{
    // base URL for tests
    protected static $baseUrl = 'http://domain.tld:3000';

    /**
     * Asserts that cleaning up the base URL extended with the given query
     * and fragment yields the bare base URL
     *
     * @param string $query    query part, including the leading "?"
     * @param string $fragment fragment part, including the leading "#"
     */
    private function assertUrlIsCleaned($query = '', $fragment = '')
    {
        $url = new Url(self::$baseUrl.$query.$fragment);
        $url->cleanup();
        $this->assertEquals(self::$baseUrl, $url->__toString());
    }

    /**
     * Instantiate an empty URL
     */
    public function testEmptyConstruct()
    {
        // cast explicitly: the assertion is about the string representation
        $this->assertEquals('', (string) new Url(''));
    }

    /**
     * Instantiate a URL
     */
    public function testConstruct()
    {
        $ref = 'http://username:password@hostname:9090/path'
            .'?arg1=value1&arg2=value2#anchor';
        $this->assertEquals($ref, (string) new Url($ref));
    }

    /**
     * URL cleanup - nothing to do
     */
    public function testNoCleanup()
    {
        // URL with no query nor fragment
        $this->assertUrlIsCleaned();

        // URL with no annoying elements
        $ref = self::$baseUrl.'?p1=val1&p2=1234#edit';
        $url = new Url($ref);
        $this->assertEquals($ref, $url->cleanup());
    }

    /**
     * URL cleanup - annoying fragment
     */
    public function testCleanupFragment()
    {
        $this->assertUrlIsCleaned('', '#tk.rss_all');
        $this->assertUrlIsCleaned('', '#xtor=RSS-');
        $this->assertUrlIsCleaned('', '#xtor=RSS-U3ht0tkc4b');
    }

    /**
     * URL cleanup - single annoying query parameter
     */
    public function testCleanupSingleQueryParam()
    {
        $this->assertUrlIsCleaned('?action_object_map=junk');
        $this->assertUrlIsCleaned('?action_ref_map=Cr4p!');
        $this->assertUrlIsCleaned('?action_type_map=g4R84g3');

        $this->assertUrlIsCleaned('?fb_stuff=v41u3');
        $this->assertUrlIsCleaned('?fb=71m3w4573');

        // Scoop.it: prefix match, with or without a suffix in the name
        $this->assertUrlIsCleaned('?__scoop=qwerty');
        $this->assertUrlIsCleaned('?__scoop_post=bcaa0440');

        $this->assertUrlIsCleaned('?utm_campaign=zomg');
        $this->assertUrlIsCleaned('?utm_content=b4nn3r');
        $this->assertUrlIsCleaned('?utm_medium=numnum');
        $this->assertUrlIsCleaned('?utm_source=c0d3');
        $this->assertUrlIsCleaned('?utm_term=1n4l');

        $this->assertUrlIsCleaned('?xtor=some-url');
    }

    /**
     * URL cleanup - multiple annoying query parameters
     */
    public function testCleanupMultipleQueryParams()
    {
        $this->assertUrlIsCleaned('?xtor=some-url&fb=som3th1ng');
        $this->assertUrlIsCleaned(
            '?fb=stuff&utm_campaign=zomg&utm_medium=numnum&utm_source=c0d3'
        );
    }

    /**
     * URL cleanup - multiple annoying query parameters, annoying fragment
     */
    public function testCleanupMultipleQueryParamsAndFragment()
    {
        $this->assertUrlIsCleaned('?xtor=some-url&fb=som3th1ng', '#tk.rss_all');
    }

    /**
     * Nominal case - the URL contains both useful and annoying parameters
     */
    public function testCleanupMixedContent()
    {
        // ditch annoying query params and fragment, keep useful params
        $url = new Url(
            self::$baseUrl
            .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all'
        );
        $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup());

        // ditch annoying query params, keep useful params and fragment
        $url = new Url(
            self::$baseUrl
            .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again'
        );
        $this->assertEquals(
            self::$baseUrl.'?my=stuff&is=kept#again',
            $url->cleanup()
        );
    }
}