]> git.immae.eu Git - github/shaarli/Shaarli.git/commitdiff
Links: refactor & improve URL cleanup 314/head
authorVirtualTam <virtualtam@flibidi.net>
Thu, 13 Aug 2015 23:14:07 +0000 (01:14 +0200)
committerVirtualTam <virtualtam@flibidi.net>
Sat, 15 Aug 2015 13:58:38 +0000 (15:58 +0200)
Relates to #141
Relates to #133

Modifications
 - move URL cleanup to `application/Url.php`
 - rework the cleanup function
   - fragments: `#stuff`
   - GET parameters: `?var1=val1&var2=val2`
 - add documentation (APIs the params belong to)
 - add test coverage

Reference
 - http://php.net/parse_url
 - http://php.net/manual/en/language.oop5.magic.php#language.oop5.magic.tostring

Signed-off-by: VirtualTam <virtualtam@flibidi.net>
.gitignore
application/Url.php [new file with mode: 0644]
index.php
tests/UrlTest.php [new file with mode: 0644]

index 6fd0ccd80c189ad1f6aec6e412878854ca439fd8..3ffedb31160484d614a949a2e651b65685549a34 100644 (file)
@@ -19,4 +19,5 @@ composer.lock
 # Ignore test data & output
 coverage
 tests/datastore.php
+tests/dummycache/
 phpmd.html
diff --git a/application/Url.php b/application/Url.php
new file mode 100644 (file)
index 0000000..23356f3
--- /dev/null
@@ -0,0 +1,150 @@
+<?php
+/**
+ * Converts an array-represented URL to a string
+ *
+ * Source: http://php.net/manual/en/function.parse-url.php#106731
+ *
+ * @see http://php.net/manual/en/function.parse-url.php
+ *
+ * @param array $parsedUrl an array-represented URL
+ *
+ * @return string the string representation of the URL
+ */
+function unparse_url($parsedUrl)
+{
+    $scheme   = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'].'://' : '';
+    $host     = isset($parsedUrl['host']) ? $parsedUrl['host'] : '';
+    $port     = isset($parsedUrl['port']) ? ':'.$parsedUrl['port'] : '';
+    $user     = isset($parsedUrl['user']) ? $parsedUrl['user'] : '';
+    $pass     = isset($parsedUrl['pass']) ? ':'.$parsedUrl['pass']  : '';
+    $pass     = ($user || $pass) ? "$pass@" : '';
+    $path     = isset($parsedUrl['path']) ? $parsedUrl['path'] : '';
+    $query    = isset($parsedUrl['query']) ? '?'.$parsedUrl['query'] : '';
+    $fragment = isset($parsedUrl['fragment']) ? '#'.$parsedUrl['fragment'] : '';
+
+    return "$scheme$user$pass$host$port$path$query$fragment";
+}
+
+/**
+ * URL representation and cleanup utilities
+ *
+ * Form
+ *   scheme://[username:password@]host[:port][/path][?query][#fragment]
+ *
+ * Examples
+ *   http://username:password@hostname:9090/path?arg1=value1&arg2=value2#anchor
+ *   https://host.name.tld
+ *   https://h2.g2/faq/?vendor=hitchhiker&item=guide&dest=galaxy#answer
+ *
+ * @see http://www.faqs.org/rfcs/rfc3986.html
+ */
+class Url
+{
+    private static $annoyingQueryParams = array(
+        // Facebook
+        'action_object_map=',
+        'action_ref_map=',
+        'action_type_map=',
+        'fb_',
+        'fb=',
+
+        // Scoop.it
+        '__scoop',
+
+        // Google Analytics & FeedProxy
+        'utm_',
+
+        // ATInternet
+        'xtor='
+    );
+
+    private static $annoyingFragments = array(
+        // ATInternet
+        'xtor=RSS-',
+
+        // Misc.
+        'tk.rss_all'
+    );
+
+    /*
+     * URL parts represented as an array
+     *
+     * @see http://php.net/parse_url
+     */
+    protected $parts;
+
+    /**
+     * Parses a string containing a URL
+     *
+     * @param string $url a string containing a URL
+     */
+    public function __construct($url)
+    {
+        $this->parts = parse_url($url);
+    }
+
+    /**
+     * Returns a string representation of this URL
+     */
+    public function __toString()
+    {
+        return unparse_url($this->parts);
+    }
+
+    /**
+     * Removes undesired query parameters
+     */
+    protected function cleanupQuery()
+    {
+        if (! isset($this->parts['query'])) {
+            return;
+        }
+
+        $queryParams = explode('&', $this->parts['query']);
+
+        foreach (self::$annoyingQueryParams as $annoying) {
+            foreach ($queryParams as $param) {
+                if (startsWith($param, $annoying)) {
+                    $queryParams = array_diff($queryParams, array($param));
+                    continue;
+                }
+            }
+        }
+
+        if (count($queryParams) == 0) {
+            unset($this->parts['query']);
+            return;
+        }
+
+        $this->parts['query'] = implode('&', $queryParams);
+    }    
+
+    /**
+     * Removes undesired fragments
+     */
+    protected function cleanupFragment()
+    {
+        if (! isset($this->parts['fragment'])) {
+            return;
+        }
+
+        foreach (self::$annoyingFragments as $annoying) {
+            if (startsWith($this->parts['fragment'], $annoying)) {
+                unset($this->parts['fragment']);
+                break;
+            }
+        }
+    }
+
+    /**
+     * Removes undesired query parameters and fragments
+     *
+     * @return string the string representation of this URL after cleanup
+     */
+    public function cleanup()
+    {
+        $this->cleanupQuery();
+        $this->cleanupFragment();
+        return $this->__toString();
+    }
+}
index 84b8f015ad1ae693ec36f3fbc24112628adf5796..74f95497ef39883de67a0d500e3156ccf34210dc 100755 (executable)
--- a/index.php
+++ b/index.php
@@ -74,6 +74,7 @@ require_once 'application/Cache.php';
 require_once 'application/CachedPage.php';
 require_once 'application/LinkDB.php';
 require_once 'application/TimeZone.php';
+require_once 'application/Url.php';
 require_once 'application/Utils.php';
 require_once 'application/Config.php';
 
@@ -1479,29 +1480,9 @@ function renderPage()
     }
 
     // -------- User want to post a new link: Display link edit form.
-    if (isset($_GET['post']))
-    {
-        $url=$_GET['post'];
-
-        // We remove the annoying parameters added by FeedBurner, GoogleFeedProxy, Facebook...
-        $annoyingpatterns = array('/[\?&]utm_source=[^&]*/',
-            '/[\?&]utm_campaign=[^&]*/',
-            '/[\?&]utm_medium=[^&]*/',
-            '/#xtor=RSS-[^&]*/',
-            '/[\?&]fb_[^&]*/',
-            '/[\?&]__scoop[^&]*/',
-            '/#tk\.rss_all\?/',
-            '/[\?&]action_ref_map=[^&]*/',
-            '/[\?&]action_type_map=[^&]*/',
-            '/[\?&]action_object_map=[^&]*/',
-            '/[\?&]utm_content=[^&]*/',
-            '/[\?&]fb=[^&]*/',
-            '/[\?&]xtor=[^&]*/'
-            );
-        foreach($annoyingpatterns as $pattern)
-        {
-            $url = preg_replace($pattern, "", $url);
-        }
+    if (isset($_GET['post'])) {
+        $url = new Url($_GET['post']);
+        $url->cleanup();
 
         $link_is_new = false;
         $link = $LINKSDB->getLinkFromUrl($url); // Check if URL is not already in database (in this case, we will edit the existing link)
diff --git a/tests/UrlTest.php b/tests/UrlTest.php
new file mode 100644 (file)
index 0000000..a39630f
--- /dev/null
@@ -0,0 +1,154 @@
+<?php
+/**
+ * Url's tests
+ */
+
+require_once 'application/Url.php';
+
+/**
+ * Unitary tests for unparse_url()
+ */
+class UnparseUrlTest extends PHPUnit_Framework_TestCase
+{
+    /**
+     * Thanks for building nothing
+     */
+    public function testUnparseEmptyArray()
+    {
+        $this->assertEquals('', unparse_url(array()));
+    }
+
+    /**
+     * Rebuild a full-featured URL
+     */
+    public function testUnparseFull()
+    {
+        $ref = 'http://username:password@hostname:9090/path'
+              .'?arg1=value1&arg2=value2#anchor';
+        $this->assertEquals($ref, unparse_url(parse_url($ref)));
+    }
+}
+
+/**
+ * Unitary tests for URL utilities
+ */
+class UrlTest extends PHPUnit_Framework_TestCase
+{
+    // base URL for tests
+    protected static $baseUrl = 'http://domain.tld:3000';
+
+    /**
+     * Helper method
+     */
+    private function assertUrlIsCleaned($query='', $fragment='')
+    {
+        $url = new Url(self::$baseUrl.$query.$fragment);
+        $url->cleanup();
+        $this->assertEquals(self::$baseUrl, $url->__toString());
+    }
+
+    /**
+     * Instantiate an empty URL
+     */
+    public function testEmptyConstruct()
+    {
+        $this->assertEquals('', new Url(''));
+    }
+
+    /**
+     * Instantiate a URL
+     */
+    public function testConstruct()
+    {
+        $ref = 'http://username:password@hostname:9090/path'
+              .'?arg1=value1&arg2=value2#anchor';
+        $this->assertEquals($ref, new Url($ref));
+    }
+
+    /**
+     * URL cleanup - nothing to do
+     */
+    public function testNoCleanup()
+    {
+        // URL with no query nor fragment
+        $this->assertUrlIsCleaned();
+
+        // URL with no annoying elements
+        $ref = self::$baseUrl.'?p1=val1&p2=1234#edit';
+        $url = new Url($ref);
+        $this->assertEquals($ref, $url->cleanup());
+    }
+
+    /**
+     * URL cleanup - annoying fragment
+     */
+    public function testCleanupFragment()
+    {
+        $this->assertUrlIsCleaned('', '#tk.rss_all');
+        $this->assertUrlIsCleaned('', '#xtor=RSS-');
+        $this->assertUrlIsCleaned('', '#xtor=RSS-U3ht0tkc4b');
+    }
+
+    /**
+     * URL cleanup - single annoying query parameter
+     */
+    public function testCleanupSingleQueryParam()
+    {
+        $this->assertUrlIsCleaned('?action_object_map=junk');
+        $this->assertUrlIsCleaned('?action_ref_map=Cr4p!');
+        $this->assertUrlIsCleaned('?action_type_map=g4R84g3');
+
+        $this->assertUrlIsCleaned('?fb_stuff=v41u3');
+        $this->assertUrlIsCleaned('?fb=71m3w4573');
+
+        $this->assertUrlIsCleaned('?utm_campaign=zomg');
+        $this->assertUrlIsCleaned('?utm_medium=numnum');
+        $this->assertUrlIsCleaned('?utm_source=c0d3');
+        $this->assertUrlIsCleaned('?utm_term=1n4l');
+
+        $this->assertUrlIsCleaned('?xtor=some-url');
+    }
+
+    /**
+     * URL cleanup - multiple annoying query parameters
+     */
+    public function testCleanupMultipleQueryParams()
+    {
+        $this->assertUrlIsCleaned('?xtor=some-url&fb=som3th1ng');
+        $this->assertUrlIsCleaned(
+            '?fb=stuff&utm_campaign=zomg&utm_medium=numnum&utm_source=c0d3'
+        );
+    }
+
+    /**
+     * URL cleanup - multiple annoying query parameters, annoying fragment
+     */
+    public function testCleanupMultipleQueryParamsAndFragment()
+    {
+        $this->assertUrlIsCleaned('?xtor=some-url&fb=som3th1ng', '#tk.rss_all');
+    }
+
+    /**
+     * Nominal case - the URL contains both useful and annoying parameters
+     */
+    public function testCleanupMixedContent()
+    {
+        // ditch annoying query params and fragment, keep useful params
+        $url = new Url(
+            self::$baseUrl
+            .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#tk.rss_all'
+        );
+        $this->assertEquals(self::$baseUrl.'?my=stuff&is=kept', $url->cleanup());
+
+
+        // ditch annoying query params, keep useful params and fragment
+        $url = new Url(
+            self::$baseUrl
+            .'?fb=zomg&my=stuff&utm_medium=numnum&is=kept#again'
+        );
+        $this->assertEquals(
+            self::$baseUrl.'?my=stuff&is=kept#again',
+            $url->cleanup()
+        );
+    }
+}