aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorKevin Decherf <kevin@kdecherf.com>2018-10-26 11:31:41 +0200
committerGitHub <noreply@github.com>2018-10-26 11:31:41 +0200
commita6e4e83809ab7abf51e5c06de503ef1b900bb219 (patch)
treea3dc33ea43eadc3528ce494dfc05df6b500d1024
parentae4f7dceec030439d3c05cc3ab3223764a62e0f6 (diff)
parent1b220426e2e8139364b4a34678a2843c2e8bccf5 (diff)
downloadwallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.tar.gz
wallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.tar.zst
wallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.zip
Merge pull request #3553 from wallabag/url-3529
Swap entry url with origin url if graby provides an updated one
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php118
-rw-r--r--tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php127
2 files changed, 244 insertions, 1 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index d4ea608f..d38811a2 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -66,6 +66,13 @@ class ContentProxy
66 // so we'll be able to refetch it in the future 66 // so we'll be able to refetch it in the future
67 $content['url'] = !empty($content['url']) ? $content['url'] : $url; 67 $content['url'] = !empty($content['url']) ? $content['url'] : $url;
68 68
69 // In one case (at least in tests), url is empty here
70 // so we set it using $url provided in the updateEntry call.
71 // Not sure what are the other possible cases where this property is empty
72 if (empty($entry->getUrl()) && !empty($url)) {
73 $entry->setUrl($url);
74 }
75
69 $this->stockEntry($entry, $content); 76 $this->stockEntry($entry, $content);
70 } 77 }
71 78
@@ -239,7 +246,7 @@ class ContentProxy
239 */ 246 */
240 private function stockEntry(Entry $entry, array $content) 247 private function stockEntry(Entry $entry, array $content)
241 { 248 {
242 $entry->setUrl($content['url']); 249 $this->updateOriginUrl($entry, $content['url']);
243 250
244 $this->setEntryDomainName($entry); 251 $this->setEntryDomainName($entry);
245 252
@@ -306,6 +313,115 @@ class ContentProxy
306 } 313 }
307 314
308 /** 315 /**
316 * Update the origin_url field when a redirection occurs
317 * This field is set if it is empty and new url does not match ignore list.
318 *
319 * @param Entry $entry
320 * @param string $url
321 */
322 private function updateOriginUrl(Entry $entry, $url)
323 {
324 if (empty($url) || $entry->getUrl() === $url) {
325 return false;
326 }
327
328 $parsed_entry_url = parse_url($entry->getUrl());
329 $parsed_content_url = parse_url($url);
330
331 /**
332 * The following part computes the list of part changes between two
333 * parse_url arrays.
334 *
335 * As array_diff_assoc only computes changes to go from the left array
336 * to the right one, we make two differents arrays to have both
337 * directions. We merge these two arrays and sort keys before passing
338 * the result to the switch.
339 *
340 * The resulting array gives us all changing parts between the two
341 * urls: scheme, host, path, query and/or fragment.
342 */
343 $diff_ec = array_diff_assoc($parsed_entry_url, $parsed_content_url);
344 $diff_ce = array_diff_assoc($parsed_content_url, $parsed_entry_url);
345
346 $diff = array_merge($diff_ec, $diff_ce);
347 $diff_keys = array_keys($diff);
348 sort($diff_keys);
349
350 if ($this->ignoreUrl($entry->getUrl())) {
351 $entry->setUrl($url);
352
353 return false;
354 }
355
356 /**
357 * This switch case lets us apply different behaviors according to
358 * changing parts of urls.
359 *
360 * As $diff_keys is an array, we provide arrays as cases. ['path'] means
361 * 'only the path is different between the two urls' whereas
362 * ['fragment', 'query'] means 'only fragment and query string parts are
363 * different between the two urls'.
364 *
365 * Note that values in $diff_keys are sorted.
366 */
367 switch ($diff_keys) {
368 case ['path']:
369 if (($parsed_entry_url['path'] . '/' === $parsed_content_url['path']) // diff is trailing slash, we only replace the url of the entry
370 || ($url === urldecode($entry->getUrl()))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
371 $entry->setUrl($url);
372 }
373 break;
374 case ['scheme']:
375 $entry->setUrl($url);
376 break;
377 case ['fragment']:
378 // noop
379 break;
380 default:
381 if (empty($entry->getOriginUrl())) {
382 $entry->setOriginUrl($entry->getUrl());
383 }
384 $entry->setUrl($url);
385 break;
386 }
387 }
388
389 /**
390 * Check entry url against an ignore list to replace with content url.
391 *
392 * XXX: move the ignore list in the database to let users handle it
393 *
394 * @param string $url url to test
395 *
396 * @return bool true if url matches ignore list otherwise false
397 */
398 private function ignoreUrl($url)
399 {
400 $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
401 $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];
402
403 $parsed_url = parse_url($url);
404
405 $filtered = array_filter($ignored_hosts, function ($var) use ($parsed_url) {
406 return $var === $parsed_url['host'];
407 });
408
409 if ([] !== $filtered) {
410 return true;
411 }
412
413 $filtered = array_filter($ignored_patterns, function ($var) use ($url) {
414 return preg_match("`$var`i", $url);
415 });
416
417 if ([] !== $filtered) {
418 return true;
419 }
420
421 return false;
422 }
423
424 /**
309 * Validate that the given content has at least a title, an html and a url. 425 * Validate that the given content has at least a title, an html and a url.
310 * 426 *
311 * @param array $content 427 * @param array $content
diff --git a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
index 3f3c60d0..3dd9273c 100644
--- a/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
+++ b/tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
@@ -740,6 +740,133 @@ class ContentProxyTest extends TestCase
740 } 740 }
741 741
742 /** 742 /**
743 * Data provider for testWithChangedUrl.
744 *
745 * Arrays contain the following values:
746 * $entry_url
747 * $origin_url
748 * $content_url
749 * $expected_entry_url
750 * $expected_origin_url
751 * $expected_domain
752 */
753 public function dataForChangedUrl()
754 {
755 return [
756 'normal' => [
757 'http://0.0.0.0',
758 null,
759 'http://1.1.1.1',
760 'http://1.1.1.1',
761 'http://0.0.0.0',
762 '1.1.1.1',
763 ],
764 'origin already set' => [
765 'http://0.0.0.0',
766 'http://hello',
767 'http://1.1.1.1',
768 'http://1.1.1.1',
769 'http://hello',
770 '1.1.1.1',
771 ],
772 'trailing slash' => [
773 'https://example.com/hello-world',
774 null,
775 'https://example.com/hello-world/',
776 'https://example.com/hello-world/',
777 null,
778 'example.com',
779 ],
780 'query string in fetched content' => [
781 'https://example.org/hello',
782 null,
783 'https://example.org/hello?world=1',
784 'https://example.org/hello?world=1',
785 'https://example.org/hello',
786 'example.org',
787 ],
788 'fragment in fetched content' => [
789 'https://example.org/hello',
790 null,
791 'https://example.org/hello#world',
792 'https://example.org/hello',
793 null,
794 'example.org',
795 ],
796 'fragment and query string in fetched content' => [
797 'https://example.org/hello',
798 null,
799 'https://example.org/hello?foo#world',
800 'https://example.org/hello?foo#world',
801 'https://example.org/hello',
802 'example.org',
803 ],
804 'different path and query string in fetch content' => [
805 'https://example.org/hello',
806 null,
807 'https://example.org/world?foo',
808 'https://example.org/world?foo',
809 'https://example.org/hello',
810 'example.org',
811 ],
812 'feedproxy ignore list test' => [
813 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
814 null,
815 'https://example.org/hello-wallabag',
816 'https://example.org/hello-wallabag',
817 null,
818 'example.org',
819 ],
820 'feedproxy ignore list test with origin url already set' => [
821 'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
822 'https://example.org/this-is-source',
823 'https://example.org/hello-wallabag',
824 'https://example.org/hello-wallabag',
825 'https://example.org/this-is-source',
826 'example.org',
827 ],
828 'lemonde ignore pattern test' => [
829 'http://www.lemonde.fr/tiny/url',
830 null,
831 'http://example.com/hello-world',
832 'http://example.com/hello-world',
833 null,
834 'example.com',
835 ],
836 ];
837 }
838
839 /**
840 * @dataProvider dataForChangedUrl
841 */
842 public function testWithChangedUrl($entry_url, $origin_url, $content_url, $expected_entry_url, $expected_origin_url, $expected_domain)
843 {
844 $tagger = $this->getTaggerMock();
845 $tagger->expects($this->once())
846 ->method('tag');
847
848 $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage, true);
849 $entry = new Entry(new User());
850 $entry->setOriginUrl($origin_url);
851 $proxy->updateEntry(
852 $entry,
853 $entry_url,
854 [
855 'html' => false,
856 'title' => '',
857 'url' => $content_url,
858 'content_type' => '',
859 'language' => '',
860 ],
861 true
862 );
863
864 $this->assertSame($expected_entry_url, $entry->getUrl());
865 $this->assertSame($expected_domain, $entry->getDomainName());
866 $this->assertSame($expected_origin_url, $entry->getOriginUrl());
867 }
868
869 /**
743 * https://stackoverflow.com/a/18506801. 870 * https://stackoverflow.com/a/18506801.
744 * 871 *
745 * @param $string 872 * @param $string