aboutsummaryrefslogtreecommitdiffhomepage
path: root/src
diff options
context:
space:
mode:
authorKevin Decherf <kevin@kdecherf.com>2018-10-26 11:31:41 +0200
committerGitHub <noreply@github.com>2018-10-26 11:31:41 +0200
commita6e4e83809ab7abf51e5c06de503ef1b900bb219 (patch)
treea3dc33ea43eadc3528ce494dfc05df6b500d1024 /src
parentae4f7dceec030439d3c05cc3ab3223764a62e0f6 (diff)
parent1b220426e2e8139364b4a34678a2843c2e8bccf5 (diff)
downloadwallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.tar.gz
wallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.tar.zst
wallabag-a6e4e83809ab7abf51e5c06de503ef1b900bb219.zip
Merge pull request #3553 from wallabag/url-3529
Swap entry url with origin url if graby provides an updated one
Diffstat (limited to 'src')
-rw-r--r--src/Wallabag/CoreBundle/Helper/ContentProxy.php118
1 files changed, 117 insertions, 1 deletions
diff --git a/src/Wallabag/CoreBundle/Helper/ContentProxy.php b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
index d4ea608f..d38811a2 100644
--- a/src/Wallabag/CoreBundle/Helper/ContentProxy.php
+++ b/src/Wallabag/CoreBundle/Helper/ContentProxy.php
@@ -66,6 +66,13 @@ class ContentProxy
66 // so we'll be able to refetch it in the future 66 // so we'll be able to refetch it in the future
67 $content['url'] = !empty($content['url']) ? $content['url'] : $url; 67 $content['url'] = !empty($content['url']) ? $content['url'] : $url;
68 68
69 // In one case (at least in tests), url is empty here
70 // so we set it using $url provided in the updateEntry call.
71 // Not sure what the other possible cases are where this property is empty
72 if (empty($entry->getUrl()) && !empty($url)) {
73 $entry->setUrl($url);
74 }
75
69 $this->stockEntry($entry, $content); 76 $this->stockEntry($entry, $content);
70 } 77 }
71 78
@@ -239,7 +246,7 @@ class ContentProxy
239 */ 246 */
240 private function stockEntry(Entry $entry, array $content) 247 private function stockEntry(Entry $entry, array $content)
241 { 248 {
242 $entry->setUrl($content['url']); 249 $this->updateOriginUrl($entry, $content['url']);
243 250
244 $this->setEntryDomainName($entry); 251 $this->setEntryDomainName($entry);
245 252
@@ -306,6 +313,115 @@ class ContentProxy
306 } 313 }
307 314
308 /** 315 /**
316 * Update the origin_url field when a redirection occurs.
317 * This field is set only if it is empty and the current entry url does not match the ignore list.
318 *
319 * @param Entry $entry
320 * @param string $url
321 */
private function updateOriginUrl(Entry $entry, $url)
{
    // Reconcile the url stored on the entry with the url reported for the
    // fetched content. Depending on which url components differ, the entry
    // url is left alone, silently replaced, or replaced while the previous
    // value is preserved in origin_url (only when origin_url is still empty).
    //
    // $entry: entry whose url / origin_url may be updated.
    // $url:   url reported for the fetched content.
    //
    // Returns false on the early exits (nothing to compare, or the entry url
    // is on the ignore list); otherwise nothing is returned.
    $entry_url = $entry->getUrl();

    if (empty($url) || $entry_url === $url) {
        return false;
    }

    // Urls on the ignore list are always replaced by the content url without
    // recording an origin url. Checked first so that no diff is computed when
    // it would not be used.
    if ($this->ignoreUrl($entry_url)) {
        $entry->setUrl($url);

        return false;
    }

    $parsed_entry_url = parse_url($entry_url);
    $parsed_content_url = parse_url($url);

    /**
     * The following part computes the list of part changes between two
     * parse_url arrays.
     *
     * As array_diff_assoc only computes changes to go from the left array
     * to the right one, we make two different arrays to have both
     * directions. We merge these two arrays and sort keys before passing
     * the result to the switch.
     *
     * The resulting array gives us all changing parts between the two
     * urls: scheme, host, path, query and/or fragment.
     *
     * parse_url returns false on seriously malformed urls; in that case the
     * change cannot be classified, so $diff_keys is left null and the switch
     * below falls through to its default branch.
     */
    $diff_keys = null;

    if (is_array($parsed_entry_url) && is_array($parsed_content_url)) {
        $diff_keys = array_keys(array_merge(
            array_diff_assoc($parsed_entry_url, $parsed_content_url),
            array_diff_assoc($parsed_content_url, $parsed_entry_url)
        ));
        sort($diff_keys);
    }

    /**
     * This switch case lets us apply different behaviors according to
     * changing parts of urls.
     *
     * As $diff_keys is an array, we provide arrays as cases. ['path'] means
     * 'only the path is different between the two urls' whereas
     * ['fragment', 'query'] means 'only fragment and query string parts are
     * different between the two urls'.
     *
     * Note that values in $diff_keys are sorted.
     */
    switch ($diff_keys) {
        case ['path']:
            // The path component may be absent on one side
            // (e.g. http://host vs http://host/), so default it to an
            // empty string instead of reading a missing array key.
            $entry_path = isset($parsed_entry_url['path']) ? $parsed_entry_url['path'] : '';
            $content_path = isset($parsed_content_url['path']) ? $parsed_content_url['path'] : '';

            if (($entry_path . '/' === $content_path) // diff is trailing slash, we only replace the url of the entry
                || ($url === urldecode($entry_url))) { // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
                $entry->setUrl($url);
            }
            break;
        case ['scheme']:
            // only the scheme changed: replace the url, keep no origin url
            $entry->setUrl($url);
            break;
        case ['fragment']:
            // noop
            break;
        default:
            // any other change: keep the first known url as origin url
            // before swapping in the content url
            if (empty($entry->getOriginUrl())) {
                $entry->setOriginUrl($entry_url);
            }
            $entry->setUrl($url);
            break;
    }
}
388
389 /**
390 * Check entry url against an ignore list to replace with content url.
391 *
392 * XXX: move the ignore list in the database to let users handle it
393 *
394 * @param string $url url to test
395 *
396 * @return bool true if url matches ignore list otherwise false
397 */
private function ignoreUrl($url)
{
    // Tell whether $url belongs to the hard-coded ignore list, either by
    // exact host name or by matching one of the url patterns.
    //
    // $url: url to test.
    //
    // Returns true if the url matches the ignore list, otherwise false.
    // Values that are not a non-empty string never match.
    $ignored_hosts = ['feedproxy.google.com', 'feeds.reuters.com'];
    $ignored_patterns = ['https?://www\.lemonde\.fr/tiny.*'];

    if (!is_string($url) || '' === $url) {
        return false;
    }

    // Ask parse_url for the host only: it yields null when the url has no
    // host and false when the url is malformed, so no array key is read
    // blindly.
    $host = parse_url($url, PHP_URL_HOST);

    // Host names are case-insensitive, so compare in lower case (the
    // patterns below are already matched case-insensitively).
    if (is_string($host) && in_array(strtolower($host), $ignored_hosts, true)) {
        return true;
    }

    foreach ($ignored_patterns as $pattern) {
        // Patterns are delimited with backticks because they contain slashes.
        if (1 === preg_match('`' . $pattern . '`i', $url)) {
            return true;
        }
    }

    return false;
}
423
424 /**
309 * Validate that the given content has at least a title, an html and a url. 425 * Validate that the given content has at least a title, an html and a url.
310 * 426 *
311 * @param array $content 427 * @param array $content