Merge pull request #1540 from ArthurHoaro/fix/metadata-regexes

[github/shaarli/Shaarli.git] / application / bookmark / LinkUtils.php
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php

index 68914fcab749a19b1ba15193decd99247158375a..faf5dbfd4fe24906bf980d8f4cc72e0472b7e008 100644 (file)
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -26,7 +26,7 @@ function html_extract_title($html)
   */
  function header_extract_charset($header)
  {
-    preg_match('/charset="?([^; ]+)/i', $header, $match);
+    preg_match('/charset=["\']?([^; "\']+)/i', $header, $match);
      if (! empty($match[1])) {
          return strtolower(trim($match[1]));
      }
@@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
  {
      $propertiesKey = ['property', 'name', 'itemprop'];
      $properties = implode('|', $propertiesKey);
+    // We need an OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
+    $orCondition  = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
      // Try to retrieve the requested OpenGraph tag.
-    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
+    $ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
      // If the attributes are not in the order property => content (e.g. Github)
      // New regex to keep this readable... more or less.
-    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
+    $ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';
  
      if (preg_match($ogRegex, $html, $matches) > 0
          || preg_match($ogRegexReverse, $html, $matches) > 0