From 6a4872520cbbc012b5a8358cd50c78844afe8d07 Mon Sep 17 00:00:00 2001
From: ArthurHoaro <arthur@hoa.ro>
Date: Sat, 8 Jun 2019 13:59:19 +0200
Subject: Automatically retrieve description for new bookmarks

If the option is enabled, it will try to find a meta tag containing
the page description and keywords, just like we do for the page title.
It will look for either regular meta tags or OpenGraph ones.

The option is disabled by default.

Note that keywords meta tags are mostly not used.

In `configure` template, the variable associated with this setting
is `$retrieve_description`.

Fixes #1302
---
 application/bookmark/LinkUtils.php | 85 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 4 deletions(-)

(limited to 'application/bookmark')

diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 35a5b290..77eb2d95 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
  *
  * @param string $charset     to extract from the downloaded page (reference)
  * @param string $title       to extract from the downloaded page (reference)
+ * @param string $description to extract from the downloaded page (reference)
+ * @param string $keywords    to extract from the downloaded page (reference)
+ * @param bool   $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
  * @param string $curlGetInfo Optionally overrides curl_getinfo function
  *
  * @return Closure
  */
-function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo')
-{
+function get_curl_download_callback(
+    &$charset,
+    &$title,
+    &$description,
+    &$keywords,
+    $retrieveDescription,
+    $curlGetInfo = 'curl_getinfo'
+) {
     $isRedirected = false;
+    $currentChunk = 0;
+    $foundChunk = null;
+
     /**
      * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
      *
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
      *
      * @return int|bool length of $data or false if we need to stop the download
      */
-    return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) {
+    return function (&$ch, $data) use (
+        $retrieveDescription,
+        $curlGetInfo,
+        &$charset,
+        &$title,
+        &$description,
+        &$keywords,
+        &$isRedirected,
+        &$currentChunk,
+        &$foundChunk
+    ) {
+        $currentChunk++;
         $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
         if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
             $isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
         }
         if (empty($title)) {
             $title = html_extract_title($data);
+            $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
+        }
+        if ($retrieveDescription && empty($description)) {
+            $description = html_extract_tag('description', $data);
+            $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
         }
+        if ($retrieveDescription && empty($keywords)) {
+            $keywords = html_extract_tag('keywords', $data);
+            if (! empty($keywords)) {
+                $foundChunk = $currentChunk;
+                // Keywords use the format tag1, tag2 multiple words, tag
+                // So we format them to match Shaarli's separator and glue multiple words with '-'
+                $keywords = implode(' ', array_map(function($keyword) {
+                    return implode('-', preg_split('/\s+/', trim($keyword)));
+                }, explode(',', $keywords)));
+            }
+        }
+
         // We got everything we want, stop the download.
-        if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) {
+        // If we already found either the title, description or keywords,
+        // it's highly unlikely that we'll find the other metas further than
+        // in the same chunk of data or the next one. So we also stop the download after that.
+        if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
+            && (! $retrieveDescription
+                || $foundChunk < $currentChunk
+                || (!empty($title) && !empty($description) && !empty($keywords))
+            )
+        ) {
             return false;
         }
 
@@ -110,6 +158,35 @@ function html_extract_charset($html)
     return false;
 }
 
+/**
+ * Extract the content of a meta tag from HTML, accepting either form:
+ *   - OpenGraph: <meta property="og:[tag]" ...>
+ *   - Meta tag: <meta name="[tag]" ...>
+ *
+ * @param string $tag  Name of the tag to retrieve, matched literally (e.g. 'description').
+ * @param string $html HTML content where to look for the meta tag.
+ *
+ * @return bool|string Value of the content attribute if found, false otherwise.
+ */
+function html_extract_tag($tag, $html)
+{
+    // The name is escaped, and may sit in a property, name or itemprop attribute, with or without 'og:'.
+    $name = '(?:property|name|itemprop)=["\']?(?:og:)?'. preg_quote($tag, '#');
+    // Usual attribute order: <meta name="..." content="...">.
+    $ogRegex = '#<meta[^>]+'. $name .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
+    // If the attributes are not in the order property => content (e.g. Github).
+    // The 'og:' prefix must stay optional as a whole, otherwise plain meta tags never match.
+    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+'. $name .'["\'\s/>]#';
+
+    if (preg_match($ogRegex, $html, $matches) > 0
+        || preg_match($ogRegexReverse, $html, $matches) > 0
+    ) {
+        return $matches[1];
+    }
+
+    return false;
+}
+
 /**
  * Count private links in given linklist.
  *
-- 
cgit v1.2.3