aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/bookmark
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2019-06-08 13:59:19 +0200
committerArthurHoaro <arthur@hoa.ro>2019-07-06 12:21:52 +0200
commit6a4872520cbbc012b5a8358cd50c78844afe8d07 (patch)
treebf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark
parent5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
downloadShaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.gz
Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.zst
Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.zip
Automatically retrieve description for new bookmarks
If the option is enabled, Shaarli will try to find meta tags containing the page description and keywords, just like we already do for the page title. It looks for either regular meta tags or OpenGraph ones. The option is disabled by default. Note that the keywords meta tag is rarely used by websites nowadays. In the `configure` template, the variable associated with this setting is `$retrieve_description`. Fixes #1302
Diffstat (limited to 'application/bookmark')
-rw-r--r--application/bookmark/LinkUtils.php85
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 35a5b290..77eb2d95 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
7 * 7 *
8 * @param string $charset to extract from the downloaded page (reference) 8 * @param string $charset to extract from the downloaded page (reference)
9 * @param string $title to extract from the downloaded page (reference) 9 * @param string $title to extract from the downloaded page (reference)
10 * @param string $description to extract from the downloaded page (reference)
11 * @param string $keywords to extract from the downloaded page (reference)
12 * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
10 * @param string $curlGetInfo Optionally overrides curl_getinfo function 13 * @param string $curlGetInfo Optionally overrides curl_getinfo function
11 * 14 *
12 * @return Closure 15 * @return Closure
13 */ 16 */
14function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') 17function get_curl_download_callback(
15{ 18 &$charset,
19 &$title,
20 &$description,
21 &$keywords,
22 $retrieveDescription,
23 $curlGetInfo = 'curl_getinfo'
24) {
16 $isRedirected = false; 25 $isRedirected = false;
26 $currentChunk = 0;
27 $foundChunk = null;
28
17 /** 29 /**
18 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). 30 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
19 * 31 *
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
25 * 37 *
26 * @return int|bool length of $data or false if we need to stop the download 38 * @return int|bool length of $data or false if we need to stop the download
27 */ 39 */
28 return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { 40 return function (&$ch, $data) use (
41 $retrieveDescription,
42 $curlGetInfo,
43 &$charset,
44 &$title,
45 &$description,
46 &$keywords,
47 &$isRedirected,
48 &$currentChunk,
49 &$foundChunk
50 ) {
51 $currentChunk++;
29 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); 52 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
30 if (!empty($responseCode) && in_array($responseCode, [301, 302])) { 53 if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
31 $isRedirected = true; 54 $isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
50 } 73 }
51 if (empty($title)) { 74 if (empty($title)) {
52 $title = html_extract_title($data); 75 $title = html_extract_title($data);
76 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
77 }
78 if ($retrieveDescription && empty($description)) {
79 $description = html_extract_tag('description', $data);
80 $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
53 } 81 }
82 if ($retrieveDescription && empty($keywords)) {
83 $keywords = html_extract_tag('keywords', $data);
84 if (! empty($keywords)) {
85 $foundChunk = $currentChunk;
86 // Keywords use the format tag1, tag2 multiple words, tag
87 // So we format them to match Shaarli's separator and glue multiple words with '-'
88 $keywords = implode(' ', array_map(function($keyword) {
89 return implode('-', preg_split('/\s+/', trim($keyword)));
90 }, explode(',', $keywords)));
91 }
92 }
93
54 // We got everything we want, stop the download. 94 // We got everything we want, stop the download.
55 if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { 95 // If we already found either the title, description or keywords,
96 // it's highly unlikely that we'll find the other metas further than
97 // in the same chunk of data or the next one. So we also stop the download after that.
98 if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
99 && (! $retrieveDescription
100 || $foundChunk < $currentChunk
101 || (!empty($title) && !empty($description) && !empty($keywords))
102 )
103 ) {
56 return false; 104 return false;
57 } 105 }
58 106
@@ -111,6 +159,35 @@ function html_extract_charset($html)
111} 159}
112 160
/**
 * Extract the content of a meta tag from HTML content, in either form:
 *   - OpenGraph: <meta property="og:[tag]" content="...">
 *   - Meta tag:  <meta name="[tag]" content="...">
 *
 * The identifying attribute may be `property`, `name` or `itemprop`, and it may
 * appear either before or after the `content` attribute.
 *
 * @param string $tag  Name of the tag to retrieve (e.g. 'description', 'keywords').
 * @param string $html HTML content where to look for the tag.
 *
 * @return bool|string Content of the tag if found, false otherwise.
 */
function html_extract_tag($tag, $html)
{
    $properties = implode('|', ['property', 'name', 'itemprop']);
    // The tag name is injected into a regex: escape it (including the '#' delimiter).
    $quotedTag = preg_quote($tag, '#');

    // Attribute order: property/name/itemprop first, then content.
    // NOTE: the lazy capture stops at the first quote, '/' or '>' so that
    // unquoted values are supported.
    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $quotedTag
        .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
    // If the attributes are not in the order property => content (e.g. Github).
    // The whole `og:` prefix is optional, so that regular meta tags
    // (<meta content="..." name="description">) match in this order too.
    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $quotedTag
        .'["\'\s/>]#';

    if (preg_match($ogRegex, $html, $matches) > 0
        || preg_match($ogRegexReverse, $html, $matches) > 0
    ) {
        return $matches[1];
    }

    return false;
}
189
190/**
114 * Count private links in given linklist. 191 * Count private links in given linklist.
115 * 192 *
116 * @param array|Countable $links Linklist. 193 * @param array|Countable $links Linklist.