diff options
author | ArthurHoaro <arthur@hoa.ro> | 2019-06-08 13:59:19 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2019-07-06 12:21:52 +0200 |
commit | 6a4872520cbbc012b5a8358cd50c78844afe8d07 (patch) | |
tree | bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark | |
parent | 5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff) | |
download | Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.gz Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.tar.zst Shaarli-6a4872520cbbc012b5a8358cd50c78844afe8d07.zip |
Automatically retrieve description for new bookmarks
If the option is enabled, it will try to find a meta tag containing
the page description and keywords, just like we do for the page title.
It will look for either regular meta tags or OpenGraph ones.
The option is disabled by default.
Note that keywords meta tags are mostly unused.
In `configure` template, the variable associated with this setting
is `$retrieve_description`.
Fixes #1302
Diffstat (limited to 'application/bookmark')
-rw-r--r-- | application/bookmark/LinkUtils.php | 85 |
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 35a5b290..77eb2d95 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php | |||
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB; | |||
7 | * | 7 | * |
8 | * @param string $charset to extract from the downloaded page (reference) | 8 | * @param string $charset to extract from the downloaded page (reference) |
9 | * @param string $title to extract from the downloaded page (reference) | 9 | * @param string $title to extract from the downloaded page (reference) |
10 | * @param string $description to extract from the downloaded page (reference) | ||
11 | * @param string $keywords to extract from the downloaded page (reference) | ||
12 | * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content | ||
10 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | 13 | * @param string $curlGetInfo Optionally overrides curl_getinfo function |
11 | * | 14 | * |
12 | * @return Closure | 15 | * @return Closure |
13 | */ | 16 | */ |
14 | function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') | 17 | function get_curl_download_callback( |
15 | { | 18 | &$charset, |
19 | &$title, | ||
20 | &$description, | ||
21 | &$keywords, | ||
22 | $retrieveDescription, | ||
23 | $curlGetInfo = 'curl_getinfo' | ||
24 | ) { | ||
16 | $isRedirected = false; | 25 | $isRedirected = false; |
26 | $currentChunk = 0; | ||
27 | $foundChunk = null; | ||
28 | |||
17 | /** | 29 | /** |
18 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). | 30 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). |
19 | * | 31 | * |
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get | |||
25 | * | 37 | * |
26 | * @return int|bool length of $data or false if we need to stop the download | 38 | * @return int|bool length of $data or false if we need to stop the download |
27 | */ | 39 | */ |
28 | return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { | 40 | return function (&$ch, $data) use ( |
41 | $retrieveDescription, | ||
42 | $curlGetInfo, | ||
43 | &$charset, | ||
44 | &$title, | ||
45 | &$description, | ||
46 | &$keywords, | ||
47 | &$isRedirected, | ||
48 | &$currentChunk, | ||
49 | &$foundChunk | ||
50 | ) { | ||
51 | $currentChunk++; | ||
29 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | 52 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); |
30 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | 53 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { |
31 | $isRedirected = true; | 54 | $isRedirected = true; |
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get | |||
50 | } | 73 | } |
51 | if (empty($title)) { | 74 | if (empty($title)) { |
52 | $title = html_extract_title($data); | 75 | $title = html_extract_title($data); |
76 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
77 | } | ||
78 | if ($retrieveDescription && empty($description)) { | ||
79 | $description = html_extract_tag('description', $data); | ||
80 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | ||
53 | } | 81 | } |
82 | if ($retrieveDescription && empty($keywords)) { | ||
83 | $keywords = html_extract_tag('keywords', $data); | ||
84 | if (! empty($keywords)) { | ||
85 | $foundChunk = $currentChunk; | ||
86 | // Keywords use the format tag1, tag2 multiple words, tag | ||
87 | // So we format them to match Shaarli's separator and glue multiple words with '-' | ||
88 | $keywords = implode(' ', array_map(function($keyword) { | ||
89 | return implode('-', preg_split('/\s+/', trim($keyword))); | ||
90 | }, explode(',', $keywords))); | ||
91 | } | ||
92 | } | ||
93 | |||
54 | // We got everything we want, stop the download. | 94 | // We got everything we want, stop the download. |
55 | if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { | 95 | // If we already found either the title, description or keywords, |
96 | // it's highly unlikely that we'll find the other metas further than | ||
97 | // in the same chunk of data or the next one. So we also stop the download after that. | ||
98 | if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | ||
99 | && (! $retrieveDescription | ||
100 | || $foundChunk < $currentChunk | ||
101 | || (!empty($title) && !empty($description) && !empty($keywords)) | ||
102 | ) | ||
103 | ) { | ||
56 | return false; | 104 | return false; |
57 | } | 105 | } |
58 | 106 | ||
@@ -111,6 +159,35 @@ function html_extract_charset($html) | |||
111 | } | 159 | } |
112 | 160 | ||
/**
 * Extract the content of a meta tag from HTML, looking for either:
 *   - OpenGraph: <meta property="og:[tag]" content="...">
 *   - Meta tag:  <meta name="[tag]" content="...">
 *
 * The key attribute may be `property`, `name` or `itemprop`, and may appear
 * either before or after the `content` attribute. Matching is case-insensitive.
 *
 * @param string $tag  Name of the meta tag to retrieve (e.g. 'description', 'keywords').
 * @param string $html HTML content in which to look for the meta tag.
 *
 * @return bool|string Value of the `content` attribute if found, false otherwise.
 */
function html_extract_tag($tag, $html)
{
    $properties = implode('|', ['property', 'name', 'itemprop']);

    // The tag name is matched literally: escape regex metacharacters and the '#' delimiter.
    $key = '(?:' . $properties . ')=["\']?(?:og:)?' . preg_quote($tag, '#');

    // Attribute value: double-quoted, single-quoted or unquoted.
    // A quoted value only ends at its own closing quote, so apostrophes, slashes
    // or '>' inside the text do not truncate it. The branch-reset group `(?|...)`
    // makes every alternative capture into group 1.
    $value = '(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))';

    // Usual attribute order: key first, then content.
    $regex = '#<meta[^>]+' . $key . '["\'\s][^>]*content=' . $value . '#i';
    // Reverse attribute order: content first, then key (e.g. Github).
    $regexReverse = '#<meta[^>]+content=' . $value . '[^>]*' . $key . '["\'\s/>]#i';

    foreach ([$regex, $regexReverse] as $pattern) {
        if (preg_match($pattern, $html, $matches) === 1) {
            return $matches[1];
        }
    }

    return false;
}
189 | |||
190 | /** | ||
114 | * Count private links in given linklist. | 191 | * Count private links in given linklist. |
115 | * | 192 | * |
116 | * @param array|Countable $links Linklist. | 193 | * @param array|Countable $links Linklist. |