aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/bookmark/LinkUtils.php
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2019-07-06 12:34:02 +0200
committerGitHub <noreply@github.com>2019-07-06 12:34:02 +0200
commitc03c90a13e1356ca9cf40cc664547c49305cb24b (patch)
treebf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark/LinkUtils.php
parent5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff)
parent6a4872520cbbc012b5a8358cd50c78844afe8d07 (diff)
downloadShaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.gz
Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.zst
Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.zip
Merge pull request #1313 from ArthurHoaro/feature/desc-retrieval
Automatically retrieve description for new bookmarks
Diffstat (limited to 'application/bookmark/LinkUtils.php')
-rw-r--r--application/bookmark/LinkUtils.php85
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php
index 35a5b290..77eb2d95 100644
--- a/application/bookmark/LinkUtils.php
+++ b/application/bookmark/LinkUtils.php
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB;
7 * 7 *
8 * @param string $charset to extract from the downloaded page (reference) 8 * @param string $charset to extract from the downloaded page (reference)
9 * @param string $title to extract from the downloaded page (reference) 9 * @param string $title to extract from the downloaded page (reference)
10 * @param string $description to extract from the downloaded page (reference)
11 * @param string $keywords to extract from the downloaded page (reference)
12 * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content
10 * @param string $curlGetInfo Optionally overrides curl_getinfo function 13 * @param string $curlGetInfo Optionally overrides curl_getinfo function
11 * 14 *
12 * @return Closure 15 * @return Closure
13 */ 16 */
14function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') 17function get_curl_download_callback(
15{ 18 &$charset,
19 &$title,
20 &$description,
21 &$keywords,
22 $retrieveDescription,
23 $curlGetInfo = 'curl_getinfo'
24) {
16 $isRedirected = false; 25 $isRedirected = false;
26 $currentChunk = 0;
27 $foundChunk = null;
28
17 /** 29 /**
18 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). 30 * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
19 * 31 *
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
25 * 37 *
26 * @return int|bool length of $data or false if we need to stop the download 38 * @return int|bool length of $data or false if we need to stop the download
27 */ 39 */
28 return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { 40 return function (&$ch, $data) use (
41 $retrieveDescription,
42 $curlGetInfo,
43 &$charset,
44 &$title,
45 &$description,
46 &$keywords,
47 &$isRedirected,
48 &$currentChunk,
49 &$foundChunk
50 ) {
51 $currentChunk++;
29 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); 52 $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE);
30 if (!empty($responseCode) && in_array($responseCode, [301, 302])) { 53 if (!empty($responseCode) && in_array($responseCode, [301, 302])) {
31 $isRedirected = true; 54 $isRedirected = true;
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get
50 } 73 }
51 if (empty($title)) { 74 if (empty($title)) {
52 $title = html_extract_title($data); 75 $title = html_extract_title($data);
76 $foundChunk = ! empty($title) ? $currentChunk : $foundChunk;
77 }
78 if ($retrieveDescription && empty($description)) {
79 $description = html_extract_tag('description', $data);
80 $foundChunk = ! empty($description) ? $currentChunk : $foundChunk;
53 } 81 }
82 if ($retrieveDescription && empty($keywords)) {
83 $keywords = html_extract_tag('keywords', $data);
84 if (! empty($keywords)) {
85 $foundChunk = $currentChunk;
86 // Keywords use the format tag1, tag2 multiple words, tag
87 // So we format them to match Shaarli's separator and glue multiple words with '-'
88 $keywords = implode(' ', array_map(function($keyword) {
89 return implode('-', preg_split('/\s+/', trim($keyword)));
90 }, explode(',', $keywords)));
91 }
92 }
93
54 // We got everything we want, stop the download. 94 // We got everything we want, stop the download.
55 if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { 95 // If we already found either the title, description or keywords,
96 // it's highly unlikely that we'll find the other metas any further than
97 // the same chunk of data or the next one. So we also stop the download after that.
98 if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null
99 && (! $retrieveDescription
100 || $foundChunk < $currentChunk
101 || (!empty($title) && !empty($description) && !empty($keywords))
102 )
103 ) {
56 return false; 104 return false;
57 } 105 }
58 106
@@ -111,6 +159,35 @@ function html_extract_charset($html)
111} 159}
112 160
/**
 * Extract the content of a meta tag from HTML content, looking at either:
 *   - OpenGraph: <meta property="og:[tag]" content="...">
 *   - Meta tag:  <meta name="[tag]" content="...">
 *
 * The attribute naming the tag may be `property`, `name` or `itemprop`, and it may
 * appear either before or after the `content` attribute.
 *
 * @param string $tag  Name of the tag to retrieve (e.g. 'description', 'keywords').
 * @param string $html HTML content where to look for the meta tag.
 *
 * @return bool|string Value of the content attribute if found, false otherwise.
 */
function html_extract_tag($tag, $html)
{
    $propertiesKey = ['property', 'name', 'itemprop'];
    $properties = implode('|', $propertiesKey);
    // The tag name is injected into the patterns below: escape it so it is matched literally.
    $tag = preg_quote($tag, '#');
    // Attributes in the order property => content.
    $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
    // Attributes in the order content => property (e.g. Github).
    // The whole "og:" prefix (colon included) is optional, so that a plain
    // <meta content="..." name="[tag]"> is matched as well.
    $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s/>]#';

    if (preg_match($ogRegex, $html, $matches) > 0
        || preg_match($ogRegexReverse, $html, $matches) > 0
    ) {
        return $matches[1];
    }

    return false;
}
189
190/**
114 * Count private links in given linklist. 191 * Count private links in given linklist.
115 * 192 *
116 * @param array|Countable $links Linklist. 193 * @param array|Countable $links Linklist.