diff options
author | ArthurHoaro <arthur@hoa.ro> | 2019-07-06 12:34:02 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-06 12:34:02 +0200 |
commit | c03c90a13e1356ca9cf40cc664547c49305cb24b (patch) | |
tree | bf7ce75b0c93a95d8e7a805b1ebfbe8d90a9565a /application/bookmark | |
parent | 5d8a958d5d139337546bb3f4091a6ef7592ea752 (diff) | |
parent | 6a4872520cbbc012b5a8358cd50c78844afe8d07 (diff) | |
download | Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.gz Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.tar.zst Shaarli-c03c90a13e1356ca9cf40cc664547c49305cb24b.zip |
Merge pull request #1313 from ArthurHoaro/feature/desc-retrieval
Automatically retrieve description for new bookmarks
Diffstat (limited to 'application/bookmark')
-rw-r--r-- | application/bookmark/LinkUtils.php | 85 |
1 files changed, 81 insertions, 4 deletions
diff --git a/application/bookmark/LinkUtils.php b/application/bookmark/LinkUtils.php index 35a5b290..77eb2d95 100644 --- a/application/bookmark/LinkUtils.php +++ b/application/bookmark/LinkUtils.php | |||
@@ -7,13 +7,25 @@ use Shaarli\Bookmark\LinkDB; | |||
7 | * | 7 | * |
8 | * @param string $charset to extract from the downloaded page (reference) | 8 | * @param string $charset to extract from the downloaded page (reference) |
9 | * @param string $title to extract from the downloaded page (reference) | 9 | * @param string $title to extract from the downloaded page (reference) |
10 | * @param string $description to extract from the downloaded page (reference) | ||
11 | * @param string $keywords to extract from the downloaded page (reference) | ||
12 | * @param bool $retrieveDescription Automatically tries to retrieve description and keywords from HTML content | ||
10 | * @param string $curlGetInfo Optionally overrides curl_getinfo function | 13 | * @param string $curlGetInfo Optionally overrides curl_getinfo function |
11 | * | 14 | * |
12 | * @return Closure | 15 | * @return Closure |
13 | */ | 16 | */ |
14 | function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_getinfo') | 17 | function get_curl_download_callback( |
15 | { | 18 | &$charset, |
19 | &$title, | ||
20 | &$description, | ||
21 | &$keywords, | ||
22 | $retrieveDescription, | ||
23 | $curlGetInfo = 'curl_getinfo' | ||
24 | ) { | ||
16 | $isRedirected = false; | 25 | $isRedirected = false; |
26 | $currentChunk = 0; | ||
27 | $foundChunk = null; | ||
28 | |||
17 | /** | 29 | /** |
18 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). | 30 | * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download). |
19 | * | 31 | * |
@@ -25,7 +37,18 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get | |||
25 | * | 37 | * |
26 | * @return int|bool length of $data or false if we need to stop the download | 38 | * @return int|bool length of $data or false if we need to stop the download |
27 | */ | 39 | */ |
28 | return function (&$ch, $data) use ($curlGetInfo, &$charset, &$title, &$isRedirected) { | 40 | return function (&$ch, $data) use ( |
41 | $retrieveDescription, | ||
42 | $curlGetInfo, | ||
43 | &$charset, | ||
44 | &$title, | ||
45 | &$description, | ||
46 | &$keywords, | ||
47 | &$isRedirected, | ||
48 | &$currentChunk, | ||
49 | &$foundChunk | ||
50 | ) { | ||
51 | $currentChunk++; | ||
29 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); | 52 | $responseCode = $curlGetInfo($ch, CURLINFO_RESPONSE_CODE); |
30 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { | 53 | if (!empty($responseCode) && in_array($responseCode, [301, 302])) { |
31 | $isRedirected = true; | 54 | $isRedirected = true; |
@@ -50,9 +73,34 @@ function get_curl_download_callback(&$charset, &$title, $curlGetInfo = 'curl_get | |||
50 | } | 73 | } |
51 | if (empty($title)) { | 74 | if (empty($title)) { |
52 | $title = html_extract_title($data); | 75 | $title = html_extract_title($data); |
76 | $foundChunk = ! empty($title) ? $currentChunk : $foundChunk; | ||
77 | } | ||
78 | if ($retrieveDescription && empty($description)) { | ||
79 | $description = html_extract_tag('description', $data); | ||
80 | $foundChunk = ! empty($description) ? $currentChunk : $foundChunk; | ||
53 | } | 81 | } |
82 | if ($retrieveDescription && empty($keywords)) { | ||
83 | $keywords = html_extract_tag('keywords', $data); | ||
84 | if (! empty($keywords)) { | ||
85 | $foundChunk = $currentChunk; | ||
86 | // Keywords use the format tag1, tag2 multiple words, tag | ||
87 | // So we format them to match Shaarli's separator and glue multiple words with '-' | ||
88 | $keywords = implode(' ', array_map(function($keyword) { | ||
89 | return implode('-', preg_split('/\s+/', trim($keyword))); | ||
90 | }, explode(',', $keywords))); | ||
91 | } | ||
92 | } | ||
93 | |||
54 | // We got everything we want, stop the download. | 94 | // We got everything we want, stop the download. |
55 | if (!empty($responseCode) && !empty($contentType) && !empty($charset) && !empty($title)) { | 95 | // If we already found either the title, description or keywords, |
96 | // it's highly unlikely that we'll found the other metas further than | ||
97 | // in the same chunk of data or the next one. So we also stop the download after that. | ||
98 | if ((!empty($responseCode) && !empty($contentType) && !empty($charset)) && $foundChunk !== null | ||
99 | && (! $retrieveDescription | ||
100 | || $foundChunk < $currentChunk | ||
101 | || (!empty($title) && !empty($description) && !empty($keywords)) | ||
102 | ) | ||
103 | ) { | ||
56 | return false; | 104 | return false; |
57 | } | 105 | } |
58 | 106 | ||
@@ -111,6 +159,35 @@ function html_extract_charset($html) | |||
111 | } | 159 | } |
112 | 160 | ||
113 | /** | 161 | /** |
162 | * Extract meta tag from HTML content in either: | ||
163 | * - OpenGraph: <meta property="og:[tag]" ...> | ||
164 | * - Meta tag: <meta name="[tag]" ...> | ||
165 | * | ||
166 | * @param string $tag Name of the tag to retrieve. | ||
167 | * @param string $html HTML content where to look for charset. | ||
168 | * | ||
169 | * @return bool|string Charset string if found, false otherwise. | ||
170 | */ | ||
171 | function html_extract_tag($tag, $html) | ||
172 | { | ||
173 | $propertiesKey = ['property', 'name', 'itemprop']; | ||
174 | $properties = implode('|', $propertiesKey); | ||
175 | // Try to retrieve OpenGraph image. | ||
176 | $ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#'; | ||
177 | // If the attributes are not in the order property => content (e.g. Github) | ||
178 | // New regex to keep this readable... more or less. | ||
179 | $ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#'; | ||
180 | |||
181 | if (preg_match($ogRegex, $html, $matches) > 0 | ||
182 | || preg_match($ogRegexReverse, $html, $matches) > 0 | ||
183 | ) { | ||
184 | return $matches[1]; | ||
185 | } | ||
186 | |||
187 | return false; | ||
188 | } | ||
189 | |||
190 | /** | ||
114 | * Count private links in given linklist. | 191 | * Count private links in given linklist. |
115 | * | 192 | * |
116 | * @param array|Countable $links Linklist. | 193 | * @param array|Countable $links Linklist. |