]>
Commit | Line | Data |
---|---|---|
1 | <?php | |
2 | ||
3 | declare(strict_types=1); | |
4 | ||
5 | namespace Shaarli\Bookmark; | |
6 | ||
7 | use Shaarli\Bookmark\Exception\BookmarkNotFoundException; | |
8 | use Shaarli\Config\ConfigManager; | |
9 | use Shaarli\Plugin\PluginManager; | |
10 | ||
/**
 * Class BookmarkFilter.
 *
 * Performs search and filter operations on a list of bookmarks:
 * permalink lookup, full-text search, tag search, untagged-only listing,
 * each of them restricted by visibility (all / public / private).
 *
 * Every candidate bookmark is first submitted to
 * PluginManager::filterSearchEntry(); entries for which that call returns
 * a falsy value are skipped.
 */
class BookmarkFilter
{
    /**
     * @var string permalinks.
     */
    public static $FILTER_HASH = 'permalink';

    /**
     * @var string text search.
     */
    public static $FILTER_TEXT = 'fulltext';

    /**
     * @var string tag filter.
     */
    public static $FILTER_TAG = 'tags';

    /**
     * @var string default filter type; it matches no specific case in filter(),
     *             so only the visibility restriction is applied.
     */
    public static $DEFAULT = 'NO_FILTER';

    /** @var string Visibility: all */
    public static $ALL = 'all';

    /** @var string Visibility: public */
    public static $PUBLIC = 'public';

    /** @var string Visibility: private */
    public static $PRIVATE = 'private';

    /**
     * @var string Allowed characters for hashtags (regex syntax).
     */
    public static $HASHTAG_CHARS = '\p{Pc}\p{N}\p{L}\p{Mn}';

    /**
     * @var Bookmark[] all available bookmarks.
     */
    private $bookmarks;

    /** @var ConfigManager */
    protected $conf;

    /** @var PluginManager */
    protected $pluginManager;

    /**
     * @param Bookmark[]    $bookmarks     bookmarks to search in.
     * @param ConfigManager $conf          used to read `general.tags_separator`.
     * @param PluginManager $pluginManager consulted for every candidate bookmark.
     */
    public function __construct($bookmarks, ConfigManager $conf, PluginManager $pluginManager)
    {
        $this->bookmarks = $bookmarks;
        $this->conf = $conf;
        $this->pluginManager = $pluginManager;
    }

    /**
     * Filter bookmarks according to parameters.
     *
     * @param string $type          Type of filter (eg. tags, permalink, etc.).
     * @param mixed  $request       Filter content. For the combined tag + text type this is
     *                              an array: index 0 holds the tags, index 1 the search terms.
     * @param bool   $casesensitive Optional: Perform case sensitive filter if true.
     * @param string $visibility    Optional: return only all/private/public bookmarks.
     *                              Any other value is treated as 'all'.
     * @param bool   $untaggedonly  Optional: return only untagged bookmarks. Applies only if $type includes FILTER_TAG
     *
     * @return Bookmark[] filtered bookmark list.
     *
     * @throws BookmarkNotFoundException
     */
    public function filter(
        string $type,
        $request,
        bool $casesensitive = false,
        string $visibility = 'all',
        bool $untaggedonly = false
    ) {
        if (!in_array($visibility, ['all', 'public', 'private'], true)) {
            $visibility = 'all';
        }

        switch ($type) {
            case self::$FILTER_HASH:
                return $this->filterSmallHash($request);
            // Byte-wise OR of the two type strings: the key used for a combined tag + text search.
            case self::$FILTER_TAG | self::$FILTER_TEXT:
                $noRequest = empty($request) || (empty($request[0]) && empty($request[1]));
                if ($noRequest) {
                    if ($untaggedonly) {
                        return $this->filterUntagged($visibility);
                    }
                    return $this->noFilter($visibility);
                }
                if ($untaggedonly) {
                    $filtered = $this->filterUntagged($visibility);
                } else {
                    $filtered = $this->bookmarks;
                }
                // Each step narrows the previous result by running a fresh filter on it.
                if (!empty($request[0])) {
                    $filtered = (new self($filtered, $this->conf, $this->pluginManager))
                        ->filterTags($request[0], $casesensitive, $visibility)
                    ;
                }
                if (!empty($request[1])) {
                    $filtered = (new self($filtered, $this->conf, $this->pluginManager))
                        ->filterFulltext($request[1], $visibility)
                    ;
                }
                return $filtered;
            case self::$FILTER_TEXT:
                return $this->filterFulltext($request, $visibility);
            case self::$FILTER_TAG:
                if ($untaggedonly) {
                    return $this->filterUntagged($visibility);
                }
                return $this->filterTags($request, $casesensitive, $visibility);
            default:
                return $this->noFilter($visibility);
        }
    }

    /**
     * Unknown filter, but handle private only.
     *
     * Note: unlike the other filters, a visibility other than all/private/public
     * yields an empty result here.
     *
     * @param string $visibility Optional: return only all/private/public bookmarks
     *
     * @return Bookmark[] filtered bookmarks.
     */
    private function noFilter(string $visibility = 'all')
    {
        $out = [];
        foreach ($this->bookmarks as $key => $value) {
            if (
                !$this->pluginManager->filterSearchEntry(
                    $value,
                    ['source' => 'no_filter', 'visibility' => $visibility]
                )
            ) {
                continue;
            }

            if ($visibility === 'all') {
                $out[$key] = $value;
            } elseif ($value->isPrivate() && $visibility === 'private') {
                $out[$key] = $value;
            } elseif (!$value->isPrivate() && $visibility === 'public') {
                $out[$key] = $value;
            }
        }

        return $out;
    }

    /**
     * Returns the shaare corresponding to a smallHash.
     *
     * @param string $smallHash permalink hash.
     *
     * @return Bookmark[] $filtered array containing permalink data.
     *
     * @throws BookmarkNotFoundException if the smallhash doesn't match any link.
     */
    private function filterSmallHash(string $smallHash)
    {
        foreach ($this->bookmarks as $key => $l) {
            // Strict comparison: with `==`, two numeric-looking hashes
            // (e.g. "0e1234" and "0e5678") would be compared as numbers and match.
            if ($smallHash === $l->getShortUrl()) {
                // Yes, this is ugly and slow
                return [$key => $l];
            }
        }

        throw new BookmarkNotFoundException();
    }

    /**
     * Returns the list of bookmarks corresponding to a full-text search
     *
     * Searches:
     *  - in the title, description, URL and tags;
     *  - are case-insensitive;
     *  - terms surrounded by quotes " are exact terms search.
     *  - terms starting with a dash - are excluded (except exact terms).
     *
     * Example:
     *    print_r($mydb->filterFulltext('hollandais'));
     *
     * mb_convert_case($val, MB_CASE_LOWER, 'UTF-8')
     *  - allows to perform searches on Unicode text
     *  - see https://github.com/shaarli/Shaarli/issues/75 for examples
     *
     * Matching bookmarks receive a 'search_highlight' additional content entry
     * holding the positions of the matched terms, grouped by field.
     *
     * @param string $searchterms search query.
     * @param string $visibility  Optional: return only all/private/public bookmarks.
     *
     * @return Bookmark[] search results.
     */
    private function filterFulltext(string $searchterms, string $visibility = 'all')
    {
        if (empty($searchterms)) {
            return $this->noFilter($visibility);
        }

        $filtered = [];
        $query = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
        $exactRegex = '/"([^"]+)"/';
        // Retrieve exact search terms.
        preg_match_all($exactRegex, $query, $exactMatches);
        $exactSearch = array_values(array_filter($exactMatches[1]));

        // Remove exact search terms to get AND terms search.
        $explodedSearchAnd = explode(' ', trim(preg_replace($exactRegex, '', $query)));
        $explodedSearchAnd = array_values(array_filter($explodedSearchAnd));

        // Filter excluding terms and update andSearch.
        $excludeSearch = [];
        $andSearch = [];
        foreach ($explodedSearchAnd as $needle) {
            if ($needle[0] === '-' && strlen($needle) > 1) {
                $excludeSearch[] = substr($needle, 1);
            } else {
                $andSearch[] = $needle;
            }
        }

        // Exact terms are checked first, then the AND terms: all of them must be found.
        $requiredTerms = array_merge($exactSearch, $andSearch);

        // Iterate over every stored link.
        foreach ($this->bookmarks as $id => $bookmark) {
            if (
                !$this->pluginManager->filterSearchEntry(
                    $bookmark,
                    [
                        'source' => 'fulltext',
                        'searchterms' => $searchterms,
                        'andSearch' => $andSearch,
                        'exactSearch' => $exactSearch,
                        'excludeSearch' => $excludeSearch,
                        'visibility' => $visibility
                    ]
                )
            ) {
                continue;
            }

            if ($this->isHiddenByVisibility($bookmark, $visibility)) {
                continue;
            }

            $lengths = [];
            $content = $this->buildFullTextSearchableLink($bookmark, $lengths);

            // All or nothing: stop at the first required term that is missing.
            $matches = true;
            $foundPositions = [];
            foreach ($requiredTerms as $term) {
                $position = mb_strpos($content, $term);
                if ($position === false) {
                    $matches = false;
                    break;
                }

                $foundPositions[] = ['start' => $position, 'end' => $position + mb_strlen($term)];
            }

            // Exclude terms: a single hit rejects the bookmark.
            // A byte-wise search is enough for a yes/no containment check.
            if ($matches) {
                foreach ($excludeSearch as $excluded) {
                    if (strpos($content, $excluded) !== false) {
                        $matches = false;
                        break;
                    }
                }
            }

            if (!$matches) {
                continue;
            }

            $bookmark->addAdditionalContentEntry(
                'search_highlight',
                $this->postProcessFoundPositions($lengths, $foundPositions)
            );

            $filtered[$id] = $bookmark;
        }

        return $filtered;
    }

    /**
     * Returns the list of bookmarks associated with a given list of tags
     *
     * You can specify one or more tags, separated by the configured tags separator, e.g.
     *    print_r($mydb->filterTags('linux programming'));
     *
     * Hashtags found in a bookmark description (`#tag`) are searched as if they were tags.
     *
     * @param string|array $tags          list of tags, separated by the tags separator if passed as string.
     * @param bool         $casesensitive ignore case if false.
     * @param string       $visibility    Optional: return only all/private/public bookmarks.
     *
     * @return Bookmark[] filtered bookmarks.
     */
    public function filterTags($tags, bool $casesensitive = false, string $visibility = 'all')
    {
        $tagsSeparator = $this->conf->get('general.tags_separator', ' ');
        // get single tags (we may get passed an array, even though the docs say different)
        $inputTags = $tags;
        if (!is_array($tags)) {
            // we got an input string, split tags
            $inputTags = tags_str2array($inputTags, $tagsSeparator);
        }

        if (count($inputTags) === 0) {
            // no input tags
            return $this->noFilter($visibility);
        }

        // If we only have public visibility, we can't look for hidden tags
        if ($visibility === self::$PUBLIC) {
            $inputTags = array_values(array_filter($inputTags, function ($tag) {
                return ! startsWith($tag, '.');
            }));

            if (empty($inputTags)) {
                return [];
            }
        }

        // build regex from all tags: one lookahead per tag, all anchored at the start
        $re = '/^' . implode('', array_map([$this, 'tag2regex'], $inputTags)) . '.*$/';
        if (!$casesensitive) {
            // make regex case insensitive
            $re .= 'i';
        }

        // `u` is required for the \p{..} classes to work on UTF-8 characters instead of single bytes.
        $hashtagRegex = '/(?<![' . self::$HASHTAG_CHARS . '])#([' . self::$HASHTAG_CHARS . ']+?)\b/smu';

        // create resulting array
        $filtered = [];

        // iterate over each link
        foreach ($this->bookmarks as $key => $bookmark) {
            if (
                !$this->pluginManager->filterSearchEntry(
                    $bookmark,
                    [
                        'source' => 'tags',
                        'tags' => $tags,
                        'casesensitive' => $casesensitive,
                        'visibility' => $visibility
                    ]
                )
            ) {
                continue;
            }

            // check level of visibility
            if ($this->isHiddenByVisibility($bookmark, $visibility)) {
                continue;
            }

            // build search string, start with tags of current link
            $search = $bookmark->getTagsString($tagsSeparator);
            $description = $bookmark->getDescription();
            if (strpos($description, '#') !== false) {
                // at least one possible tag found: collect every #tag of the description
                $descTags = [];
                preg_match_all($hashtagRegex, $description, $descTags);
                // $descTags[1] may be missing if matching failed (e.g. invalid UTF-8 input)
                if (!empty($descTags[1])) {
                    // there were some tags in the description, add them to the search string
                    $search .= $tagsSeparator . tags_array2str($descTags[1], $tagsSeparator);
                }
            }

            // match regular expression with search string
            if (!preg_match($re, $search)) {
                // this entry does _not_ match our regex
                continue;
            }
            $filtered[$key] = $bookmark;
        }

        return $filtered;
    }

    /**
     * Return only bookmarks without any tag.
     *
     * @param string $visibility return only all/private/public bookmarks.
     *
     * @return Bookmark[] filtered bookmarks.
     */
    public function filterUntagged(string $visibility)
    {
        $filtered = [];
        foreach ($this->bookmarks as $key => $bookmark) {
            if (
                !$this->pluginManager->filterSearchEntry(
                    $bookmark,
                    ['source' => 'untagged', 'visibility' => $visibility]
                )
            ) {
                continue;
            }

            if ($this->isHiddenByVisibility($bookmark, $visibility)) {
                continue;
            }

            if (empty($bookmark->getTags())) {
                $filtered[$key] = $bookmark;
            }
        }

        return $filtered;
    }

    /**
     * Convert a list of tags (str) to an array. Also
     * - handle case sensitivity.
     * - accepts spaces and commas as separators.
     *
     * @param string $tags          string containing a list of tags.
     * @param bool   $casesensitive will convert everything to lowercase if false.
     *
     * @return string[] list of tags, without empty entries.
     */
    public static function tagsStrToArray(string $tags, bool $casesensitive): array
    {
        // We use UTF-8 conversion to handle various graphemes (i.e. cyrillic, or greek)
        $tagsOut = $casesensitive ? $tags : mb_convert_case($tags, MB_CASE_LOWER, 'UTF-8');
        $tagsOut = str_replace(',', ' ', $tagsOut);

        // preg_split() returns false on failure, which would violate the declared return type.
        return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY) ?: [];
    }

    /**
     * generate a regex fragment out of a tag
     *
     * The fragment is a lookahead (negative if the tag starts with '-') requiring the tag
     * to be delimited by the tags separator or by the beginning/end of the string.
     *
     * @param string $tag to generate regexs from. may start with '-' to negate, contain '*' as wildcard
     *
     * @return string generated regex fragment
     */
    protected function tag2regex(string $tag): string
    {
        if ($tag === '' || $tag === '-' || $tag === '*') {
            // nothing to search, return empty regex
            return '';
        }

        // The separator is user configuration: escape it so that characters such as
        // '|', '.' or '/' are matched literally instead of altering the pattern.
        $separator = preg_quote($this->conf->get('general.tags_separator', ' '), '/');

        if ($tag[0] === '-') {
            // query is negated: negative lookahead, and skip the '-' character
            $lookahead = '(?!';
            $name = substr($tag, 1);
        } else {
            $lookahead = '(?=';
            $name = $tag;
        }

        // Every '*' becomes "anything but the separator"; the literal parts are escaped
        // to prevent conflicts with regex syntax.
        $literals = array_map(
            function (string $part): string {
                return preg_quote($part, '/');
            },
            explode('*', $name)
        );

        // before tag may only be the separator or the beginning
        return $lookahead . '.*(?:^|' . $separator . ')'
            . implode('[^' . $separator . ']*?', $literals)
            // after the tag may only be the separator or the end
            . '(?:$|' . $separator . '))';
    }

    /**
     * This method finalizes the content of the foundPositions array,
     * by associating all search results to their bookmark field,
     * making sure that there are no overlapping results, etc.
     *
     * @param array $fieldLengths   Start and end positions of every bookmark fields in the aggregated bookmark content.
     * @param array $foundPositions Positions where the search results were found in the aggregated content.
     *
     * @return array Updated $foundPositions, by bookmark field, relative to the start of that field.
     */
    protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
    {
        // Sort results by starting position ASC.
        usort($foundPositions, function (array $entryA, array $entryB): int {
            return $entryA['start'] <=> $entryB['start'];
        });

        $out = [];
        $currentMax = -1;
        foreach ($foundPositions as $foundPosition) {
            // we do not allow overlapping highlights
            if ($foundPosition['start'] < $currentMax) {
                continue;
            }

            $currentMax = $foundPosition['end'];
            foreach ($fieldLengths as $part => $length) {
                if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) {
                    continue;
                }

                // A match is attributed to the first field containing its start position.
                $out[$part][] = [
                    'start' => $foundPosition['start'] - $length['start'],
                    'end' => $foundPosition['end'] - $length['start'],
                ];
                break;
            }
        }

        return $out;
    }

    /**
     * Concatenate link fields to search across fields. Adds a '\' separator for exact search terms.
     * Also populate $length array with starting and ending positions of every bookmark field
     * inside concatenated content.
     *
     * @param Bookmark $link
     * @param array    $lengths (by reference)
     *
     * @return string Lowercase concatenated fields content.
     */
    protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
    {
        $tagString = $link->getTagsString($this->conf->get('general.tags_separator', ' '));
        $fields = [
            'title' => mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8'),
            'description' => mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8'),
            'url' => mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8'),
            'tags' => mb_convert_case($tagString, MB_CASE_LOWER, 'UTF-8'),
        ];

        $content = '';
        $nextField = 0;
        foreach ($fields as $name => $value) {
            // Offsets are measured on the lowercased value, i.e. on what is actually searched:
            // case conversion may change the number of characters.
            $end = $nextField + mb_strlen($value);
            $lengths[$name] = ['start' => $nextField, 'end' => $end];
            $content .= $value . '\\';
            // skip the '\' separator
            $nextField = $end + 1;
        }

        return $content;
    }

    /**
     * Tell whether a bookmark must be skipped for the requested visibility.
     *
     * Only 'private' and 'public' exclude anything; any other value hides nothing.
     *
     * @param Bookmark $bookmark   bookmark to check.
     * @param string   $visibility all/private/public.
     *
     * @return bool true if the bookmark must be left out of the results.
     */
    private function isHiddenByVisibility(Bookmark $bookmark, string $visibility): bool
    {
        if ($visibility === 'private') {
            return !$bookmark->isPrivate();
        }
        if ($visibility === 'public') {
            return (bool) $bookmark->isPrivate();
        }

        return false;
    }
}