]> git.immae.eu Git - github/shaarli/Shaarli.git/blob - application/bookmark/BookmarkFilter.php
New plugin hook: ability to add custom filters to Shaarli search engine
[github/shaarli/Shaarli.git] / application / bookmark / BookmarkFilter.php
1 <?php
2
3 declare(strict_types=1);
4
5 namespace Shaarli\Bookmark;
6
7 use Shaarli\Bookmark\Exception\BookmarkNotFoundException;
8 use Shaarli\Config\ConfigManager;
9 use Shaarli\Plugin\PluginManager;
10
11 /**
12 * Class BookmarkFilter.
13 *
14 * Perform search and filter operation on link data list.
15 */
16 class BookmarkFilter
17 {
18 /**
19 * @var string permalinks.
20 */
21 public static $FILTER_HASH = 'permalink';
22
23 /**
24 * @var string text search.
25 */
26 public static $FILTER_TEXT = 'fulltext';
27
28 /**
29 * @var string tag filter.
30 */
31 public static $FILTER_TAG = 'tags';
32
33 /**
34 * @var string default filter type: no specific filter is applied.
35 */
36 public static $DEFAULT = 'NO_FILTER';
37
38 /** @var string Visibility: all */
39 public static $ALL = 'all';
40
41 /** @var string Visibility: public */
42 public static $PUBLIC = 'public';
43
44 /** @var string Visibility: private */
45 public static $PRIVATE = 'private';
46
47 /**
48 * @var string Allowed characters for hashtags (regex syntax).
49 */
50 public static $HASHTAG_CHARS = '\p{Pc}\p{N}\p{L}\p{Mn}';
51
52 /**
53 * @var Bookmark[] all available bookmarks.
54 */
55 private $bookmarks;
56
57 /** @var ConfigManager */
58 protected $conf;
59
60 /** @var PluginManager */
61 protected $pluginManager;
62
/**
 * Bind the filter to a bookmark collection and its collaborators.
 *
 * @param Bookmark[]    $bookmarks     Bookmarks that every filter method of this instance iterates over.
 * @param ConfigManager $conf          Configuration source (read for `general.tags_separator`).
 * @param PluginManager $pluginManager Plugin manager whose `filterSearchEntry()` hook is called per bookmark.
 */
public function __construct($bookmarks, ConfigManager $conf, PluginManager $pluginManager)
{
    $this->pluginManager = $pluginManager;
    $this->conf = $conf;
    $this->bookmarks = $bookmarks;
}
72
/**
 * Entry point of the search engine: dispatch the request to the filter matching $type.
 *
 * Supported types are the static FILTER_* values of this class, plus the combined
 * "tags + fulltext" type obtained with `self::$FILTER_TAG | self::$FILTER_TEXT`.
 * Any other type returns the bookmarks restricted by visibility only.
 *
 * @param string $type          Type of filter (eg. tags, permalink, etc.).
 * @param mixed  $request       Filter content. For the combined type, index 0 holds the tags
 *                              and index 1 holds the fulltext search terms.
 * @param bool   $casesensitive Optional: perform a case sensitive tag filter if true.
 * @param string $visibility    Optional: all/private/public; any other value is treated as "all".
 * @param bool   $untaggedonly  Optional: return only untagged bookmarks. Only applies when $type includes FILTER_TAG.
 *
 * @return Bookmark[] filtered bookmark list.
 *
 * @throws BookmarkNotFoundException raised by the permalink filter when no bookmark matches.
 */
public function filter(
    string $type,
    $request,
    bool $casesensitive = false,
    string $visibility = 'all',
    bool $untaggedonly = false
) {
    // Unknown visibility values silently fall back to "all".
    $visibility = in_array($visibility, ['all', 'public', 'private'], true) ? $visibility : 'all';

    if ($type === self::$FILTER_HASH) {
        return $this->filterSmallHash($request);
    }

    // Bitwise OR of the two type strings: this is the value callers pass for a combined search.
    if ($type === (self::$FILTER_TAG | self::$FILTER_TEXT)) {
        $hasTags = !empty($request[0]);
        $hasText = !empty($request[1]);

        if (empty($request) || (!$hasTags && !$hasText)) {
            // Nothing to search for: only the untagged/visibility restrictions apply.
            return $untaggedonly ? $this->filterUntagged($visibility) : $this->noFilter($visibility);
        }

        // Each requested filter narrows down the result of the previous one.
        $matches = $untaggedonly ? $this->filterUntagged($visibility) : $this->bookmarks;
        if ($hasTags) {
            $tagFilter = new BookmarkFilter($matches, $this->conf, $this->pluginManager);
            $matches = $tagFilter->filterTags($request[0], $casesensitive, $visibility);
        }
        if ($hasText) {
            $textFilter = new BookmarkFilter($matches, $this->conf, $this->pluginManager);
            $matches = $textFilter->filterFulltext($request[1], $visibility);
        }

        return $matches;
    }

    if ($type === self::$FILTER_TEXT) {
        return $this->filterFulltext($request, $visibility);
    }

    if ($type === self::$FILTER_TAG) {
        return $untaggedonly
            ? $this->filterUntagged($visibility)
            : $this->filterTags($request, $casesensitive, $visibility);
    }

    return $this->noFilter($visibility);
}
136
/**
 * Fallback filter: no search criteria, only the plugin hook and the visibility restriction apply.
 *
 * @param string $visibility Optional: return only all/private/public bookmarks.
 *
 * @return Bookmark[] bookmarks accepted by the plugin hook and matching the visibility, original keys preserved.
 */
private function noFilter(string $visibility = 'all')
{
    $kept = [];
    foreach ($this->bookmarks as $id => $bookmark) {
        $hookContext = ['source' => 'no_filter', 'visibility' => $visibility];
        // The plugin hook is consulted first; a falsy answer discards the bookmark.
        if (!$this->pluginManager->filterSearchEntry($bookmark, $hookContext)) {
            continue;
        }

        $visible = $visibility === 'all'
            || ($bookmark->isPrivate() ? $visibility === 'private' : $visibility === 'public');
        if ($visible) {
            $kept[$id] = $bookmark;
        }
    }

    return $kept;
}
168
/**
 * Returns the shaare corresponding to a smallHash.
 *
 * The lookup is a linear scan over all bookmarks; the first bookmark whose
 * short URL is identical to $smallHash is returned.
 *
 * @param string $smallHash permalink hash.
 *
 * @return Bookmark[] single-entry array (original key => bookmark) containing the permalink data.
 *
 * @throws BookmarkNotFoundException if the smallhash doesn't match any link.
 */
private function filterSmallHash(string $smallHash)
{
    foreach ($this->bookmarks as $key => $bookmark) {
        // Strict comparison on purpose: with ==, two numeric-looking strings are compared
        // as numbers (e.g. "0e1234" == "0e5678"), which could return the wrong bookmark.
        if ($smallHash === $bookmark->getShortUrl()) {
            return [$key => $bookmark];
        }
    }

    throw new BookmarkNotFoundException();
}
189
/**
 * Returns the list of bookmarks corresponding to a full-text search.
 *
 * Searches:
 *  - look into the concatenated content built by buildFullTextSearchableLink();
 *  - are case-insensitive (both the query and the content are lowercased with
 *    mb_convert_case(), which also works on Unicode text,
 *    see https://github.com/shaarli/Shaarli/issues/75 for examples);
 *  - terms surrounded by quotes " are exact terms search;
 *  - terms starting with a dash - are excluded (except exact terms);
 *  - all exact and regular terms must be found for a bookmark to match.
 *
 * For every matching bookmark, the positions of the found terms are stored in its
 * 'search_highlight' additional content entry.
 *
 * Example:
 *    print_r($mydb->filterFulltext('hollandais'));
 *
 * @param string $searchterms search query.
 * @param string $visibility  Optional: return only all/private/public bookmarks.
 *
 * @return Bookmark[] search results, original keys preserved.
 */
private function filterFulltext(string $searchterms, string $visibility = 'all')
{
    // Compare with '' instead of using empty(): empty('0') is true, which used to
    // turn a search for "0" into "no filter at all".
    if ($searchterms === '') {
        return $this->noFilter($visibility);
    }

    $filtered = [];
    $query = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
    $exactRegex = '/"([^"]+)"/';

    // Retrieve exact search terms. The capture group requires at least one character,
    // so no empty term can be captured and no further filtering is needed.
    preg_match_all($exactRegex, $query, $exactMatches);
    $exactSearch = $exactMatches[1] ?? [];

    // Remove exact search terms, then split what is left into AND terms and excluded terms.
    $remaining = trim((string) preg_replace($exactRegex, '', $query));
    $excludeSearch = [];
    $andSearch = [];
    foreach (explode(' ', $remaining) as $needle) {
        if ($needle === '') {
            // Consecutive spaces produce empty chunks; a literal "0" term is kept.
            continue;
        }

        if ($needle[0] === '-' && strlen($needle) > 1) {
            $excludeSearch[] = substr($needle, 1);
        } else {
            $andSearch[] = $needle;
        }
    }

    // Iterate over every stored link.
    foreach ($this->bookmarks as $id => $bookmark) {
        if (
            !$this->pluginManager->filterSearchEntry(
                $bookmark,
                [
                    'source' => 'fulltext',
                    'searchterms' => $searchterms,
                    'andSearch' => $andSearch,
                    'exactSearch' => $exactSearch,
                    'excludeSearch' => $excludeSearch,
                    'visibility' => $visibility
                ]
            )
        ) {
            continue;
        }

        // Ignore bookmarks which do not match the requested visibility.
        if ($visibility !== 'all') {
            if (!$bookmark->isPrivate() && $visibility === 'private') {
                continue;
            } elseif ($bookmark->isPrivate() && $visibility === 'public') {
                continue;
            }
        }

        $lengths = [];
        $content = $this->buildFullTextSearchableLink($bookmark, $lengths);

        // Be optimistic.
        $found = true;
        $foundPositions = [];

        // Exact terms first, then keywords. We want all or nothing:
        // as soon as one term is missing, the bookmark is rejected.
        foreach ([$exactSearch, $andSearch] as $terms) {
            foreach ($terms as $term) {
                $position = mb_strpos($content, $term);
                if ($position === false) {
                    $found = false;
                    break 2;
                }

                $foundPositions[] = ['start' => $position, 'end' => $position + mb_strlen($term)];
            }
        }

        // Exclude terms: a single occurrence of any of them rejects the bookmark.
        if ($found) {
            foreach ($excludeSearch as $excluded) {
                if (strpos($content, $excluded) !== false) {
                    $found = false;
                    break;
                }
            }
        }

        if ($found) {
            $bookmark->addAdditionalContentEntry(
                'search_highlight',
                $this->postProcessFoundPositions($lengths, $foundPositions)
            );

            $filtered[$id] = $bookmark;
        }
    }

    return $filtered;
}
304
/**
 * Returns the list of bookmarks associated with a given list of tags.
 *
 * Every requested tag is converted to a regex lookahead by tag2regex() and the resulting
 * pattern is matched against the bookmark's tag string, extended with the #hashtags
 * found in its description.
 *
 * Example:
 *    print_r($mydb->filterTags('linux programming'));
 *
 * @param string|array $tags          list of tags; a string is split with the configured tags separator.
 * @param bool         $casesensitive ignore case if false.
 * @param string       $visibility    Optional: return only all/private/public bookmarks.
 *
 * @return Bookmark[] filtered bookmarks, original keys preserved.
 */
public function filterTags($tags, bool $casesensitive = false, string $visibility = 'all')
{
    $separator = $this->conf->get('general.tags_separator', ' ');

    // Callers may hand over either a ready-made array or a raw string to split.
    $wantedTags = is_array($tags) ? $tags : tags_str2array($tags, $separator);

    if (count($wantedTags) === 0) {
        // no input tags
        return $this->noFilter($visibility);
    }

    // With public visibility, tags starting with a dot cannot be searched for.
    if ($visibility === self::$PUBLIC) {
        $searchableTags = [];
        foreach ($wantedTags as $tag) {
            if (! startsWith($tag, '.')) {
                $searchableTags[] = $tag;
            }
        }
        $wantedTags = $searchableTags;

        if (empty($wantedTags)) {
            return [];
        }
    }

    // One lookahead per tag, all anchored at the beginning of the searched string.
    $pattern = '/^' . implode(array_map([$this, 'tag2regex'], $wantedTags)) . '.*$/';
    if (!$casesensitive) {
        $pattern .= 'i';
    }

    $matching = [];
    foreach ($this->bookmarks as $id => $bookmark) {
        $hookContext = [
            'source' => 'tags',
            'tags' => $tags,
            'casesensitive' => $casesensitive,
            'visibility' => $visibility
        ];
        if (!$this->pluginManager->filterSearchEntry($bookmark, $hookContext)) {
            continue;
        }

        // Skip bookmarks which do not match the requested visibility.
        $hidden = $visibility === 'private'
            ? !$bookmark->isPrivate()
            : ($visibility === 'public' && $bookmark->isPrivate());
        if ($hidden) {
            continue;
        }

        // The searched string starts with the bookmark's own tags...
        $haystack = $bookmark->getTagsString($separator);

        // ...and is completed with the #hashtags of the description, if any.
        $description = $bookmark->getDescription();
        if (strlen(trim($description)) && strpos($description, '#') !== false) {
            $hashtags = [];
            preg_match_all(
                '/(?<![' . self::$HASHTAG_CHARS . '])#([' . self::$HASHTAG_CHARS . ']+?)\b/sm',
                $description,
                $hashtags
            );
            if (count($hashtags[1])) {
                $haystack .= $separator . tags_array2str($hashtags[1], $separator);
            }
        }

        if (preg_match($pattern, $haystack)) {
            $matching[$id] = $bookmark;
        }
    }

    return $matching;
}
404
/**
 * Keep only the bookmarks which have no tag at all.
 *
 * @param string $visibility return only all/private/public bookmarks.
 *
 * @return Bookmark[] untagged bookmarks accepted by the plugin hook, original keys preserved.
 */
public function filterUntagged(string $visibility)
{
    $untagged = [];
    foreach ($this->bookmarks as $id => $entry) {
        $hookContext = ['source' => 'untagged', 'visibility' => $visibility];
        if (!$this->pluginManager->filterSearchEntry($entry, $hookContext)) {
            continue;
        }

        // A bookmark is hidden when its privacy contradicts the requested visibility.
        $hidden = $visibility === 'private'
            ? !$entry->isPrivate()
            : ($visibility === 'public' && $entry->isPrivate());

        if (!$hidden && empty($entry->getTags())) {
            $untagged[$id] = $entry;
        }
    }

    return $untagged;
}
440
/**
 * Split a raw tag list into individual tags.
 *
 *  - tags may be separated by commas and/or whitespace, in any amount;
 *  - empty chunks are dropped;
 *  - unless $casesensitive is true, tags are lowercased (UTF-8 aware,
 *    so cyrillic or greek tags are handled too).
 *
 * @param string $tags          string containing a list of tags.
 * @param bool   $casesensitive will convert everything to lowercase if false.
 *
 * @return string[] list of tags.
 */
public static function tagsStrToArray(string $tags, bool $casesensitive): array
{
    if (!$casesensitive) {
        $tags = mb_convert_case($tags, MB_CASE_LOWER, 'UTF-8');
    }

    // Commas and whitespace are interchangeable delimiters.
    return preg_split('/[\s,]+/', $tags, -1, PREG_SPLIT_NO_EMPTY);
}
459
/**
 * Generate a regex fragment (a lookahead) out of a tag.
 *
 * The fragment matches when the tag is present as a whole tag in the searched string,
 * i.e. surrounded by the configured tags separator or the string boundaries.
 *
 * @param string $tag tag to generate the regex from. May start with '-' to negate (negative
 *                    lookahead) and may contain '*' as a wildcard for any non-separator characters.
 *
 * @return string generated regex fragment, or an empty string if there is nothing to search
 *                (empty tag, '-' or '*').
 */
protected function tag2regex(string $tag): string
{
    if ($tag === '' || $tag === '-' || $tag === '*') {
        // nothing to search, return empty regex
        return '';
    }

    // The separator comes from the configuration and is meant literally: escape it, otherwise
    // a separator such as '|', '.' or '/' would alter or break the generated pattern.
    $separator = preg_quote((string) $this->conf->get('general.tags_separator', ' '), '/');

    $negated = $tag[0] === '-';
    $name = $negated ? substr($tag, 1) : $tag;

    // Split on the wildcard and escape every literal chunk to prevent regex conflicts.
    $literalChunks = array_map(
        static function (string $chunk): string {
            return preg_quote($chunk, '/');
        },
        explode('*', $name)
    );

    // Negated tags use a negative lookahead, regular tags a positive one.
    $regex = $negated ? '(?!' : '(?=';
    // before the tag may only be the separator or the beginning
    $regex .= '.*(?:^|' . $separator . ')';
    // every '*' becomes "any run of non-separator characters"
    $regex .= implode('[^' . $separator . ']*?', $literalChunks);
    // after the tag may only be the separator or the end
    $regex .= '(?:$|' . $separator . '))';

    return $regex;
}
509
/**
 * Finalize the content of the foundPositions array: every search result is attached to the
 * bookmark field it starts in, with positions made relative to that field, and results
 * overlapping an already kept result are dropped.
 *
 * @param array $fieldLengths   Start and end positions of every bookmark field in the aggregated bookmark
 *                              content, as [field => ['start' => int, 'end' => int]].
 * @param array $foundPositions Positions where the search results were found in the aggregated content,
 *                              as a list of ['start' => int, 'end' => int].
 *
 * @return array Found positions grouped by bookmark field, relative to the beginning of the field.
 */
protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
{
    // Sort results by starting position ASC. The spaceship operator returns 0 for equal
    // starts, which keeps the comparator consistent (the previous one never returned 0).
    usort($foundPositions, static function (array $entryA, array $entryB): int {
        return $entryA['start'] <=> $entryB['start'];
    });

    $out = [];
    $currentMax = -1;
    foreach ($foundPositions as $foundPosition) {
        // we do not allow overlapping highlights
        if ($foundPosition['start'] < $currentMax) {
            continue;
        }

        $currentMax = $foundPosition['end'];
        foreach ($fieldLengths as $part => $length) {
            // Not the field this result starts in: try the next one.
            if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) {
                continue;
            }

            // Translate aggregated-content positions into field-relative positions.
            $out[$part][] = [
                'start' => $foundPosition['start'] - $length['start'],
                'end' => $foundPosition['end'] - $length['start'],
            ];
            break;
        }
    }

    return $out;
}
551
/**
 * Concatenate link fields to search across fields. Adds a '\' separator after every field,
 * so that exact search terms cannot match across two fields.
 * Also populate the $lengths array with the starting and ending positions of every bookmark field
 * (title, description, url, tags) inside the concatenated content.
 *
 * Positions are measured on the lowercased text, i.e. in the same coordinates as the returned
 * content: lowercasing can change the number of characters of a string (full Unicode case
 * mapping), so measuring the original fields could shift the boundaries of the following fields.
 *
 * @param Bookmark $link
 * @param array    $lengths (by reference) filled as [field => ['start' => int, 'end' => int]].
 *
 * @return string Lowercase concatenated fields content.
 */
protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
{
    // Field order defines the layout of the concatenated content.
    $fields = [
        'title' => $link->getTitle(),
        'description' => $link->getDescription(),
        'url' => $link->getUrl(),
        'tags' => $link->getTagsString($this->conf->get('general.tags_separator', ' ')),
    ];

    $content = '';
    $offset = 0;
    foreach ($fields as $name => $value) {
        // NOTE(review): the cast guards against a getter returning null - confirm against Bookmark.
        $lowered = mb_convert_case((string) $value, MB_CASE_LOWER, 'UTF-8');
        $end = $offset + mb_strlen($lowered);

        $lengths[$name] = ['start' => $offset, 'end' => $end];
        $content .= $lowered . '\\';

        // Skip the one-character '\' separator.
        $offset = $end + 1;
    }

    return $content;
}
580 }