]> git.immae.eu Git - github/shaarli/Shaarli.git/blob - application/bookmark/BookmarkFilter.php
Apply PHP Code Beautifier on source code for linter automatic fixes
[github/shaarli/Shaarli.git] / application / bookmark / BookmarkFilter.php
1 <?php
2
3 declare(strict_types=1);
4
5 namespace Shaarli\Bookmark;
6
7 use Exception;
8 use Shaarli\Bookmark\Exception\BookmarkNotFoundException;
9 use Shaarli\Config\ConfigManager;
10
11 /**
12 * Class BookmarkFilter.
13 *
14 * Perform search and filter operation on link data list.
15 */
16 class BookmarkFilter
17 {
18 /**
19 * @var string Filter type: permalink lookup by small hash.
20 */
21 public static $FILTER_HASH = 'permalink';
22
23 /**
24 * @var string Filter type: full-text search.
25 */
26 public static $FILTER_TEXT = 'fulltext';
27
28 /**
29 * @var string Filter type: tag search.
30 */
31 public static $FILTER_TAG = 'tags';
32
33 /**
34 * @var string Filter type: bookmarks created on a given day.
35 */
36 public static $FILTER_DAY = 'FILTER_DAY';
37
38 /**
39 * @var string Filter type: no filter (only the visibility restriction is applied).
40 */
41 public static $DEFAULT = 'NO_FILTER';
42
43 /** @var string Visibility: all (private and public bookmarks) */
44 public static $ALL = 'all';
45
46 /** @var string Visibility: public bookmarks only */
47 public static $PUBLIC = 'public';
48
49 /** @var string Visibility: private bookmarks only */
50 public static $PRIVATE = 'private';
51
52 /**
53 * @var string Allowed characters for hashtags (regex character-class syntax, used when scanning descriptions).
54 */
55 public static $HASHTAG_CHARS = '\p{Pc}\p{N}\p{L}\p{Mn}';
56
57 /**
58 * @var Bookmark[] Bookmarks every filter method of this instance iterates over.
59 */
60 private $bookmarks;
61
62 /** @var ConfigManager Configuration; read here for the 'general.tags_separator' setting. */
63 protected $conf;
64
/**
 * Bind the filter to a bookmark collection and to the configuration.
 *
 * @param Bookmark[]    $bookmarks Bookmarks that the filter methods will search through.
 * @param ConfigManager $conf      Configuration manager (source of the tags separator).
 */
public function __construct($bookmarks, ConfigManager $conf)
{
    $this->conf = $conf;
    $this->bookmarks = $bookmarks;
}
73
/**
 * Entry point: dispatch a search request to the filter matching $type.
 *
 * An unrecognised visibility silently falls back to 'all'; an unrecognised
 * type only applies the visibility restriction.
 *
 * @param string $type          Type of filter (eg. tags, permalink, etc.).
 * @param mixed  $request       Filter content. For the combined tag + text type this is
 *                              indexed: [0] holds the tags, [1] the search terms.
 * @param bool   $casesensitive Optional: perform a case sensitive tag filter if true.
 * @param string $visibility    Optional: return only all/private/public bookmarks.
 * @param bool   $untaggedonly  Optional: return only untagged bookmarks.
 *                              Applies only if $type includes FILTER_TAG.
 *
 * @return Bookmark[] filtered bookmark list.
 *
 * @throws BookmarkNotFoundException
 */
public function filter(
    string $type,
    $request,
    bool $casesensitive = false,
    string $visibility = 'all',
    bool $untaggedonly = false
) {
    $knownVisibilities = ['all', 'public', 'private'];
    if (!in_array($visibility, $knownVisibilities)) {
        $visibility = 'all';
    }

    switch ($type) {
        case self::$FILTER_HASH:
            return $this->filterSmallHash($request);

        // Bitwise OR of the two type strings identifies the combined search.
        case self::$FILTER_TAG | self::$FILTER_TEXT: // == "vuotext"
            $hasTags = !empty($request) && !empty($request[0]);
            $hasText = !empty($request) && !empty($request[1]);

            if (!$hasTags && !$hasText) {
                // Nothing to search for: only untagged/visibility restrictions remain.
                return $untaggedonly
                    ? $this->filterUntagged($visibility)
                    : $this->noFilter($visibility);
            }

            // Each criterion narrows down the result of the previous one.
            $matches = $untaggedonly ? $this->filterUntagged($visibility) : $this->bookmarks;
            if ($hasTags) {
                $tagFilter = new BookmarkFilter($matches, $this->conf);
                $matches = $tagFilter->filterTags($request[0], $casesensitive, $visibility);
            }
            if ($hasText) {
                $textFilter = new BookmarkFilter($matches, $this->conf);
                $matches = $textFilter->filterFulltext($request[1], $visibility);
            }

            return $matches;

        case self::$FILTER_TEXT:
            return $this->filterFulltext($request, $visibility);

        case self::$FILTER_TAG:
            return $untaggedonly
                ? $this->filterUntagged($visibility)
                : $this->filterTags($request, $casesensitive, $visibility);

        case self::$FILTER_DAY:
            return $this->filterDay($request, $visibility);

        default:
            return $this->noFilter($visibility);
    }
}
139
/**
 * No search criterion: only restrict the bookmarks by visibility.
 *
 * @param string $visibility Optional: return only all/private/public bookmarks.
 *                           Any value other than 'all', 'private' or 'public' yields an empty list.
 *
 * @return Bookmark[] filtered bookmarks, original keys preserved.
 */
private function noFilter(string $visibility = 'all')
{
    if ($visibility === 'all') {
        return $this->bookmarks;
    }

    $kept = [];
    foreach ($this->bookmarks as $id => $bookmark) {
        // Visibility label this bookmark belongs to.
        $label = $bookmark->isPrivate() ? 'private' : 'public';
        if ($label === $visibility) {
            $kept[$id] = $bookmark;
        }
    }

    return $kept;
}
164
/**
 * Returns the shaare corresponding to a smallHash.
 *
 * The collection is scanned linearly and the first bookmark whose short URL
 * is identical to the hash is returned.
 *
 * @param string $smallHash permalink hash.
 *
 * @return Bookmark[] single-entry array (original key => bookmark) containing the permalink data.
 *
 * @throws BookmarkNotFoundException if the smallhash doesn't match any link.
 */
private function filterSmallHash(string $smallHash)
{
    foreach ($this->bookmarks as $key => $bookmark) {
        // Strict comparison on purpose: with `==`, numeric-looking hashes such as
        // "0e1234" and "0e5678" are compared as numbers (both 0) and an empty hash
        // equals a null short URL, so the wrong bookmark could be returned.
        if ($smallHash === $bookmark->getShortUrl()) {
            return [$key => $bookmark];
        }
    }

    throw new BookmarkNotFoundException();
}
185
/**
 * Returns the list of bookmarks corresponding to a full-text search.
 *
 * Search rules:
 *   - the title, description, URL and tags of every bookmark are searched;
 *   - matching is case-insensitive (query and content are lowercased with
 *     mb_convert_case() so that Unicode text is handled,
 *     see https://github.com/shaarli/Shaarli/issues/75);
 *   - terms surrounded by double quotes " are searched as exact phrases;
 *   - every remaining space separated term must be present (AND search);
 *   - terms starting with a dash - must be absent (except exact terms).
 *
 * Every matching bookmark additionally receives a 'search_highlight' entry
 * describing where the terms were found.
 *
 * Example:
 *    print_r($mydb->filterFulltext('hollandais'));
 *
 * @param string $searchterms search query.
 * @param string $visibility  Optional: return only all/private/public bookmarks.
 *
 * @return Bookmark[] search results.
 */
private function filterFulltext(string $searchterms, string $visibility = 'all')
{
    if (empty($searchterms)) {
        return $this->noFilter($visibility);
    }

    $query = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
    $quotedPattern = '/"([^"]+)"/';

    // Quoted phrases are looked up verbatim.
    preg_match_all($quotedPattern, $query, $quoted);
    $exactTerms = array_values(array_filter($quoted[1]));

    // Whatever is left once the quoted phrases are stripped is split on spaces.
    $unquoted = trim(preg_replace($quotedPattern, '', $query));
    $words = array_values(array_filter(explode(' ', $unquoted)));

    // A leading dash turns a word into an exclusion (a lone "-" stays a regular term).
    $requiredTerms = [];
    $excludedTerms = [];
    foreach ($words as $word) {
        if ($word[0] === '-' && strlen($word) > 1) {
            $excludedTerms[] = substr($word, 1);
        } else {
            $requiredTerms[] = $word;
        }
    }

    $results = [];
    foreach ($this->bookmarks as $id => $link) {
        // Honour the requested visibility.
        if ($visibility === 'private' && !$link->isPrivate()) {
            continue;
        }
        if ($visibility === 'public' && $link->isPrivate()) {
            continue;
        }

        $fieldBounds = [];
        $haystack = $this->buildFullTextSearchableLink($link, $fieldBounds);

        // All or nothing: exact phrases first, then plain terms; stop at the first miss.
        $isMatch = true;
        $hits = [];
        foreach ([$exactTerms, $requiredTerms] as $termGroup) {
            foreach ($termGroup as $term) {
                $position = mb_strpos($haystack, $term);
                if ($position === false) {
                    $isMatch = false;
                    break 2;
                }

                $hits[] = ['start' => $position, 'end' => $position + mb_strlen($term)];
            }
        }

        // A single excluded term being present rejects the bookmark.
        if ($isMatch) {
            foreach ($excludedTerms as $term) {
                if (strpos($haystack, $term) !== false) {
                    $isMatch = false;
                    break;
                }
            }
        }

        if (!$isMatch) {
            continue;
        }

        $link->addAdditionalContentEntry(
            'search_highlight',
            $this->postProcessFoundPositions($fieldBounds, $hits)
        );
        $results[$id] = $link;
    }

    return $results;
}
284
/**
 * Generate a regex fragment out of a tag.
 *
 * The fragment is a lookahead meant to be concatenated with others after a
 * leading '^': a positive lookahead for a regular tag, a negative one when
 * the tag starts with '-'. Inside the tag, '*' is a wildcard matching any run
 * of characters that are not the tags separator. The tag must be delimited by
 * the separator or by the beginning/end of the searched string.
 *
 * @param string $tag tag to generate the regex from. May start with '-' to negate, contain '*' as wildcard.
 *
 * @return string generated regex fragment, or an empty string when there is nothing to search
 *                (empty tag, lone '-' or lone '*').
 */
protected function tag2regex(string $tag): string
{
    // The separator comes from the configuration: escape it so that characters such as
    // '|', '.', '/' or ']' are matched literally instead of altering (or breaking) the regex.
    $tagsSeparator = preg_quote((string) $this->conf->get('general.tags_separator', ' '), '/');

    if ($tag === '' || $tag === '-' || $tag === '*') {
        // nothing to search, return empty regex
        return '';
    }

    $negated = $tag[0] === '-';
    $body = $negated ? substr($tag, 1) : $tag;

    // Every '*' becomes a lazy "anything but the separator"; the literal chunks
    // in between are escaped to prevent conflicts with regex syntax.
    $wildcard = '[^' . $tagsSeparator . ']*?';
    $literals = array_map(
        static function (string $chunk): string {
            return preg_quote($chunk, '/');
        },
        explode('*', $body)
    );

    return ($negated ? '(?!' : '(?=')
        // before the tag may only be the separator or the beginning
        . '.*(?:^|' . $tagsSeparator . ')'
        . implode($wildcard, $literals)
        // after the tag may only be the separator or the end
        . '(?:$|' . $tagsSeparator . '))';
}
334
/**
 * Returns the list of bookmarks associated with a given list of tags.
 *
 * You can specify one or more tags, e.g.
 *    print_r($mydb->filterTags('linux programming'));
 *
 * A bookmark matches when all the requested tags are satisfied, either by its
 * own tags or by #hashtags found in its description. Requested tags may be
 * negated with a leading '-' and may contain '*' wildcards (see tag2regex()).
 * With public visibility, requested tags starting with '.' are dropped; if
 * none remain, nothing is returned.
 *
 * Matching is done in Unicode mode so that case-insensitive searches and
 * hashtag detection work with non-ASCII text; content that is not valid
 * UTF-8 falls back to byte-wise matching.
 *
 * @param string|array $tags          list of tags, as an array or as a string split on the configured separator.
 * @param bool         $casesensitive ignore case if false.
 * @param string       $visibility    Optional: return only all/private/public bookmarks.
 *
 * @return Bookmark[] filtered bookmarks.
 */
public function filterTags($tags, bool $casesensitive = false, string $visibility = 'all')
{
    $tagsSeparator = $this->conf->get('general.tags_separator', ' ');
    // get single tags (we may get passed an array, even though the docs say different)
    $inputTags = is_array($tags) ? $tags : tags_str2array($tags, $tagsSeparator);

    if (count($inputTags) === 0) {
        // no input tags
        return $this->noFilter($visibility);
    }

    // If we only have public visibility, we can't look for hidden tags
    if ($visibility === self::$PUBLIC) {
        $inputTags = array_values(array_filter($inputTags, function ($tag) {
            return ! startsWith($tag, '.');
        }));

        if (empty($inputTags)) {
            return [];
        }
    }

    // build regex from all tags
    $re = '/^' . implode(array_map([$this, 'tag2regex'], $inputTags)) . '.*$/';
    if (!$casesensitive) {
        // make regex case insensitive
        $re .= 'i';
    }
    // Without the 'u' modifier PCRE works on bytes: case folding is limited to ASCII and
    // multibyte letters are not recognised. Unicode mode requires a valid UTF-8 pattern.
    $reUnicode = mb_check_encoding($re, 'UTF-8') ? $re . 'u' : null;

    $hashtagRe = '/(?<![' . self::$HASHTAG_CHARS . '])#([' . self::$HASHTAG_CHARS . ']+?)\b/sm';

    // create resulting array
    $filtered = [];

    // iterate over each link
    foreach ($this->bookmarks as $key => $link) {
        // check level of visibility
        if ($visibility === 'private' && !$link->isPrivate()) {
            continue;
        }
        if ($visibility === 'public' && $link->isPrivate()) {
            continue;
        }

        // build search string, start with tags of current link
        $search = $link->getTagsString($tagsSeparator);
        $description = $link->getDescription();
        if (strlen(trim($description)) && strpos($description, '#') !== false) {
            // description given and at least one possible tag found:
            // find all tags in the form of #tag in the description
            $descTags = [];
            if (preg_match_all($hashtagRe . 'u', $description, $descTags) === false) {
                // description is not valid UTF-8: scan it byte-wise instead
                $descTags = [];
                preg_match_all($hashtagRe, $description, $descTags);
            }
            if (!empty($descTags[1])) {
                // there were some tags in the description, add them to the search string
                $search .= $tagsSeparator . tags_array2str($descTags[1], $tagsSeparator);
            }
        }

        // match regular expression with search string
        $matched = $reUnicode !== null ? preg_match($reUnicode, $search) : false;
        if ($matched === false) {
            // pattern or subject is not valid UTF-8: byte-wise matching
            $matched = preg_match($re, $search);
        }
        if (!$matched) {
            // this entry does _not_ match our regex
            continue;
        }
        $filtered[$key] = $link;
    }

    return $filtered;
}
419
/**
 * Keep only the bookmarks that carry no tag at all.
 *
 * @param string $visibility return only all/private/public bookmarks.
 *
 * @return Bookmark[] untagged bookmarks, original keys preserved.
 */
public function filterUntagged(string $visibility)
{
    $untagged = [];
    foreach ($this->bookmarks as $id => $bookmark) {
        // Honour the requested visibility.
        if ($visibility === 'private' && !$bookmark->isPrivate()) {
            continue;
        }
        if ($visibility === 'public' && $bookmark->isPrivate()) {
            continue;
        }

        if (!empty($bookmark->getTags())) {
            continue;
        }

        $untagged[$id] = $bookmark;
    }

    return $untagged;
}
446
/**
 * Returns the list of articles for a given day.
 *
 * Day must be in the form 'YYYYMMDD' (e.g. '20120125'), e.g.
 *    print_r($mydb->filterDay('20120125'));
 *
 * @param string $day        day to filter.
 * @param string $visibility return only all/private/public bookmarks.
 *
 * @return Bookmark[] all bookmarks created on the given day, in reverse iteration order
 *                    (original keys preserved).
 *
 * @throws Exception if date format is invalid.
 */
public function filterDay(string $day, string $visibility)
{
    if (!checkDateFormat('Ymd', $day)) {
        throw new Exception('Invalid date format');
    }

    $filtered = [];
    foreach ($this->bookmarks as $key => $bookmark) {
        // Honour both restricted visibilities, like the other filters do:
        // previously 'private' was ignored and public bookmarks were returned too.
        if ($visibility === static::$PUBLIC && $bookmark->isPrivate()) {
            continue;
        }
        if ($visibility === static::$PRIVATE && !$bookmark->isPrivate()) {
            continue;
        }

        if ($bookmark->getCreated()->format('Ymd') == $day) {
            $filtered[$key] = $bookmark;
        }
    }

    // sort by date ASC
    // NOTE(review): this only reverses the iteration order, so it presumably relies on
    // the bookmarks being stored newest first; confirm against the caller.
    return array_reverse($filtered, true);
}
480
/**
 * Split a tag list given as a string into an array of tags.
 *
 * Commas and any run of whitespace are both accepted as separators; empty
 * entries are discarded. Unless the search is case sensitive, the tags are
 * lowercased first.
 *
 * @param string $tags          string containing a list of tags.
 * @param bool   $casesensitive will convert everything to lowercase if false.
 *
 * @return string[] list of tags.
 */
public static function tagsStrToArray(string $tags, bool $casesensitive): array
{
    if (!$casesensitive) {
        // We use UTF-8 conversion to handle various graphemes (i.e. cyrillic, or greek)
        $tags = mb_convert_case($tags, MB_CASE_LOWER, 'UTF-8');
    }

    $spaceSeparated = str_replace(',', ' ', $tags);

    return preg_split('/\s+/', $spaceSeparated, -1, PREG_SPLIT_NO_EMPTY);
}
499
/**
 * Finalize the content of the foundPositions array: every search result is
 * attached to the bookmark field it starts in, with positions made relative
 * to that field, and overlapping results are dropped.
 *
 * @param array $fieldLengths   Start and end positions of every bookmark field in the aggregated bookmark content
 *                              (field name => ['start' => int, 'end' => int]).
 * @param array $foundPositions Positions where the search results were found in the aggregated content
 *                              (list of ['start' => int, 'end' => int]).
 *
 * @return array Kept positions grouped by bookmark field name.
 */
protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
{
    // Sort results by starting position ASC. The comparator must return 0 for equal
    // entries (the previous one never did), and ties are broken longest match first so
    // that the result kept by the overlap check below does not depend on the input order.
    usort($foundPositions, static function (array $entryA, array $entryB): int {
        return [$entryA['start'], $entryB['end']] <=> [$entryB['start'], $entryA['end']];
    });

    $out = [];
    $currentMax = -1;
    foreach ($foundPositions as $foundPosition) {
        // we do not allow overlapping highlights
        if ($foundPosition['start'] < $currentMax) {
            continue;
        }

        $currentMax = $foundPosition['end'];
        foreach ($fieldLengths as $part => $length) {
            $startsInField = $foundPosition['start'] >= $length['start']
                && $foundPosition['start'] <= $length['end'];
            if (!$startsInField) {
                continue;
            }

            // Translate absolute positions into positions relative to the field.
            $out[$part][] = [
                'start' => $foundPosition['start'] - $length['start'],
                'end' => $foundPosition['end'] - $length['start'],
            ];
            break;
        }
    }

    return $out;
}
541
/**
 * Build the lowercase text a full-text search runs against: title, description,
 * URL and tags of the bookmark, each one followed by a '\' separator so that
 * exact search terms can not match across two fields.
 *
 * $lengths is filled with the start and end position of every field inside
 * the returned string, keyed by field name (title, description, url, tags).
 *
 * @param Bookmark $link
 * @param array    $lengths (by reference)
 *
 * @return string Lowercase concatenated fields content.
 */
protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
{
    $fields = [
        'title' => $link->getTitle(),
        'description' => $link->getDescription(),
        'url' => $link->getUrl(),
        'tags' => $link->getTagsString($this->conf->get('general.tags_separator', ' ')),
    ];

    $content = '';
    $offset = 0;
    foreach ($fields as $name => $value) {
        $content .= mb_convert_case($value, MB_CASE_LOWER, 'UTF-8') . '\\';

        $end = $offset + mb_strlen($value);
        $lengths[$name] = ['start' => $offset, 'end' => $end];

        // Skip the separator character before the next field.
        $offset = $end + 1;
    }

    return $content;
}
570 }