diff options
author | ArthurHoaro <arthur@hoa.ro> | 2020-10-12 11:35:55 +0200 |
---|---|---|
committer | ArthurHoaro <arthur@hoa.ro> | 2020-10-16 20:31:12 +0200 |
commit | 4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b (patch) | |
tree | 4deb157f03ce7d5402dbfeb65743951d97e527cf /application/bookmark/BookmarkFilter.php | |
parent | 64cac2562661c55f679dba5a7c308e7764f430b5 (diff) | |
download | Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.gz Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.zst Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.zip |
Feature: highlight fulltext search results
How it works:
1. when a fulltext search is made, Shaarli looks for the first
occurrence position of every term matching the search. No change here,
but we store these positions in an array, in Bookmark's additionalContent.
2. when formatting bookmarks (through BookmarkFormatter
implementation):
1. first we insert specific tokens at every search result position
2. we format the content (escape HTML, apply markdown, etc.)
3. as a last step, we replace our token with displayable span
elements
Cons: this tightens coupling between search filters and formatters
Pros: it was absolutely necessary not to perform the
search twice. This solution has close to no impact on performance.
Fixes #205
Diffstat (limited to 'application/bookmark/BookmarkFilter.php')
-rw-r--r-- | application/bookmark/BookmarkFilter.php | 111 |
1 files changed, 93 insertions, 18 deletions
diff --git a/application/bookmark/BookmarkFilter.php b/application/bookmark/BookmarkFilter.php index 4232f114..c79386ea 100644 --- a/application/bookmark/BookmarkFilter.php +++ b/application/bookmark/BookmarkFilter.php | |||
@@ -201,7 +201,7 @@ class BookmarkFilter | |||
201 | return $this->noFilter($visibility); | 201 | return $this->noFilter($visibility); |
202 | } | 202 | } |
203 | 203 | ||
204 | $filtered = array(); | 204 | $filtered = []; |
205 | $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); | 205 | $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); |
206 | $exactRegex = '/"([^"]+)"/'; | 206 | $exactRegex = '/"([^"]+)"/'; |
207 | // Retrieve exact search terms. | 207 | // Retrieve exact search terms. |
@@ -213,8 +213,8 @@ class BookmarkFilter | |||
213 | $explodedSearchAnd = array_values(array_filter($explodedSearchAnd)); | 213 | $explodedSearchAnd = array_values(array_filter($explodedSearchAnd)); |
214 | 214 | ||
215 | // Filter excluding terms and update andSearch. | 215 | // Filter excluding terms and update andSearch. |
216 | $excludeSearch = array(); | 216 | $excludeSearch = []; |
217 | $andSearch = array(); | 217 | $andSearch = []; |
218 | foreach ($explodedSearchAnd as $needle) { | 218 | foreach ($explodedSearchAnd as $needle) { |
219 | if ($needle[0] == '-' && strlen($needle) > 1) { | 219 | if ($needle[0] == '-' && strlen($needle) > 1) { |
220 | $excludeSearch[] = substr($needle, 1); | 220 | $excludeSearch[] = substr($needle, 1); |
@@ -234,33 +234,38 @@ class BookmarkFilter | |||
234 | } | 234 | } |
235 | } | 235 | } |
236 | 236 | ||
237 | // Concatenate link fields to search across fields. | 237 | $lengths = []; |
238 | // Adds a '\' separator for exact search terms. | 238 | $content = $this->buildFullTextSearchableLink($link, $lengths); |
239 | $content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
240 | $content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
241 | $content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
242 | $content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
243 | 239 | ||
244 | // Be optimistic | 240 | // Be optimistic |
245 | $found = true; | 241 | $found = true; |
242 | $foundPositions = []; | ||
246 | 243 | ||
247 | // First, we look for exact term search | 244 | // First, we look for exact term search |
248 | for ($i = 0; $i < count($exactSearch) && $found; $i++) { | 245 | // Then iterate over keywords, if keyword is not found, |
249 | $found = strpos($content, $exactSearch[$i]) !== false; | ||
250 | } | ||
251 | |||
252 | // Iterate over keywords, if keyword is not found, | ||
253 | // no need to check for the others. We want all or nothing. | 246 | // no need to check for the others. We want all or nothing. |
254 | for ($i = 0; $i < count($andSearch) && $found; $i++) { | 247 | foreach ([$exactSearch, $andSearch] as $search) { |
255 | $found = strpos($content, $andSearch[$i]) !== false; | 248 | for ($i = 0; $i < count($search) && $found !== false; $i++) { |
249 | $found = mb_strpos($content, $search[$i]); | ||
250 | if ($found === false) { | ||
251 | break; | ||
252 | } | ||
253 | |||
254 | $foundPositions[] = ['start' => $found, 'end' => $found + mb_strlen($search[$i])]; | ||
255 | } | ||
256 | } | 256 | } |
257 | 257 | ||
258 | // Exclude terms. | 258 | // Exclude terms. |
259 | for ($i = 0; $i < count($excludeSearch) && $found; $i++) { | 259 | for ($i = 0; $i < count($excludeSearch) && $found !== false; $i++) { |
260 | $found = strpos($content, $excludeSearch[$i]) === false; | 260 | $found = strpos($content, $excludeSearch[$i]) === false; |
261 | } | 261 | } |
262 | 262 | ||
263 | if ($found) { | 263 | if ($found !== false) { |
264 | $link->addAdditionalContentEntry( | ||
265 | 'search_highlight', | ||
266 | $this->postProcessFoundPositions($lengths, $foundPositions) | ||
267 | ); | ||
268 | |||
264 | $filtered[$id] = $link; | 269 | $filtered[$id] = $link; |
265 | } | 270 | } |
266 | } | 271 | } |
@@ -477,4 +482,74 @@ class BookmarkFilter | |||
477 | 482 | ||
478 | return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY); | 483 | return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY); |
479 | } | 484 | } |
485 | |||
486 | /** | ||
487 | * This method finalize the content of the foundPositions array, | ||
488 | * by associated all search results to their associated bookmark field, | ||
489 | * making sure that there is no overlapping results, etc. | ||
490 | * | ||
491 | * @param array $fieldLengths Start and end positions of every bookmark fields in the aggregated bookmark content. | ||
492 | * @param array $foundPositions Positions where the search results were found in the aggregated content. | ||
493 | * | ||
494 | * @return array Updated $foundPositions, by bookmark field. | ||
495 | */ | ||
496 | protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array | ||
497 | { | ||
498 | // Sort results by starting position ASC. | ||
499 | usort($foundPositions, function (array $entryA, array $entryB): int { | ||
500 | return $entryA['start'] > $entryB['start'] ? 1 : -1; | ||
501 | }); | ||
502 | |||
503 | $out = []; | ||
504 | $currentMax = -1; | ||
505 | foreach ($foundPositions as $foundPosition) { | ||
506 | // we do not allow overlapping highlights | ||
507 | if ($foundPosition['start'] < $currentMax) { | ||
508 | continue; | ||
509 | } | ||
510 | |||
511 | $currentMax = $foundPosition['end']; | ||
512 | foreach ($fieldLengths as $part => $length) { | ||
513 | if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) { | ||
514 | continue; | ||
515 | } | ||
516 | |||
517 | $out[$part][] = [ | ||
518 | 'start' => $foundPosition['start'] - $length['start'], | ||
519 | 'end' => $foundPosition['end'] - $length['start'], | ||
520 | ]; | ||
521 | break; | ||
522 | } | ||
523 | } | ||
524 | |||
525 | return $out; | ||
526 | } | ||
527 | |||
528 | /** | ||
529 | * Concatenate link fields to search across fields. Adds a '\' separator for exact search terms. | ||
530 | * Also populate $length array with starting and ending positions of every bookmark field | ||
531 | * inside concatenated content. | ||
532 | * | ||
533 | * @param Bookmark $link | ||
534 | * @param array $lengths (by reference) | ||
535 | * | ||
536 | * @return string Lowercase concatenated fields content. | ||
537 | */ | ||
538 | protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string | ||
539 | { | ||
540 | $content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
541 | $content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
542 | $content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
543 | $content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\'; | ||
544 | |||
545 | $lengths['title'] = ['start' => 0, 'end' => mb_strlen($link->getTitle())]; | ||
546 | $nextField = $lengths['title']['end'] + 1; | ||
547 | $lengths['description'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getDescription())]; | ||
548 | $nextField = $lengths['description']['end'] + 1; | ||
549 | $lengths['url'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getUrl())]; | ||
550 | $nextField = $lengths['url']['end'] + 1; | ||
551 | $lengths['tags'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getTagsString())]; | ||
552 | |||
553 | return $content; | ||
554 | } | ||
480 | } | 555 | } |