aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/bookmark/BookmarkFilter.php
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2020-10-12 11:35:55 +0200
committerArthurHoaro <arthur@hoa.ro>2020-10-16 20:31:12 +0200
commit4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b (patch)
tree4deb157f03ce7d5402dbfeb65743951d97e527cf /application/bookmark/BookmarkFilter.php
parent64cac2562661c55f679dba5a7c308e7764f430b5 (diff)
downloadShaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.gz
Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.zst
Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.zip
Feature: highlight fulltext search results
How it works: 1. when a fulltext search is made, Shaarli looks for the first occurence position of every term matching the search. No change here, but we store these positions in an array, in Bookmark's additionalContent. 2. when formatting bookmarks (through BookmarkFormatter implementation): 1. first we insert specific tokens at every search result positions 2. we format the content (escape HTML, apply markdown, etc.) 3. as a last step, we replace our token with displayable span elements Cons: this tightens coupling between search filters and formatters Pros: it was absolutely necessary not to perform the search twice. this solution has close to no impact on performances. Fixes #205
Diffstat (limited to 'application/bookmark/BookmarkFilter.php')
-rw-r--r--application/bookmark/BookmarkFilter.php111
1 files changed, 93 insertions, 18 deletions
diff --git a/application/bookmark/BookmarkFilter.php b/application/bookmark/BookmarkFilter.php
index 4232f114..c79386ea 100644
--- a/application/bookmark/BookmarkFilter.php
+++ b/application/bookmark/BookmarkFilter.php
@@ -201,7 +201,7 @@ class BookmarkFilter
201 return $this->noFilter($visibility); 201 return $this->noFilter($visibility);
202 } 202 }
203 203
204 $filtered = array(); 204 $filtered = [];
205 $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); 205 $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
206 $exactRegex = '/"([^"]+)"/'; 206 $exactRegex = '/"([^"]+)"/';
207 // Retrieve exact search terms. 207 // Retrieve exact search terms.
@@ -213,8 +213,8 @@ class BookmarkFilter
213 $explodedSearchAnd = array_values(array_filter($explodedSearchAnd)); 213 $explodedSearchAnd = array_values(array_filter($explodedSearchAnd));
214 214
215 // Filter excluding terms and update andSearch. 215 // Filter excluding terms and update andSearch.
216 $excludeSearch = array(); 216 $excludeSearch = [];
217 $andSearch = array(); 217 $andSearch = [];
218 foreach ($explodedSearchAnd as $needle) { 218 foreach ($explodedSearchAnd as $needle) {
219 if ($needle[0] == '-' && strlen($needle) > 1) { 219 if ($needle[0] == '-' && strlen($needle) > 1) {
220 $excludeSearch[] = substr($needle, 1); 220 $excludeSearch[] = substr($needle, 1);
@@ -234,33 +234,38 @@ class BookmarkFilter
234 } 234 }
235 } 235 }
236 236
237 // Concatenate link fields to search across fields. 237 $lengths = [];
238 // Adds a '\' separator for exact search terms. 238 $content = $this->buildFullTextSearchableLink($link, $lengths);
239 $content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\';
240 $content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\';
241 $content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\';
242 $content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\';
243 239
244 // Be optimistic 240 // Be optimistic
245 $found = true; 241 $found = true;
242 $foundPositions = [];
246 243
247 // First, we look for exact term search 244 // First, we look for exact term search
248 for ($i = 0; $i < count($exactSearch) && $found; $i++) { 245 // Then iterate over keywords, if keyword is not found,
249 $found = strpos($content, $exactSearch[$i]) !== false;
250 }
251
252 // Iterate over keywords, if keyword is not found,
253 // no need to check for the others. We want all or nothing. 246 // no need to check for the others. We want all or nothing.
254 for ($i = 0; $i < count($andSearch) && $found; $i++) { 247 foreach ([$exactSearch, $andSearch] as $search) {
255 $found = strpos($content, $andSearch[$i]) !== false; 248 for ($i = 0; $i < count($search) && $found !== false; $i++) {
249 $found = mb_strpos($content, $search[$i]);
250 if ($found === false) {
251 break;
252 }
253
254 $foundPositions[] = ['start' => $found, 'end' => $found + mb_strlen($search[$i])];
255 }
256 } 256 }
257 257
258 // Exclude terms. 258 // Exclude terms.
259 for ($i = 0; $i < count($excludeSearch) && $found; $i++) { 259 for ($i = 0; $i < count($excludeSearch) && $found !== false; $i++) {
260 $found = strpos($content, $excludeSearch[$i]) === false; 260 $found = strpos($content, $excludeSearch[$i]) === false;
261 } 261 }
262 262
263 if ($found) { 263 if ($found !== false) {
264 $link->addAdditionalContentEntry(
265 'search_highlight',
266 $this->postProcessFoundPositions($lengths, $foundPositions)
267 );
268
264 $filtered[$id] = $link; 269 $filtered[$id] = $link;
265 } 270 }
266 } 271 }
@@ -477,4 +482,74 @@ class BookmarkFilter
477 482
478 return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY); 483 return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY);
479 } 484 }
485
486 /**
487 * This method finalize the content of the foundPositions array,
488 * by associated all search results to their associated bookmark field,
489 * making sure that there is no overlapping results, etc.
490 *
491 * @param array $fieldLengths Start and end positions of every bookmark fields in the aggregated bookmark content.
492 * @param array $foundPositions Positions where the search results were found in the aggregated content.
493 *
494 * @return array Updated $foundPositions, by bookmark field.
495 */
496 protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
497 {
498 // Sort results by starting position ASC.
499 usort($foundPositions, function (array $entryA, array $entryB): int {
500 return $entryA['start'] > $entryB['start'] ? 1 : -1;
501 });
502
503 $out = [];
504 $currentMax = -1;
505 foreach ($foundPositions as $foundPosition) {
506 // we do not allow overlapping highlights
507 if ($foundPosition['start'] < $currentMax) {
508 continue;
509 }
510
511 $currentMax = $foundPosition['end'];
512 foreach ($fieldLengths as $part => $length) {
513 if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) {
514 continue;
515 }
516
517 $out[$part][] = [
518 'start' => $foundPosition['start'] - $length['start'],
519 'end' => $foundPosition['end'] - $length['start'],
520 ];
521 break;
522 }
523 }
524
525 return $out;
526 }
527
528 /**
529 * Concatenate link fields to search across fields. Adds a '\' separator for exact search terms.
530 * Also populate $length array with starting and ending positions of every bookmark field
531 * inside concatenated content.
532 *
533 * @param Bookmark $link
534 * @param array $lengths (by reference)
535 *
536 * @return string Lowercase concatenated fields content.
537 */
538 protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
539 {
540 $content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\';
541 $content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\';
542 $content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\';
543 $content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\';
544
545 $lengths['title'] = ['start' => 0, 'end' => mb_strlen($link->getTitle())];
546 $nextField = $lengths['title']['end'] + 1;
547 $lengths['description'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getDescription())];
548 $nextField = $lengths['description']['end'] + 1;
549 $lengths['url'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getUrl())];
550 $nextField = $lengths['url']['end'] + 1;
551 $lengths['tags'] = ['start' => $nextField, 'end' => $nextField + mb_strlen($link->getTagsString())];
552
553 return $content;
554 }
480} 555}