aboutsummaryrefslogtreecommitdiffhomepage
path: root/application/bookmark
diff options
context:
space:
mode:
authorArthurHoaro <arthur@hoa.ro>2020-10-12 11:35:55 +0200
committerArthurHoaro <arthur@hoa.ro>2020-10-16 20:31:12 +0200
commit4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b (patch)
tree4deb157f03ce7d5402dbfeb65743951d97e527cf /application/bookmark
parent64cac2562661c55f679dba5a7c308e7764f430b5 (diff)
downloadShaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.gz
Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.tar.zst
Shaarli-4e3875c0ce7f3b17e3d358dc5ecb1f8bed64546b.zip
Feature: highlight fulltext search results
How it works: 1. when a fulltext search is made, Shaarli looks for the position of the first occurrence of every term matching the search. No change here, but we store these positions in an array, in Bookmark's additionalContent. 2. when formatting bookmarks (through BookmarkFormatter implementation): 2.1. first we insert specific tokens at every search result position; 2.2. we format the content (escape HTML, apply markdown, etc.); 2.3. as a last step, we replace our tokens with displayable span elements. Cons: this tightens coupling between search filters and formatters. Pros: it was absolutely necessary not to perform the search twice. This solution has close to no impact on performance. Fixes #205
Diffstat (limited to 'application/bookmark')
-rw-r--r--application/bookmark/Bookmark.php46
-rw-r--r--application/bookmark/BookmarkFilter.php111
2 files changed, 139 insertions, 18 deletions
diff --git a/application/bookmark/Bookmark.php b/application/bookmark/Bookmark.php
index fa45d2fc..ea565d1f 100644
--- a/application/bookmark/Bookmark.php
+++ b/application/bookmark/Bookmark.php
@@ -54,6 +54,9 @@ class Bookmark
54 /** @var bool True if the bookmark can only be seen while logged in */ 54 /** @var bool True if the bookmark can only be seen while logged in */
55 protected $private; 55 protected $private;
56 56
57 /** @var mixed[] Available to store any additional content for a bookmark. Currently used for search highlight. */
58 protected $additionalContent = [];
59
57 /** 60 /**
58 * Initialize a link from array data. Especially useful to create a Bookmark from former link storage format. 61 * Initialize a link from array data. Especially useful to create a Bookmark from former link storage format.
59 * 62 *
@@ -95,6 +98,8 @@ class Bookmark
95 * - the URL with the permalink 98 * - the URL with the permalink
96 * - the title with the URL 99 * - the title with the URL
97 * 100 *
101 * Also make sure that we do not save search highlights in the datastore.
102 *
98 * @throws InvalidBookmarkException 103 * @throws InvalidBookmarkException
99 */ 104 */
100 public function validate(): void 105 public function validate(): void
@@ -112,6 +117,9 @@ class Bookmark
112 if (empty($this->title)) { 117 if (empty($this->title)) {
113 $this->title = $this->url; 118 $this->title = $this->url;
114 } 119 }
120 if (array_key_exists('search_highlight', $this->additionalContent)) {
121 unset($this->additionalContent['search_highlight']);
122 }
115 } 123 }
116 124
117 /** 125 /**
@@ -436,6 +444,44 @@ class Bookmark
436 } 444 }
437 445
438 /** 446 /**
447 * Get entire additionalContent array.
448 *
449 * @return mixed[]
450 */
public function getAdditionalContent(): array
{
    // PHP arrays have value semantics: the caller gets its own copy of the
    // array, so modifying the returned array does not alter this bookmark.
    return $this->additionalContent;
}
455
/**
 * Store one value in additionalContent under the given key.
 * An entry already registered under the same key is overwritten.
 *
 * @param string     $key   Name of the entry to write.
 * @param mixed|null $value Value to store; no type restriction is applied.
 *
 * @return $this Current instance, so that calls can be chained.
 */
public function addAdditionalContentEntry(string $key, $value): self
{
    $this->additionalContent[$key] = $value;

    return $this;
}
470
/**
 * Read one value from additionalContent.
 *
 * @param string     $key     Name of the entry to read.
 * @param mixed|null $default Value returned when no entry exists under $key.
 *
 * @return mixed|null The stored value (which may itself be null), or $default if the key is absent.
 */
public function getAdditionalContentEntry(string $key, $default = null)
{
    // array_key_exists() is used on purpose instead of isset()/??:
    // an entry explicitly set to null must be returned as null, not replaced by $default.
    if (!array_key_exists($key, $this->additionalContent)) {
        return $default;
    }

    return $this->additionalContent[$key];
}
483
484 /**
439 * Rename a tag in tags list. 485 * Rename a tag in tags list.
440 * 486 *
441 * @param string $fromTag 487 * @param string $fromTag
diff --git a/application/bookmark/BookmarkFilter.php b/application/bookmark/BookmarkFilter.php
index 4232f114..c79386ea 100644
--- a/application/bookmark/BookmarkFilter.php
+++ b/application/bookmark/BookmarkFilter.php
@@ -201,7 +201,7 @@ class BookmarkFilter
201 return $this->noFilter($visibility); 201 return $this->noFilter($visibility);
202 } 202 }
203 203
204 $filtered = array(); 204 $filtered = [];
205 $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8'); 205 $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
206 $exactRegex = '/"([^"]+)"/'; 206 $exactRegex = '/"([^"]+)"/';
207 // Retrieve exact search terms. 207 // Retrieve exact search terms.
@@ -213,8 +213,8 @@ class BookmarkFilter
213 $explodedSearchAnd = array_values(array_filter($explodedSearchAnd)); 213 $explodedSearchAnd = array_values(array_filter($explodedSearchAnd));
214 214
215 // Filter excluding terms and update andSearch. 215 // Filter excluding terms and update andSearch.
216 $excludeSearch = array(); 216 $excludeSearch = [];
217 $andSearch = array(); 217 $andSearch = [];
218 foreach ($explodedSearchAnd as $needle) { 218 foreach ($explodedSearchAnd as $needle) {
219 if ($needle[0] == '-' && strlen($needle) > 1) { 219 if ($needle[0] == '-' && strlen($needle) > 1) {
220 $excludeSearch[] = substr($needle, 1); 220 $excludeSearch[] = substr($needle, 1);
@@ -234,33 +234,38 @@ class BookmarkFilter
234 } 234 }
235 } 235 }
236 236
237 // Concatenate link fields to search across fields. 237 $lengths = [];
238 // Adds a '\' separator for exact search terms. 238 $content = $this->buildFullTextSearchableLink($link, $lengths);
239 $content = mb_convert_case($link->getTitle(), MB_CASE_LOWER, 'UTF-8') .'\\';
240 $content .= mb_convert_case($link->getDescription(), MB_CASE_LOWER, 'UTF-8') .'\\';
241 $content .= mb_convert_case($link->getUrl(), MB_CASE_LOWER, 'UTF-8') .'\\';
242 $content .= mb_convert_case($link->getTagsString(), MB_CASE_LOWER, 'UTF-8') .'\\';
243 239
244 // Be optimistic 240 // Be optimistic
245 $found = true; 241 $found = true;
242 $foundPositions = [];
246 243
247 // First, we look for exact term search 244 // First, we look for exact term search
248 for ($i = 0; $i < count($exactSearch) && $found; $i++) { 245 // Then iterate over keywords, if keyword is not found,
249 $found = strpos($content, $exactSearch[$i]) !== false;
250 }
251
252 // Iterate over keywords, if keyword is not found,
253 // no need to check for the others. We want all or nothing. 246 // no need to check for the others. We want all or nothing.
254 for ($i = 0; $i < count($andSearch) && $found; $i++) { 247 foreach ([$exactSearch, $andSearch] as $search) {
255 $found = strpos($content, $andSearch[$i]) !== false; 248 for ($i = 0; $i < count($search) && $found !== false; $i++) {
249 $found = mb_strpos($content, $search[$i]);
250 if ($found === false) {
251 break;
252 }
253
254 $foundPositions[] = ['start' => $found, 'end' => $found + mb_strlen($search[$i])];
255 }
256 } 256 }
257 257
258 // Exclude terms. 258 // Exclude terms.
259 for ($i = 0; $i < count($excludeSearch) && $found; $i++) { 259 for ($i = 0; $i < count($excludeSearch) && $found !== false; $i++) {
260 $found = strpos($content, $excludeSearch[$i]) === false; 260 $found = strpos($content, $excludeSearch[$i]) === false;
261 } 261 }
262 262
263 if ($found) { 263 if ($found !== false) {
264 $link->addAdditionalContentEntry(
265 'search_highlight',
266 $this->postProcessFoundPositions($lengths, $foundPositions)
267 );
268
264 $filtered[$id] = $link; 269 $filtered[$id] = $link;
265 } 270 }
266 } 271 }
@@ -477,4 +482,74 @@ class BookmarkFilter
477 482
478 return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY); 483 return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY);
479 } 484 }
485
/**
 * Finalize the content of the foundPositions array by associating every search result
 * with the bookmark field it was found in, and by discarding overlapping results.
 *
 * Positions in $foundPositions are relative to the aggregated (concatenated) content;
 * positions in the returned array are relative to the start of their own field.
 *
 * @param array $fieldLengths   Start and end positions of every bookmark field in the aggregated content,
 *                              as [fieldName => ['start' => int, 'end' => int]].
 * @param array $foundPositions Positions where the search results were found in the aggregated content,
 *                              as a list of ['start' => int, 'end' => int].
 *
 * @return array Found positions grouped by bookmark field name: [fieldName => [['start' => int, 'end' => int], ...]].
 *               Fields without any result are absent from the array.
 */
protected function postProcessFoundPositions(array $fieldLengths, array $foundPositions): array
{
    // Sort results by starting position ASC. When two results start at the same position,
    // the longest one comes first so that it is the one kept by the overlap check below.
    // The comparator returns 0 for identical entries, which keeps the ordering well defined.
    usort($foundPositions, static function (array $entryA, array $entryB): int {
        return ($entryA['start'] <=> $entryB['start']) ?: ($entryB['end'] <=> $entryA['end']);
    });

    $out = [];
    // End position of the last accepted highlight; -1 means nothing accepted yet.
    $currentMax = -1;
    foreach ($foundPositions as $foundPosition) {
        // We do not allow overlapping highlights: skip any result starting before the previous one ends.
        if ($foundPosition['start'] < $currentMax) {
            continue;
        }

        $currentMax = $foundPosition['end'];
        foreach ($fieldLengths as $part => $length) {
            // A result belongs to the first field whose [start, end] range contains its starting position.
            if ($foundPosition['start'] < $length['start'] || $foundPosition['start'] > $length['end']) {
                continue;
            }

            // Convert positions from the aggregated content to positions inside the field.
            $out[$part][] = [
                'start' => $foundPosition['start'] - $length['start'],
                'end' => $foundPosition['end'] - $length['start'],
            ];
            break;
        }
    }

    return $out;
}
527
/**
 * Concatenate link fields (title, description, URL, tags) to search across fields.
 * Every field is converted to lowercase and followed by a '\' separator, which is
 * used to prevent exact search terms from matching across two fields.
 *
 * Also populate the $lengths array with the starting and ending positions of every
 * bookmark field inside the concatenated content, as
 * [fieldName => ['start' => int, 'end' => int]] with keys 'title', 'description', 'url' and 'tags'.
 * 'end' is the position of the separator following the field.
 *
 * @param Bookmark $link
 * @param array    $lengths (by reference) Receives the field boundaries described above.
 *
 * @return string Lowercase concatenated fields content.
 */
protected function buildFullTextSearchableLink(Bookmark $link, array &$lengths): string
{
    // Order matters: it defines the layout of the concatenated content.
    // NOTE(review): getters are assumed to possibly return null for unset fields;
    // coalescing to '' is harmless if they never do.
    $fields = [
        'title' => $link->getTitle() ?? '',
        'description' => $link->getDescription() ?? '',
        'url' => $link->getUrl() ?? '',
        'tags' => $link->getTagsString() ?? '',
    ];

    $content = '';
    $nextField = 0;
    foreach ($fields as $name => $value) {
        $lowered = mb_convert_case($value, MB_CASE_LOWER, 'UTF-8');

        // Boundaries are measured on the lowercase text, i.e. on the very string that is searched:
        // case conversion may change the number of characters of some strings, and measuring the
        // original text would then shift the boundaries of every following field.
        $end = $nextField + mb_strlen($lowered);
        $lengths[$name] = ['start' => $nextField, 'end' => $end];

        $content .= $lowered . '\\';
        // Skip the one-character separator.
        $nextField = $end + 1;
    }

    return $content;
}
480} 555}