]>
Commit | Line | Data |
---|---|---|
822bffce A |
1 | <?php |
2 | ||
336a28fa | 3 | namespace Shaarli\Legacy; |
6696729b V |
4 | |
5 | use Exception; | |
336a28fa | 6 | use Shaarli\Bookmark\Exception\BookmarkNotFoundException; |
f24896b2 | 7 | |
822bffce A |
/**
 * Class LinkFilter.
 *
 * Perform search and filter operations on a link data list.
 *
 * Every link is accessed like an array and is expected to expose at least the
 * keys read below: 'private', 'shorturl', 'title', 'description', 'url',
 * 'tags' and (for the day filter) 'created'.
 *
 * @deprecated
 */
class LegacyLinkFilter
{
    /**
     * @var string permalinks.
     */
    public static $FILTER_HASH = 'permalink';

    /**
     * @var string text search.
     */
    public static $FILTER_TEXT = 'fulltext';

    /**
     * @var string tag filter.
     */
    public static $FILTER_TAG = 'tags';

    /**
     * @var string filter by day.
     */
    public static $FILTER_DAY = 'FILTER_DAY';

    /**
     * @var string Allowed characters for hashtags (regex syntax).
     */
    public static $HASHTAG_CHARS = '\p{Pc}\p{N}\p{L}\p{Mn}';

    /**
     * @var LegacyLinkDB|array all available links (any iterable keyed list of links;
     *                         filter() itself re-wraps plain arrays).
     */
    private $links;

    /**
     * @param LegacyLinkDB|array $links list of links to filter.
     */
    public function __construct($links)
    {
        $this->links = $links;
    }

    /**
     * Filter links according to parameters.
     *
     * @param string $type          Type of filter (eg. tags, permalink, etc.).
     * @param mixed  $request       Filter content. For the combined tag + text filter this is
     *                              indexed: [0] holds the tags, [1] holds the search terms.
     * @param bool   $casesensitive Optional: Perform case sensitive filter if true.
     * @param string $visibility    Optional: return only all/private/public links.
     *                              Any other value is treated as 'all'.
     * @param bool   $untaggedonly  Optional: return only untagged links. Applies only if $type includes FILTER_TAG
     *
     * @return array filtered link list.
     */
    public function filter($type, $request, $casesensitive = false, $visibility = 'all', $untaggedonly = false)
    {
        // Strict check: a loose in_array() lets values such as `true` or `0` through,
        // after which the `=== 'all'` tests further down would silently misbehave.
        if (!in_array($visibility, ['all', 'public', 'private'], true)) {
            $visibility = 'all';
        }

        switch ($type) {
            case self::$FILTER_HASH:
                return $this->filterSmallHash($request);
            // Bitwise OR of the two filter names: this is the value callers build
            // to request the combined tag + text filter.
            case self::$FILTER_TAG | self::$FILTER_TEXT:
                $noRequest = empty($request) || (empty($request[0]) && empty($request[1]));
                if ($noRequest) {
                    return $untaggedonly
                        ? $this->filterUntagged($visibility)
                        : $this->noFilter($visibility);
                }

                $filtered = $untaggedonly ? $this->filterUntagged($visibility) : $this->links;
                // Each criterion is applied on the result of the previous one.
                if (!empty($request[0])) {
                    $filtered = (new self($filtered))->filterTags($request[0], $casesensitive, $visibility);
                }
                if (!empty($request[1])) {
                    $filtered = (new self($filtered))->filterFulltext($request[1], $visibility);
                }
                return $filtered;
            case self::$FILTER_TEXT:
                return $this->filterFulltext($request, $visibility);
            case self::$FILTER_TAG:
                if ($untaggedonly) {
                    return $this->filterUntagged($visibility);
                }
                return $this->filterTags($request, $casesensitive, $visibility);
            case self::$FILTER_DAY:
                return $this->filterDay($request);
            default:
                return $this->noFilter($visibility);
        }
    }

    /**
     * Tell whether a link must be skipped for the requested visibility.
     *
     * @param array  $link       link data, read through its 'private' key.
     * @param string $visibility all/private/public.
     *
     * @return bool true if the link is public while only private links are wanted,
     *              or private while only public links are wanted; false otherwise.
     */
    private static function isHiddenByVisibility($link, $visibility)
    {
        if ($visibility === 'private') {
            return !$link['private'];
        }
        if ($visibility === 'public') {
            return (bool) $link['private'];
        }

        return false;
    }

    /**
     * Unknown filter, but handle private only.
     *
     * @param string $visibility Optional: return only all/private/public links
     *
     * @return array filtered links.
     */
    private function noFilter($visibility = 'all')
    {
        if ($visibility === 'all') {
            return $this->links;
        }

        $out = [];
        foreach ($this->links as $key => $value) {
            if ($value['private'] && $visibility === 'private') {
                $out[$key] = $value;
            } elseif (!$value['private'] && $visibility === 'public') {
                $out[$key] = $value;
            }
        }

        return $out;
    }

    /**
     * Returns the shaare corresponding to a smallHash.
     *
     * @param string $smallHash permalink hash.
     *
     * @return array $filtered array containing permalink data (a single entry, original key preserved).
     *
     * @throws BookmarkNotFoundException if the smallhash doesn't match any link.
     */
    private function filterSmallHash($smallHash)
    {
        $wanted = (string) $smallHash;
        foreach ($this->links as $key => $l) {
            // Strict string comparison: with `==`, two numeric-looking hashes such as
            // '0e1234' and '0e5678' are compared as numbers and considered equal.
            if ($wanted === (string) $l['shorturl']) {
                return [$key => $l];
            }
        }

        throw new BookmarkNotFoundException();
    }

    /**
     * Returns the list of links corresponding to a full-text search
     *
     * Searches:
     *  - in the URLs, title, description and tags;
     *  - are case-insensitive;
     *  - terms surrounded by quotes " are exact terms search.
     *  - terms starting with a dash - are excluded (except exact terms).
     *
     * Example:
     *    print_r($mydb->filterFulltext('hollandais'));
     *
     * mb_convert_case($val, MB_CASE_LOWER, 'UTF-8')
     *  - allows to perform searches on Unicode text
     *  - see https://github.com/shaarli/Shaarli/issues/75 for examples
     *
     * @param string $searchterms search query.
     * @param string $visibility  Optional: return only all/private/public links.
     *
     * @return array search results.
     */
    private function filterFulltext($searchterms, $visibility = 'all')
    {
        if (empty($searchterms)) {
            return $this->noFilter($visibility);
        }

        $search = mb_convert_case(html_entity_decode($searchterms), MB_CASE_LOWER, 'UTF-8');
        $exactRegex = '/"([^"]+)"/';

        // Retrieve exact search terms.
        preg_match_all($exactRegex, $search, $exactMatches);
        $exactSearch = array_values(array_filter($exactMatches[1]));

        // Remove exact search terms to get AND terms search.
        $looseTerms = explode(' ', trim(preg_replace($exactRegex, '', $search)));
        $looseTerms = array_values(array_filter($looseTerms));

        // Split the remaining terms between required and excluded ones.
        $excludeSearch = [];
        $andSearch = [];
        foreach ($looseTerms as $needle) {
            if ($needle[0] === '-' && strlen($needle) > 1) {
                $excludeSearch[] = substr($needle, 1);
            } else {
                // A lone '-' is kept as a regular search term.
                $andSearch[] = $needle;
            }
        }

        // Exact terms are checked first, then the other required terms.
        $requiredTerms = array_merge($exactSearch, $andSearch);
        $keys = ['title', 'description', 'url', 'tags'];
        $filtered = [];

        // Iterate over every stored link.
        foreach ($this->links as $id => $link) {
            if (self::isHiddenByVisibility($link, $visibility)) {
                continue;
            }

            // Concatenate link fields to search across fields.
            // Adds a '\' separator for exact search terms.
            $content = '';
            foreach ($keys as $key) {
                $content .= mb_convert_case($link[$key], MB_CASE_LOWER, 'UTF-8') . '\\';
            }

            // Be optimistic; stop at the first term which disqualifies the link.
            $found = true;
            foreach ($requiredTerms as $term) {
                if (strpos($content, $term) === false) {
                    $found = false;
                    break;
                }
            }
            if ($found) {
                foreach ($excludeSearch as $term) {
                    if (strpos($content, $term) !== false) {
                        $found = false;
                        break;
                    }
                }
            }

            if ($found) {
                $filtered[$id] = $link;
            }
        }

        return $filtered;
    }

    /**
     * generate a regex fragment out of a tag
     *
     * @param string $tag to generate regexs from. may start with '-' to negate, contain '*' as wildcard
     *
     * @return string generated regex fragment (a lookahead), or an empty string
     *                if the tag is empty, '-' or '*'.
     */
    private static function tag2regex($tag)
    {
        $len = strlen($tag);
        if (!$len || $tag === '-' || $tag === '*') {
            // nothing to search, return empty regex
            return '';
        }
        if ($tag[0] === '-') {
            // query is negated
            $i = 1; // use offset to start after '-' character
            $regex = '(?!'; // create negative lookahead
        } else {
            $i = 0; // start at first character
            $regex = '(?='; // use positive lookahead
        }
        $regex .= '.*(?:^| )'; // before tag may only be a space or the beginning
        // iterate over string, separating it into placeholder and content
        for (; $i < $len; $i++) {
            if ($tag[$i] === '*') {
                // placeholder found
                $regex .= '[^ ]*?';
                continue;
            }

            // regular characters: take everything up to the next placeholder or the end of string
            $next = strpos($tag, '*', $i);
            if ($next === false) {
                $next = $len;
            }
            // we got a tag name that we want to search for. escape any regex characters to prevent conflicts.
            $regex .= preg_quote(substr($tag, $i, $next - $i), '/');
            // move $i on the last consumed character (the loop increment steps past it)
            $i = $next - 1;
        }
        $regex .= '(?:$| ))'; // after the tag may only be a space or the end
        return $regex;
    }

    /**
     * Returns the list of links associated with a given list of tags
     *
     * You can specify one or more tags, separated by space or a comma, e.g.
     *  print_r($mydb->filterTags('linux programming'));
     *
     * Hashtags (#tag) found in a link description are matched as well.
     *
     * @param string|array $tags          list of tags separated by commas or blank spaces,
     *                                    or an already split array of tags.
     * @param bool         $casesensitive ignore case if false.
     * @param string       $visibility    Optional: return only all/private/public links.
     *
     * @return array filtered links.
     */
    public function filterTags($tags, $casesensitive = false, $visibility = 'all')
    {
        // get single tags (we may get passed an array, even though the docs say different)
        $inputTags = $tags;
        if (!is_array($tags)) {
            // we got an input string, split tags
            $inputTags = preg_split('/(?:\s+)|,/', $inputTags, -1, PREG_SPLIT_NO_EMPTY);
        }

        if (empty($inputTags)) {
            // no input tags
            return $this->noFilter($visibility);
        }

        // build regex from all tags: one lookahead per tag, all anchored at the start.
        // Array callable instead of the "self::tag2regex" string form, deprecated since PHP 8.2.
        $re = '/^' . implode('', array_map([self::class, 'tag2regex'], $inputTags)) . '.*$/';
        if (!$casesensitive) {
            // make regex case insensitive
            $re .= 'i';
        }

        $hashtagRegex = '/(?<![' . self::$HASHTAG_CHARS . '])#([' . self::$HASHTAG_CHARS . ']+?)\b/sm';

        // create resulting array
        $filtered = [];

        // iterate over each link
        foreach ($this->links as $key => $link) {
            // check level of visibility
            if (self::isHiddenByVisibility($link, $visibility)) {
                continue;
            }

            $search = $link['tags']; // build search string, start with tags of current link
            if (strlen(trim($link['description'])) && strpos($link['description'], '#') !== false) {
                // description given and at least one possible tag found
                $descTags = [];
                // find all tags in the form of #tag in the description
                preg_match_all($hashtagRegex, $link['description'], $descTags);
                if (count($descTags[1])) {
                    // there were some tags in the description, add them to the search string
                    $search .= ' ' . implode(' ', $descTags[1]);
                }
            }

            // keep only entries matching our regex
            if (preg_match($re, $search)) {
                $filtered[$key] = $link;
            }
        }

        return $filtered;
    }

    /**
     * Return only links without any tag.
     *
     * @param string $visibility return only all/private/public links.
     *
     * @return array filtered links.
     */
    public function filterUntagged($visibility)
    {
        $filtered = [];
        foreach ($this->links as $key => $link) {
            if (self::isHiddenByVisibility($link, $visibility)) {
                continue;
            }

            // Explicit comparison: empty() would also report a link tagged "0" as untagged.
            if (trim($link['tags']) === '') {
                $filtered[$key] = $link;
            }
        }

        return $filtered;
    }

    /**
     * Returns the list of articles for a given day, chronologically sorted
     *
     * Day must be in the form 'YYYYMMDD' (e.g. '20120125'), e.g.
     *  print_r($mydb->filterDay('20120125'));
     *
     * @param string $day day to filter.
     *
     * @return array all link matching given day.
     *
     * @throws Exception if date format is invalid.
     */
    public function filterDay($day)
    {
        if (!checkDateFormat('Ymd', $day)) {
            throw new Exception('Invalid date format');
        }

        $wanted = (string) $day;
        $filtered = [];
        foreach ($this->links as $key => $l) {
            if ($l['created']->format('Ymd') === $wanted) {
                $filtered[$key] = $l;
            }
        }

        // sort by date ASC: the iteration order is reversed (keys preserved).
        // NOTE(review): this presumes the link list is iterated newest first - confirm.
        return array_reverse($filtered, true);
    }

    /**
     * Convert a list of tags (str) to an array. Also
     * - handle case sensitivity.
     * - accepts spaces and commas as separators.
     *
     * @param string $tags          string containing a list of tags.
     * @param bool   $casesensitive will convert everything to lowercase if false.
     *
     * @return array list of tags, without empty entries.
     */
    public static function tagsStrToArray($tags, $casesensitive)
    {
        // We use UTF-8 conversion to handle various graphemes (i.e. cyrillic, or greek)
        $tagsOut = $casesensitive ? $tags : mb_convert_case($tags, MB_CASE_LOWER, 'UTF-8');
        $tagsOut = str_replace(',', ' ', $tagsOut);

        return preg_split('/\s+/', $tagsOut, -1, PREG_SPLIT_NO_EMPTY);
    }
}