]> git.immae.eu Git - github/shaarli/Shaarli.git/blob - tests/bookmark/LinkUtilsTest.php
Improve regex to extract HTML metadata (title, description, etc.)
[github/shaarli/Shaarli.git] / tests / bookmark / LinkUtilsTest.php
1 <?php
2
3 namespace Shaarli\Bookmark;
4
5 use PHPUnit\Framework\TestCase;
6
7 require_once 'tests/utils/CurlUtils.php';
8
/**
 * Class LinkUtilsTest.
 *
 * Exercises the link helper functions called below: html_extract_title(),
 * header_extract_charset(), html_extract_charset(), html_extract_tag(),
 * get_curl_download_callback(), text2clickable(), space2nbsp(),
 * hashtag_autolink() and is_note().
 */
class LinkUtilsTest extends TestCase
{
    /**
     * Test html_extract_title() when the title is found.
     */
    public function testHtmlExtractExistentTitle()
    {
        $title = 'Read me please.';

        $html = '<html><meta>stuff</meta><title>' . $title . '</title></html>';
        $this->assertEquals($title, html_extract_title($html));

        // With several <title> tags, the first one is the expected result.
        $html = '<html><title>' . $title . '</title>blabla<title>another</title></html>';
        $this->assertEquals($title, html_extract_title($html));
    }

    /**
     * Test html_extract_title() when the title is not found.
     */
    public function testHtmlExtractNonExistentTitle()
    {
        $html = '<html><meta>stuff</meta></html>';
        $this->assertFalse(html_extract_title($html));
    }

    /**
     * Test header_extract_charset() when the charset is found.
     */
    public function testHeadersExtractExistentCharset()
    {
        $charset = 'x-MacCroatian';
        $headers = 'text/html; charset=' . $charset;

        // The extracted charset is expected in lower case.
        $this->assertEquals(strtolower($charset), header_extract_charset($headers));
    }

    /**
     * Test header_extract_charset() when the charset is not found.
     */
    public function testHeadersExtractNonExistentCharset()
    {
        $headers = '';
        $this->assertFalse(header_extract_charset($headers));

        $headers = 'text/html';
        $this->assertFalse(header_extract_charset($headers));
    }

    /**
     * Test html_extract_charset() when the charset is found.
     */
    public function testHtmlExtractExistentCharset()
    {
        $charset = 'x-MacCroatian';
        $html = '<html><meta>stuff2</meta><meta charset="' . $charset . '"/></html>';

        // The extracted charset is expected in lower case.
        $this->assertEquals(strtolower($charset), html_extract_charset($html));
    }

    /**
     * Test html_extract_charset() when the charset is not found.
     */
    public function testHtmlExtractNonExistentCharset()
    {
        $html = '<html><meta>stuff</meta></html>';
        $this->assertFalse(html_extract_charset($html));

        // An empty charset attribute counts as "not found".
        $html = '<html><meta>stuff</meta><meta charset=""/></html>';
        $this->assertFalse(html_extract_charset($html));
    }

    /**
     * Test html_extract_tag() when the tag <meta name= is found.
     *
     * Covers name=, property="og:..." and itemprop="og:..." variants, in both
     * attribute orders, quoted and unquoted, with unrelated attributes around.
     */
    public function testHtmlExtractExistentNameTag()
    {
        $description = 'Bob and Alice share cookies.';

        // Simple one line
        $html = '<html><meta>stuff2</meta><meta name="description" content="' . $description . '"/></html>';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // Simple OpenGraph
        $html = '<meta property="og:description" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // Simple reversed OpenGraph
        $html = '<meta content="' . $description . '" property="og:description">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // ItemProp OpenGraph
        $html = '<meta itemprop="og:description" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph without quotes
        $html = '<meta property=og:description content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed without quotes
        $html = '<meta content="' . $description . '" property=og:description>';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph with noise
        $html = '<meta tag1="content1" property="og:description" tag2="content2" content="' .
            $description . '" tag3="content3">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed with noise
        $html = '<meta tag1="content1" content="' . $description . '" ' .
            'tag3="content3" tag2="content2" property="og:description">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph multiple properties start
        $html = '<meta property="unrelated og:description" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph multiple properties end
        $html = '<meta property="og:description unrelated" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph multiple properties both end
        $html = '<meta property="og:unrelated1 og:description og:unrelated2" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph multiple properties both end with noise
        $html = '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" ' .
            'tag2="content2" content="' . $description . '" tag3="content3">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed multiple properties start
        $html = '<meta content="' . $description . '" property="unrelated og:description">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed multiple properties end
        $html = '<meta content="' . $description . '" property="og:description unrelated">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed multiple properties both end
        $html = '<meta content="' . $description . '" property="og:unrelated1 og:description og:unrelated2">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // OpenGraph reversed multiple properties both end with noise
        $html = '<meta tag1="content1" content="' . $description . '" tag2="content2" ' .
            'property="og:unrelated1 og:description og:unrelated2" tag3="content3">';
        $this->assertEquals($description, html_extract_tag('description', $html));

        // Suggestion from #1375
        $html = '<meta property="og:description" name="description" content="' . $description . '">';
        $this->assertEquals($description, html_extract_tag('description', $html));
    }

    /**
     * Test html_extract_tag() when the tag <meta name= is not found.
     */
    public function testHtmlExtractNonExistentNameTag()
    {
        $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
        $this->assertFalse(html_extract_tag('description', $html));

        // Partial meta tags: either the identifying attribute or the content is missing.
        $html = '<meta content="Brief description">';
        $this->assertFalse(html_extract_tag('description', $html));

        $html = '<meta property="og:description">';
        $this->assertFalse(html_extract_tag('description', $html));

        $html = '<meta tag1="content1" property="og:description">';
        $this->assertFalse(html_extract_tag('description', $html));

        $html = '<meta property="og:description" tag1="content1">';
        $this->assertFalse(html_extract_tag('description', $html));

        $html = '<meta tag1="content1" content="Brief description">';
        $this->assertFalse(html_extract_tag('description', $html));

        $html = '<meta content="Brief description" tag1="content1">';
        $this->assertFalse(html_extract_tag('description', $html));
    }

    /**
     * Test html_extract_tag() when the tag <meta property="og: is found.
     */
    public function testHtmlExtractExistentOgTag()
    {
        $description = 'Bob and Alice share cookies.';
        $html = '<html><meta>stuff2</meta><meta property="og:description" content="' . $description . '"/></html>';
        $this->assertEquals($description, html_extract_tag('description', $html));
    }

    /**
     * Test html_extract_tag() when the tag <meta property="og: is not found.
     */
    public function testHtmlExtractNonExistentOgTag()
    {
        $html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
        $this->assertFalse(html_extract_tag('description', $html));
    }

    /**
     * Test the download callback with valid value
     *
     * Description retrieval is disabled (fifth argument is false), so the
     * callback is expected to stop (return false) on the chunk holding the title.
     */
    public function testCurlDownloadCallbackOk()
    {
        // $charset, $title, $desc and $keywords are result variables inspected after the run.
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_ok'
        );
        $data = [
            'HTTP/1.1 200 OK',
            'Server: GitHub.com',
            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
            'Content-Type: text/html; charset=utf-8',
            'Status: 200 OK',
            'end' => 'th=device-width">'
                . '<title>Refactoring · GitHub</title>'
                . '<link rel="search" type="application/opensea',
            '<title>ignored</title>'
                . '<meta name="description" content="desc" />'
                . '<meta name="keywords" content="key1,key2" />',
        ];
        foreach ($data as $key => $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            // Every chunk is accepted in full, except the one keyed 'end' which must stop the download.
            $expected = $key !== 'end' ? strlen($line) : false;
            $this->assertEquals($expected, $callback($ignore, $line));
            if ($expected === false) {
                break;
            }
        }
        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
        $this->assertEmpty($desc);
        $this->assertEmpty($keywords);
    }

    /**
     * Test the download callback with valid values and no charset
     *
     * Every chunk, including the one keyed 'end', is expected to be accepted in full.
     */
    public function testCurlDownloadCallbackOkNoCharset()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_no_charset'
        );
        $data = [
            'HTTP/1.1 200 OK',
            'end' => 'th=device-width">'
                . '<title>Refactoring · GitHub</title>'
                . '<link rel="search" type="application/opensea',
            '<title>ignored</title>'
                . '<meta name="description" content="desc" />'
                . '<meta name="keywords" content="key1,key2" />',
        ];
        foreach ($data as $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            $this->assertEquals(strlen($line), $callback($ignore, $line));
        }
        $this->assertEmpty($charset);
        $this->assertEquals('Refactoring · GitHub', $title);
        $this->assertEmpty($desc);
        $this->assertEmpty($keywords);
    }

    /**
     * Test the download callback with valid values and a charset that is only
     * declared in the HTML content (meta http-equiv tag).
     */
    public function testCurlDownloadCallbackOkHtmlCharset()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_no_charset'
        );
        $data = [
            'HTTP/1.1 200 OK',
            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
            'end' => 'th=device-width">'
                . '<title>Refactoring · GitHub</title>'
                . '<link rel="search" type="application/opensea',
            '<title>ignored</title>'
                . '<meta name="description" content="desc" />'
                . '<meta name="keywords" content="key1,key2" />',
        ];
        foreach ($data as $key => $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            // Every chunk is accepted in full, except the one keyed 'end' which must stop the download.
            $expected = $key !== 'end' ? strlen($line) : false;
            $this->assertEquals($expected, $callback($ignore, $line));
            if ($expected === false) {
                break;
            }
        }
        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
        $this->assertEmpty($desc);
        $this->assertEmpty($keywords);
    }

    /**
     * Test the download callback with valid values and no title
     *
     * Every chunk, including the one keyed 'end', is expected to be accepted in full.
     */
    public function testCurlDownloadCallbackOkNoTitle()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_ok'
        );
        $data = [
            'HTTP/1.1 200 OK',
            'end' => 'th=device-width">Refactoring · GitHub<link rel="search" type="application/opensea',
            'ignored',
        ];
        foreach ($data as $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            $this->assertEquals(strlen($line), $callback($ignore, $line));
        }
        $this->assertEquals('utf-8', $charset);
        $this->assertEmpty($title);
        $this->assertEmpty($desc);
        $this->assertEmpty($keywords);
    }

    /**
     * Test the download callback with an invalid content type.
     */
    public function testCurlDownloadCallbackInvalidContentType()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_ct_ko'
        );
        $ignore = null;
        $this->assertFalse($callback($ignore, ''));
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }

    /**
     * Test the download callback with an invalid response code.
     */
    public function testCurlDownloadCallbackInvalidResponseCode()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_rc_ko'
        );
        $ignore = null;
        $this->assertFalse($callback($ignore, ''));
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }

    /**
     * Test the download callback with an invalid content type and response code.
     */
    public function testCurlDownloadCallbackInvalidContentTypeAndResponseCode()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            false,
            'ut_curl_getinfo_rs_ct_ko'
        );
        $ignore = null;
        $this->assertFalse($callback($ignore, ''));
        $this->assertEmpty($charset);
        $this->assertEmpty($title);
    }

    /**
     * Test the download callback with valid value, and retrieve_description option enabled.
     *
     * The callback is expected to keep reading past the title chunk and to stop
     * on the chunk that carries the description and keywords.
     */
    public function testCurlDownloadCallbackOkWithDesc()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            true,
            'ut_curl_getinfo_ok'
        );
        $data = [
            'HTTP/1.1 200 OK',
            'Server: GitHub.com',
            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
            'Content-Type: text/html; charset=utf-8',
            'Status: 200 OK',
            'th=device-width">'
                . '<title>Refactoring · GitHub</title>'
                . '<link rel="search" type="application/opensea',
            'end' => '<title>ignored</title>'
                . '<meta name="description" content="link desc" />'
                . '<meta name="keywords" content="key1,key2" />',
        ];
        foreach ($data as $key => $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            // Every chunk is accepted in full, except the one keyed 'end' which must stop the download.
            $expected = $key !== 'end' ? strlen($line) : false;
            $this->assertEquals($expected, $callback($ignore, $line));
            if ($expected === false) {
                break;
            }
        }
        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
        $this->assertEquals('link desc', $desc);
        // Comma-separated keywords are expected back space-separated.
        $this->assertEquals('key1 key2', $keywords);
    }

    /**
     * Test the download callback with valid value, and retrieve_description option enabled,
     * but no desc or keyword defined in the page.
     */
    public function testCurlDownloadCallbackOkWithDescNotFound()
    {
        $callback = get_curl_download_callback(
            $charset,
            $title,
            $desc,
            $keywords,
            true,
            'ut_curl_getinfo_ok'
        );
        $data = [
            'HTTP/1.1 200 OK',
            'Server: GitHub.com',
            'Date: Sat, 28 Oct 2017 12:01:33 GMT',
            'Content-Type: text/html; charset=utf-8',
            'Status: 200 OK',
            'th=device-width">'
                . '<title>Refactoring · GitHub</title>'
                . '<link rel="search" type="application/opensea',
            'end' => '<title>ignored</title>',
        ];
        foreach ($data as $key => $line) {
            // Placeholder for the callback's first argument.
            $ignore = null;
            // Every chunk is accepted in full, except the one keyed 'end' which must stop the download.
            $expected = $key !== 'end' ? strlen($line) : false;
            $this->assertEquals($expected, $callback($ignore, $line));
            if ($expected === false) {
                break;
            }
        }
        $this->assertEquals('utf-8', $charset);
        $this->assertEquals('Refactoring · GitHub', $title);
        $this->assertEmpty($desc);
        $this->assertEmpty($keywords);
    }

    /**
     * Test text2clickable.
     */
    public function testText2clickable()
    {
        $text = 'stuff http://hello.there/is=someone#here otherstuff';
        $expectedText = 'stuff <a href="http://hello.there/is=someone#here">'
            . 'http://hello.there/is=someone#here</a> otherstuff';
        $processedText = text2clickable($text);
        $this->assertEquals($expectedText, $processedText);

        // Parentheses are part of the URL.
        $text = 'stuff http://hello.there/is=someone#here(please) otherstuff';
        $expectedText = 'stuff <a href="http://hello.there/is=someone#here(please)">'
            . 'http://hello.there/is=someone#here(please)</a> otherstuff';
        $processedText = text2clickable($text);
        $this->assertEquals($expectedText, $processedText);

        // An ampersand is part of the URL too.
        $text = 'stuff http://hello.there/is=someone#here(please)&no otherstuff';
        $expectedText = 'stuff <a href="http://hello.there/is=someone#here(please)&no">'
            . 'http://hello.there/is=someone#here(please)&no</a> otherstuff';
        $processedText = text2clickable($text);
        $this->assertEquals($expectedText, $processedText);
    }

    /**
     * Test space2nbsp().
     */
    public function testSpace2nbsp()
    {
        // The runs of consecutive spaces are significant: each &nbsp; in the expected
        // text stands for one space of the input (a space at the start of a line or
        // right after another space).
        // NOTE(review): input spacing reconstructed from the expected text - confirm
        // against space2nbsp().
        $text = '  Are you   thrilled  by flags   ?' . PHP_EOL . ' Really?';
        $expectedText = '&nbsp; Are you &nbsp; thrilled &nbsp;by flags &nbsp; ?' . PHP_EOL . '&nbsp;Really?';
        $processedText = space2nbsp($text);
        $this->assertEquals($expectedText, $processedText);
    }

    /**
     * Test hashtags auto-link.
     */
    public function testHashtagAutolink()
    {
        $index = 'http://domain.tld/';
        // Single-quoted multi-line literal: the "\n" sequences stay literal and the
        // real line breaks are part of the string, so continuation lines are not indented.
        $rawDescription = '#hashtag\n
# nothashtag\n
test#nothashtag #hashtag \#nothashtag\n
test #hashtag #hashtag test #hashtag.test\n
#hashtag #hashtag-nothashtag #hashtag_hashtag\n
What is #ашок anyway?\n
カタカナ #カタカナ」カタカナ\n';
        $autolinkedDescription = hashtag_autolink($rawDescription, $index);

        $this->assertContains($this->getHashtagLink('hashtag', $index), $autolinkedDescription);
        // No raw hashtag may be left behind, and no "nothashtag" may have been linked.
        $this->assertNotContains(' #hashtag', $autolinkedDescription);
        $this->assertNotContains('>#nothashtag', $autolinkedDescription);
        $this->assertContains($this->getHashtagLink('ашок', $index), $autolinkedDescription);
        $this->assertContains($this->getHashtagLink('カタカナ', $index), $autolinkedDescription);
        $this->assertContains($this->getHashtagLink('hashtag_hashtag', $index), $autolinkedDescription);
        $this->assertNotContains($this->getHashtagLink('hashtag-nothashtag', $index), $autolinkedDescription);
    }

    /**
     * Test hashtags auto-link without index URL.
     */
    public function testHashtagAutolinkNoIndex()
    {
        $rawDescription = 'blabla #hashtag x#nothashtag';
        $autolinkedDescription = hashtag_autolink($rawDescription);

        $this->assertContains($this->getHashtagLink('hashtag'), $autolinkedDescription);
        $this->assertNotContains(' #hashtag', $autolinkedDescription);
        $this->assertNotContains('>#nothashtag', $autolinkedDescription);
    }

    /**
     * Test is_note with note URLs.
     */
    public function testIsNote()
    {
        $this->assertTrue(is_note('?'));
        $this->assertTrue(is_note('?abcDEf'));
        $this->assertTrue(is_note('?_abcDEf#123'));
    }

    /**
     * Test is_note with non note URLs.
     */
    public function testIsNotNote()
    {
        $this->assertFalse(is_note(''));
        $this->assertFalse(is_note('nope'));
        $this->assertFalse(is_note('https://github.com/shaarli/Shaarli/?hi'));
    }

    /**
     * Util function to build an hashtag link.
     *
     * @param string $hashtag Hashtag name.
     * @param string $index   Index URL.
     *
     * @return string HTML hashtag link.
     */
    private function getHashtagLink($hashtag, $index = '')
    {
        // '$1' is a plain placeholder here (single-quoted string), substituted below.
        $hashtagLink = '<a href="' . $index . './add-tag/$1" title="Hashtag $1">#$1</a>';
        return str_replace('$1', $hashtag, $hashtagLink);
    }
}