]> git.immae.eu Git - github/wallabag/wallabag.git/blob - tests/Wallabag/CoreBundle/Helper/ContentProxyTest.php
9ce72c79c1277ec08ded6eb7f3d8b4265cc4fdf7
[github/wallabag/wallabag.git] / tests / Wallabag / CoreBundle / Helper / ContentProxyTest.php
1 <?php
2
3 namespace Tests\Wallabag\CoreBundle\Helper;
4
5 use Graby\Graby;
6 use Monolog\Handler\TestHandler;
7 use Monolog\Logger;
8 use PHPUnit\Framework\TestCase;
9 use Psr\Log\NullLogger;
10 use Symfony\Component\Validator\ConstraintViolation;
11 use Symfony\Component\Validator\ConstraintViolationList;
12 use Symfony\Component\Validator\Validator\RecursiveValidator;
13 use Wallabag\CoreBundle\Entity\Entry;
14 use Wallabag\CoreBundle\Helper\ContentProxy;
15 use Wallabag\CoreBundle\Helper\RuleBasedTagger;
16 use Wallabag\UserBundle\Entity\User;
17
/**
 * Unit tests for ContentProxy::updateEntry().
 *
 * Graby, the tagger and the validator are replaced by mocks (or fed forced
 * content) so no network access is performed by these tests.
 */
class ContentProxyTest extends TestCase
{
    private $fetchingErrorMessage = 'wallabag can\'t retrieve contents for this article. Please <a href="http://doc.wallabag.org/en/user/errors_during_fetching.html#how-can-i-help-to-fix-that">troubleshoot this issue</a>.';

    /**
     * A URL without a host keeps the given URL, gets the fetching error
     * message as content and has no domain name.
     */
    public function testWithBadUrl()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => '',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://user@:80');

        $this->assertSame('http://user@:80', $entry->getUrl());
        $this->assertEmpty($entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage, $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getMimetype());
        $this->assertEmpty($entry->getLanguage());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertNull($entry->getDomainName());
    }

    /**
     * When nothing is fetched, the entry keeps the given URL and its content
     * is the fetching error message.
     */
    public function testWithEmptyContent()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => '',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://0.0.0.0', $entry->getUrl());
        $this->assertEmpty($entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage, $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getMimetype());
        $this->assertEmpty($entry->getLanguage());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('0.0.0.0', $entry->getDomainName());
    }

    /**
     * Without HTML but with a title and a description, the description is
     * appended to the fetching error message.
     */
    public function testWithEmptyContentButOG()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => 'my title',
            'url' => '',
            'headers' => [
                'content-type' => '',
            ],
            'language' => '',
            'status' => '',
            'description' => 'desc',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://domain.io');

        $this->assertSame('http://domain.io', $entry->getUrl());
        $this->assertSame('my title', $entry->getTitle());
        $this->assertSame($this->fetchingErrorMessage . '<p><i>But we found a short description: </i></p>desc', $entry->getContent());
        $this->assertEmpty($entry->getPreviewPicture());
        $this->assertEmpty($entry->getLanguage());
        $this->assertEmpty($entry->getHttpStatus());
        $this->assertEmpty($entry->getMimetype());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('domain.io', $entry->getDomainName());
    }

    /**
     * Fully fetched content populates every field of the entry, including
     * the URL returned by the fetcher.
     */
    public function testWithContent()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => 'http://3.3.3.3/cover.jpg',
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * A null image with image-less HTML leaves the preview picture unset.
     */
    public function testWithContentAndNoOgImage()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => null,
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertNull($entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * With a null image, the image found in the HTML becomes the preview
     * picture.
     */
    public function testWithContentAndContentImage()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => "<h1>Test</h1><p><img src='http://3.3.3.3/cover.jpg'/></p>",
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'image' => null,
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertSame("<h1>Test</h1><p><img src='http://3.3.3.3/cover.jpg'/></p>", $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * When both an explicit image and an image in the HTML exist, the
     * explicit image is used as preview picture.
     */
    public function testWithContentImageAndOgImage()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => "<h1>Test</h1><p><img src='http://3.3.3.3/nevermind.jpg'/></p>",
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'image' => 'http://3.3.3.3/cover.jpg',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertSame("<h1>Test</h1><p><img src='http://3.3.3.3/nevermind.jpg'/></p>", $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(0.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * A language rejected by the validator is not stored on the entry.
     */
    public function testWithContentAndBadLanguage()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        // No default stub: the single validate() call must report a violation.
        $validator = $this->getValidator(false);
        $validator->expects($this->once())
            ->method('validate')
            ->willReturn(new ConstraintViolationList([new ConstraintViolation('oops', 'oops', [], 'oops', 'language', 'dontexist')]));

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'language' => 'dontexist',
            'status' => '200',
            'headers' => [
                'content-type' => 'text/html',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $validator, $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertNull($entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * An image URL rejected by the validator is not stored as preview
     * picture, while the (valid) language is kept.
     */
    public function testWithContentAndBadOgImage()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        // Two validate() calls are expected: the first one passes, the second
        // one reports a violation on the 'https://' value (the image URL).
        $validator = $this->getValidator(false);
        $validator->expects($this->exactly(2))
            ->method('validate')
            ->will($this->onConsecutiveCalls(
                new ConstraintViolationList(),
                new ConstraintViolationList([new ConstraintViolation('oops', 'oops', [], 'oops', 'url', 'https://')])
            ));

        $graby = $this->getGrabyMock([
            'html' => str_repeat('this is my content', 325),
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1',
            'headers' => [
                'content-type' => 'text/html',
            ],
            'language' => 'fr',
            'status' => '200',
            'description' => 'OG desc',
            'image' => 'https://',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $validator, $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertNull($entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * Content passed directly to updateEntry() is used as-is, including the
     * timestamp date, the authors and the headers.
     */
    public function testWithForcedContent()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage, true);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://0.0.0.0',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '1395635872',
                'authors' => ['Jeremy', 'Nico', 'Thomas'],
                'headers' => [
                    'cache-control' => 'no-cache',
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertSame('24/03/2014', $entry->getPublishedAt()->format('d/m/Y'));
        $this->assertContains('Jeremy', $entry->getPublishedBy());
        $this->assertContains('Nico', $entry->getPublishedBy());
        $this->assertContains('Thomas', $entry->getPublishedBy());
        $this->assertNotNull($entry->getHeaders(), 'Headers are stored, so value is not null');
        $this->assertContains('no-cache', $entry->getHeaders());
    }

    /**
     * A forced date given as an ISO 8601 datetime string is parsed into the
     * published date.
     */
    public function testWithForcedContentAndDatetime()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $logHandler = new TestHandler();
        $logger = new Logger('test', [$logHandler]);

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $logger, $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '2016-09-08T11:55:58+0200',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertSame('08/09/2016', $entry->getPublishedAt()->format('d/m/Y'));
    }

    /**
     * An unparsable forced date leaves the published date null and is
     * reported through the logger.
     */
    public function testWithForcedContentAndBadDate()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $logger = new Logger('foo');
        $handler = new TestHandler();
        $logger->pushHandler($handler);

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $logger, $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'date' => '01 02 2012',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('content', $entry->getContent());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame(4.0, $entry->getReadingTime());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
        $this->assertNull($entry->getPublishedAt());

        $records = $handler->getRecords();

        $this->assertCount(3, $records);
        $this->assertContains('Error while defining date', $records[0]['message']);
    }

    /**
     * An exception thrown by the tagger does not propagate and leaves the
     * entry without tags.
     */
    public function testTaggerThrowException()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag')
            ->will($this->throwException(new \Exception()));

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => str_repeat('this is my content', 325),
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertCount(0, $entry->getTags());
    }

    /**
     * Data provider for testWithCrazyHtmlContent.
     *
     * Arrays contain the following values:
     *     $html          HTML given as forced content
     *     $escapedString string that must not appear in the stored content
     */
    public function dataForCrazyHtml()
    {
        return [
            'script and comment' => [
                '<strong>Script inside:</strong> <!--[if gte IE 4]><script>alert(\'lol\');</script><![endif]--><br />',
                'lol',
            ],
            'script' => [
                '<strong>Script inside:</strong><script>alert(\'lol\');</script>',
                'script',
            ],
        ];
    }

    /**
     * Scripts (also inside conditional comments) are stripped from the
     * stored content.
     *
     * @dataProvider dataForCrazyHtml
     *
     * @param string $html
     * @param string $escapedString
     */
    public function testWithCrazyHtmlContent($html, $escapedString)
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry(
            $entry,
            'http://1.1.1.1',
            [
                'html' => $html,
                'title' => 'this is my title',
                'url' => 'http://1.1.1.1',
                'language' => 'fr',
                'status' => '200',
                'description' => 'OG desc',
                'image' => 'http://3.3.3.3/cover.jpg',
                'headers' => [
                    'content-type' => 'text/html',
                ],
            ]
        );

        $this->assertSame('http://1.1.1.1', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertNotContains($escapedString, $entry->getContent());
        $this->assertSame('http://3.3.3.3/cover.jpg', $entry->getPreviewPicture());
        $this->assertSame('text/html', $entry->getMimetype());
        $this->assertSame('fr', $entry->getLanguage());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    /**
     * When the fetched resource is an image, its URL is used as preview
     * picture.
     */
    public function testWithImageAsContent()
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => '<p><img src="http://1.1.1.1/image.jpg" /></p>',
            'title' => 'this is my title',
            'url' => 'http://1.1.1.1/image.jpg',
            'status' => '200',
            'headers' => [
                'content-type' => 'image/jpeg',
            ],
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        $this->assertSame('http://1.1.1.1/image.jpg', $entry->getUrl());
        $this->assertSame('this is my title', $entry->getTitle());
        $this->assertContains('http://1.1.1.1/image.jpg', $entry->getContent());
        $this->assertSame('http://1.1.1.1/image.jpg', $entry->getPreviewPicture());
        $this->assertSame('image/jpeg', $entry->getMimetype());
        $this->assertSame('200', $entry->getHttpStatus());
        $this->assertSame('1.1.1.1', $entry->getDomainName());
    }

    public function testWebsiteWithValidUTF8Title_doNothing()
    {
        // You can use https://www.online-toolz.com/tools/text-hex-convertor.php to convert UTF-8 text <=> hex
        // See http://graphemica.com for more info about the characters
        // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8
        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '7A');

        // A valid UTF-8 title must come back byte-for-byte identical.
        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'text/html'));
    }

    public function testWebsiteWithInvalidUTF8Title_removeInvalidCharacter()
    {
        // See http://graphemica.com for more info about the characters
        // 'a€b' (61;80;62) in hexadecimal and WINDOWS-1252 - but 80 is an invalid UTF-8 character.
        // The correct UTF-8 € character (U+20AC) is E282AC
        $actualTitle = $this->hexToStr('61' . '80' . '62');

        // 'ab' (61;62) because all invalid UTF-8 characters (like 80) are removed
        $expectedTitle = '61' . '62';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'text/html'));
    }

    public function testPdfWithUTF16BETitle_convertToUTF8()
    {
        // See http://graphemica.com for more info about the characters
        // '😻' (U+1F63B;D83DDE3B) in hexadecimal and as UTF16BE
        $actualTitle = $this->hexToStr('D83DDE3B');

        // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8
        $expectedTitle = 'F09F98BB';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'application/pdf'));
    }

    public function testPdfWithUTF8Title_doNothing()
    {
        // See http://graphemica.com for more info about the characters
        // '😻' (U+1F63B or F09F98BB) in hexadecimal and UTF-8
        $actualTitle = $this->hexToStr('F09F98BB');

        // The title is already UTF-8, so it must come back unchanged.
        $expectedTitle = 'F09F98BB';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'application/pdf'));
    }

    public function testPdfWithWINDOWS1252Title_convertToUTF8()
    {
        // See http://graphemica.com for more info about the characters
        // '€' (80) in hexadecimal and WINDOWS-1252
        $actualTitle = $this->hexToStr('80');

        // '€' (U+20AC or E282AC) in hexadecimal and UTF-8
        $expectedTitle = 'E282AC';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'application/pdf'));
    }

    public function testPdfWithInvalidCharacterInTitle_removeInvalidCharacter()
    {
        // See http://graphemica.com for more info about the characters
        // '😻ℤ', the invalid byte 0x81, then 'z'
        // (U+1F63B or F09F98BB; U+2124 or E284A4; invalid character 81; U+007A or 7A) in hexadecimal and UTF-8
        // 0x81 is not a valid character for UTF16, UTF8 and WINDOWS-1252
        $actualTitle = $this->hexToStr('F09F98BB' . 'E284A4' . '81' . '7A');

        // '😻ℤz' (U+1F63B or F09F98BB; U+2124 or E284A4; U+007A or 7A) in hexadecimal and UTF-8
        // the 0x81 byte is invalid for UTF16, UTF8 and WINDOWS-1252 and is removed
        $expectedTitle = 'F09F98BB' . 'E284A4' . '7A';
        $this->assertSame($expectedTitle, $this->fetchTitleAsHex($actualTitle, 'application/pdf'));
    }

    /**
     * Data provider for testWithChangedUrl.
     *
     * Arrays contain the following values:
     *     $entry_url
     *     $origin_url
     *     $content_url
     *     $expected_entry_url
     *     $expected_origin_url
     *     $expected_domain
     */
    public function dataForChangedUrl()
    {
        return [
            'normal' => [
                'http://0.0.0.0',
                null,
                'http://1.1.1.1',
                'http://1.1.1.1',
                'http://0.0.0.0',
                '1.1.1.1',
            ],
            'origin already set' => [
                'http://0.0.0.0',
                'http://hello',
                'http://1.1.1.1',
                'http://1.1.1.1',
                'http://hello',
                '1.1.1.1',
            ],
            'trailing slash' => [
                'https://example.com/hello-world',
                null,
                'https://example.com/hello-world/',
                'https://example.com/hello-world/',
                null,
                'example.com',
            ],
            'query string in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello?world=1',
                'https://example.org/hello?world=1',
                'https://example.org/hello',
                'example.org',
            ],
            'fragment in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello#world',
                'https://example.org/hello',
                null,
                'example.org',
            ],
            'fragment and query string in fetched content' => [
                'https://example.org/hello',
                null,
                'https://example.org/hello?foo#world',
                'https://example.org/hello?foo#world',
                'https://example.org/hello',
                'example.org',
            ],
            'different path and query string in fetch content' => [
                'https://example.org/hello',
                null,
                'https://example.org/world?foo',
                'https://example.org/world?foo',
                'https://example.org/hello',
                'example.org',
            ],
            'feedproxy ignore list test' => [
                'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
                null,
                'https://example.org/hello-wallabag',
                'https://example.org/hello-wallabag',
                null,
                'example.org',
            ],
            'feedproxy ignore list test with origin url already set' => [
                'http://feedproxy.google.com/~r/Wallabag/~3/helloworld',
                'https://example.org/this-is-source',
                'https://example.org/hello-wallabag',
                'https://example.org/hello-wallabag',
                'https://example.org/this-is-source',
                'example.org',
            ],
            'lemonde ignore pattern test' => [
                'http://www.lemonde.fr/tiny/url',
                null,
                'http://example.com/hello-world',
                'http://example.com/hello-world',
                null,
                'example.com',
            ],
        ];
    }

    /**
     * Checks the entry URL, origin URL and domain name stored when the
     * fetched content reports a URL different from the requested one.
     *
     * @dataProvider dataForChangedUrl
     *
     * @param string      $entry_url
     * @param string|null $origin_url
     * @param string      $content_url
     * @param string      $expected_entry_url
     * @param string|null $expected_origin_url
     * @param string      $expected_domain
     */
    public function testWithChangedUrl($entry_url, $origin_url, $content_url, $expected_entry_url, $expected_origin_url, $expected_domain)
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $proxy = new ContentProxy((new Graby()), $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage, true);
        $entry = new Entry(new User());
        $entry->setOriginUrl($origin_url);
        $proxy->updateEntry(
            $entry,
            $entry_url,
            [
                'html' => false,
                'title' => '',
                'url' => $content_url,
                'headers' => [
                    'content-type' => '',
                ],
                'language' => '',
            ],
            true
        );

        $this->assertSame($expected_entry_url, $entry->getUrl());
        $this->assertSame($expected_domain, $entry->getDomainName());
        $this->assertSame($expected_origin_url, $entry->getOriginUrl());
    }

    /**
     * Runs updateEntry() against a fetcher that only returns a title (no
     * HTML) and returns the resulting entry title as an uppercase hex string.
     *
     * @param string $rawTitle    Title bytes returned by the mocked fetcher
     * @param string $contentType Value of the mocked content-type header
     *
     * @return string
     */
    private function fetchTitleAsHex($rawTitle, $contentType)
    {
        $tagger = $this->getTaggerMock();
        $tagger->expects($this->once())
            ->method('tag');

        $graby = $this->getGrabyMock([
            'html' => false,
            'title' => $rawTitle,
            'url' => '',
            'headers' => [
                'content-type' => $contentType,
            ],
            'language' => '',
        ]);

        $proxy = new ContentProxy($graby, $tagger, $this->getValidator(), $this->getLogger(), $this->fetchingErrorMessage);
        $entry = new Entry(new User());
        $proxy->updateEntry($entry, 'http://0.0.0.0');

        return $this->strToHex($entry->getTitle());
    }

    /**
     * Convert a byte string to its uppercase hexadecimal representation
     * (two hex digits per byte).
     *
     * @param string $string
     *
     * @return string
     */
    private function strToHex($string)
    {
        return strtoupper(bin2hex($string));
    }

    /**
     * Convert a hexadecimal string (two hex digits per byte) to the byte
     * string it encodes.
     *
     * @param string $hex Even-length string of hexadecimal digits
     *
     * @return string
     */
    private function hexToStr($hex)
    {
        return hex2bin($hex);
    }

    /**
     * Build a RuleBasedTagger mock whose only mocked method is tag().
     */
    private function getTaggerMock()
    {
        return $this->getMockBuilder(RuleBasedTagger::class)
            ->setMethods(['tag'])
            ->disableOriginalConstructor()
            ->getMock();
    }

    /**
     * Build a Graby mock whose fetchContent() returns the given array on
     * every call.
     *
     * @param array $fetchedContent Value returned by fetchContent()
     */
    private function getGrabyMock(array $fetchedContent)
    {
        $graby = $this->getMockBuilder(Graby::class)
            ->setMethods(['fetchContent'])
            ->disableOriginalConstructor()
            ->getMock();

        $graby->expects($this->any())
            ->method('fetchContent')
            ->willReturn($fetchedContent);

        return $graby;
    }

    /**
     * Logger that discards every record.
     */
    private function getLogger()
    {
        return new NullLogger();
    }

    /**
     * Build a RecursiveValidator mock.
     *
     * @param bool $withDefaultMock When true, validate() is stubbed to return
     *                              an empty violation list; pass false to let
     *                              the test define its own expectations
     */
    private function getValidator($withDefaultMock = true)
    {
        $mock = $this->getMockBuilder(RecursiveValidator::class)
            ->setMethods(['validate'])
            ->disableOriginalConstructor()
            ->getMock();

        if ($withDefaultMock) {
            $mock->expects($this->any())
                ->method('validate')
                ->willReturn(new ConstraintViolationList());
        }

        return $mock;
    }
}