]>
Commit | Line | Data |
---|---|---|
eb1af592 NL |
1 | <?php |
2 | /** | |
3 | * poche, a read it later open source system | |
4 | * | |
5 | * @category poche | |
6 | * @author Nicolas Lœuillet <support@inthepoche.com> | |
7 | * @copyright 2013 | |
8 | * @license http://www.wtfpl.net/ see COPYING file | |
9 | */ | |
10 | ||
/**
 * Holds a URL received in base64 form and turns it into readable article
 * content (title + HTML body) via Tools, Encoding, tidy and Readability.
 */
class Url
{
    /**
     * Current URL as a plain string (result of base64_decode() on the
     * constructor argument, possibly rewritten by clean()/setUrl()).
     */
    public $url;

    /**
     * @param string $url base64-encoded URL
     */
    public function __construct($url)
    {
        $this->url = base64_decode($url);
    }

    /**
     * @return string the URL currently held by this object
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Replaces the held URL as-is (no decoding, no validation).
     *
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * @return bool TRUE when the held URL passes FILTER_VALIDATE_URL
     */
    public function isCorrect()
    {
        return filter_var($this->url, FILTER_VALIDATE_URL) !== false;
    }

    /**
     * Normalises the held URL: trims it, decodes HTML entities, then cuts
     * the string at the first occurrence of each known tracking marker.
     * Markers are applied in order on the already-shortened URL, so
     * everything following a marker is discarded.
     */
    public function clean()
    {
        $url = html_entity_decode(trim($this->url));

        foreach (array('&utm_source=', '?utm_source=', '#xtor=RSS-') as $marker) {
            $position = strpos($url, $marker);
            if ($position !== false) {
                $url = substr($url, 0, $position);
            }
        }

        $this->url = $url;
    }

    /**
     * Downloads the page behind the URL and extracts its readable content.
     *
     * The URL is validated first, then cleaned (see clean()), downloaded
     * with Tools::getFile() and converted with Encoding::toUTF8(). When the
     * tidy extension is available the markup is repaired before being
     * handed to Readability.
     *
     * @return array|bool array with the keys 'title' and 'content' on
     *                    success, FALSE when the URL is invalid, nothing
     *                    could be downloaded or Readability::init() fails
     */
    public function fetchContent()
    {
        if (!$this->isCorrect()) {
            Tools::logm($this->getUrl() . ' is not a valid url');

            return false;
        }

        $this->clean();
        $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));

        // Retry with an explicit http:// prefix when the URL carries no
        // http(s) scheme and the first download produced nothing.
        // NOTE(review): the original comment described this as an
        // HTTPS -> HTTP fallback, but the condition only fires for URLs that
        // do NOT start with http:// or https:// - confirm the intent.
        if (!preg_match('!^https?://!i', $this->getUrl()) && !$this->hasContent($html)) {
            $this->setUrl('http://' . $this->getUrl());
            $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));
        }

        // Nothing to repair when the download failed; skip tidy entirely.
        if ($this->hasContent($html)) {
            $html = $this->tidyHtml($html);
        }

        if (!$this->hasContent($html)) {
            return false;
        }

        $readability = new Readability($html, $this->getUrl());
        $readability->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
        $readability->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;

        if (!$readability->init()) {
            return false;
        }

        $title = $readability->articleTitle->innerHTML;

        $parameters = array();
        $parameters['title'] = ($title != '' ? $title : _('Untitled'));
        $parameters['content'] = $readability->articleContent->innerHTML;

        return $parameters;
    }

    /**
     * Tells whether a download result is a non-empty string.
     *
     * @param mixed $html value produced by Encoding::toUTF8()
     *
     * @return bool
     */
    private function hasContent($html)
    {
        return is_string($html) && strlen($html) > 0;
    }

    /**
     * Repairs the markup with the tidy extension when it is available.
     *
     * The input is returned untouched when tidy is missing, when parsing
     * fails, when no body node can be obtained, or when the repaired
     * document has an empty body - so a tidy failure never discards the
     * downloaded content.
     *
     * @param string $html non-empty HTML document
     *
     * @return string repaired HTML, or the input when tidy gave nothing usable
     */
    private function tidyHtml($html)
    {
        if (!function_exists('tidy_parse_string')) {
            return $html;
        }

        $tidy = tidy_parse_string($html, array(), 'UTF8');
        if (!is_object($tidy)) {
            // Parsing failed; calling cleanRepair() on a non-object is fatal.
            return $html;
        }
        $tidy->cleanRepair();

        // Tidy might fail, so make sure there is still some content.
        $body = $tidy->body();
        if (!is_object($body)) {
            return $html;
        }

        // hasChildren() does not seem to work, so compare the serialised
        // body (with all white space removed) against an empty body element.
        if (preg_replace('/\s+/', '', $body->value) !== "<body></body>") {
            return $tidy->value;
        }

        return $html;
    }
}