]>
Commit | Line | Data |
---|---|---|
eb1af592 NL |
1 | <?php |
2 | /** | |
3 | * poche, a read it later open source system | |
4 | * | |
5 | * @category poche | |
6 | * @author Nicolas Lœuillet <support@inthepoche.com> | |
7 | * @copyright 2013 | |
8 | * @license http://www.wtfpl.net/ see COPYING file | |
9 | */ | |
10 | ||
/**
 * Represents one article URL handed to poche.
 *
 * The object decodes the base64-encoded URL it is given, can validate it,
 * strip well-known tracking markers from it, and download the page to
 * extract its readable title and content.
 */
class Url
{
    /**
     * The URL currently held by this object (decoded in the constructor,
     * possibly rewritten later by clean(), setUrl() or fetchContent()).
     */
    public $url;

    /**
     * @param string $url base64-encoded URL; it is decoded before being stored
     */
    public function __construct($url)
    {
        $this->url = base64_decode($url);
    }

    /**
     * @return string the URL currently held by this object
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Replaces the stored URL as-is (no decoding, no validation).
     *
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Checks that the stored URL looks like "<scheme>://<host>[:<port>]<rest>".
     *
     * @return int|false raw preg_match() result: 1 when the URL matches,
     *                   0 when it does not, false on a PCRE error
     */
    public function isCorrect()
    {
        // Digits must be allowed in the host part, otherwise IP addresses and
        // hosts starting with a digit (e.g. http://127.0.0.1/) are rejected.
        $pattern = '|^(.*:)//([a-z0-9\-.]+)(:[0-9]+)?(.*)$|i';

        return preg_match($pattern, $this->url);
    }

    /**
     * Trims the stored URL, decodes HTML entities and cuts it at the first
     * occurrence of each known tracking marker.
     *
     * Note that everything after a marker is dropped, including any
     * unrelated parameters that follow it.
     */
    public function clean()
    {
        $url = html_entity_decode(trim($this->url));

        // Order matters: each marker is searched in the already-truncated URL.
        $markers = array('&utm_source=', '?utm_source=', '#xtor=RSS-');
        foreach ($markers as $marker) {
            $position = strpos($url, $marker);
            if ($position !== false) {
                $url = substr($url, 0, $position);
            }
        }

        $this->url = $url;
    }

    /**
     * Downloads the page behind the URL and extracts its readable content.
     *
     * The URL is validated and cleaned first. The downloaded HTML is
     * optionally repaired with tidy (when the extension is available) and
     * then handed to Readability.
     *
     * @return array|false array with the keys 'title' and 'content' on
     *                     success, false when the URL is invalid, nothing
     *                     could be downloaded or Readability found no article
     */
    public function fetchContent()
    {
        if (!$this->isCorrect()) {
            Tools::logm($this->getUrl() . ' is not a valid url');

            return false;
        }

        $this->clean();
        $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));

        // When nothing came back and the URL does not use http(s), retry once
        // with "http://" prepended to the URL.
        if (!preg_match('!^https?://!i', $this->getUrl()) && (!isset($html) || strlen($html) <= 0)) {
            $this->setUrl('http://' . $this->getUrl());
            $html = Encoding::toUTF8(Tools::getFile($this->getUrl()));
        }

        // Only run tidy when there is something to repair.
        if (function_exists('tidy_parse_string') && isset($html) && strlen($html) > 0) {
            $tidy = tidy_parse_string($html, array(), 'UTF8');

            // tidy_parse_string() returns false on failure.
            if ($tidy) {
                $tidy->cleanRepair();

                // Warning: tidy might fail, so ensure there is still a content.
                // body() returns null when no <body> node can be found.
                $body = $tidy->body();

                // hasChildren does not seem to work, so compare the returned
                // string (white spaces removed) against an empty body.
                if ($body !== null && preg_replace('/\s+/', '', $body->value) !== "<body></body>") {
                    $html = $tidy->value;
                }
            }
        }

        if (isset($html) and strlen($html) > 0) {
            $readability = new Readability($html, $this->getUrl());
            $readability->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
            $readability->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;

            if ($readability->init()) {
                $title = $readability->articleTitle->innerHTML;

                $parameters = array();
                $parameters['title'] = ($title != '' ? $title : _('Untitled'));
                $parameters['content'] = $readability->articleContent->innerHTML;

                return $parameters;
            }
        }

        return false;
    }
}