diff options
author | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-08 09:36:10 -0700 |
---|---|---|
committer | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-08 09:36:10 -0700 |
commit | 9a8b4ff4edf84d7df60de1b6fd1e493b59f88273 (patch) | |
tree | 3c8ab8086fd8a2750270f8aeaee1f1ce016167cb /inc/poche/Url.class.php | |
parent | 85ebc80c7eaf88e4d57a52adb8e4c32d8cc34b64 (diff) | |
parent | 572e758bf2e76308a3fa3eda9a8d9e9be8b53ecc (diff) | |
download | wallabag-9a8b4ff4edf84d7df60de1b6fd1e493b59f88273.tar.gz wallabag-9a8b4ff4edf84d7df60de1b6fd1e493b59f88273.tar.zst wallabag-9a8b4ff4edf84d7df60de1b6fd1e493b59f88273.zip |
Merge pull request #109 from inthepoche/dev
merge dev into master
Diffstat (limited to 'inc/poche/Url.class.php')
-rw-r--r-- | inc/poche/Url.class.php | 94 |
1 files changed, 94 insertions, 0 deletions
diff --git a/inc/poche/Url.class.php b/inc/poche/Url.class.php new file mode 100644 index 00000000..f4a8f99e --- /dev/null +++ b/inc/poche/Url.class.php | |||
@@ -0,0 +1,94 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * poche, a read it later open source system | ||
4 | * | ||
5 | * @category poche | ||
6 | * @author Nicolas Lœuillet <support@inthepoche.com> | ||
7 | * @copyright 2013 | ||
8 | * @license http://www.wtfpl.net/ see COPYING file | ||
9 | */ | ||
10 | |||
/**
 * Represents the URL of a page to be saved in poche.
 *
 * The class validates the URL, strips common tracking parameters from it
 * and downloads the page in order to extract its readable content.
 */
class Url
{
    /**
     * The URL in plain text (the constructor base64-decodes its argument).
     *
     * @var string
     */
    public $url;

    /**
     * @param string $url base64-encoded URL
     */
    public function __construct($url)
    {
        $this->url = base64_decode($url);
    }

    /**
     * @return string the current (decoded) URL
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * @param string $url plain-text URL; stored as given, no decoding applied
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Checks that the URL looks like an absolute http(s) URL.
     *
     * Accepted shape: scheme, optional "user[:password]@", a host made of
     * dot-separated labels, optional numeric port, then an optional path,
     * query string or fragment.
     *
     * @return int|false 1 when the URL matches, 0 when it does not,
     *                   false if the regular expression engine failed
     */
    public function isCorrect()
    {
        // The dot between host labels must be escaped: a bare "." matches any
        // character, which let hosts containing spaces (and non-numeric
        // "ports") pass validation. Query strings and fragments directly
        // after the host are allowed explicitly, as is an underscore inside
        // a label, so URLs such as http://example.com?x=1 keep validating.
        $pattern = '~^https?://'
            . '(?:[^/?#@\s]+@)?'
            . '[a-z0-9_-]+(?:\.[a-z0-9_-]+)*'
            . '(?::[0-9]+)?'
            . '(?:[/?#].*)?$~i';

        return preg_match($pattern, $this->url);
    }

    /**
     * Trims the URL, decodes HTML entities and cuts it at the first tracking
     * marker found (utm_source parameter or "#xtor=RSS-" fragment).
     *
     * Everything from the marker to the end of the URL is dropped.
     *
     * @return void
     */
    public function clean()
    {
        $url = html_entity_decode(trim($this->url));

        // Order matters: each marker is searched in the already shortened URL.
        $markers = array('&utm_source=', '?utm_source=', '#xtor=RSS-');
        foreach ($markers as $marker) {
            $position = strpos($url, $marker);
            if ($position !== false) {
                $url = substr($url, 0, $position);
            }
        }

        $this->url = $url;
    }

    /**
     * Downloads the page and extracts its readable content.
     *
     * If an https:// URL yields no content, the same URL is tried once more
     * over plain http://; the stored URL is switched to the http:// variant
     * only when that second attempt returns content.
     *
     * @return array|false array with the keys 'title' and 'content' on
     *                     success, false when the URL is invalid, nothing
     *                     could be downloaded or no article could be extracted
     */
    public function fetchContent()
    {
        if (!$this->isCorrect()) {
            Tools::logm($this->getUrl() . ' is not a valid url');

            return false;
        }

        $this->clean();
        $html = $this->download($this->getUrl());

        // If Tools::getFile() is not able to retrieve HTTPS content, try the
        // same URL with the HTTP protocol.
        if (!self::hasContent($html) && stripos($this->getUrl(), 'https://') === 0) {
            $fallbackUrl = 'http://' . substr($this->getUrl(), strlen('https://'));
            $html = $this->download($fallbackUrl);
            if (self::hasContent($html)) {
                $this->setUrl($fallbackUrl);
            }
        }

        // Stop before tidy: given an empty string, tidy is expected to emit a
        // skeleton document (TODO confirm), which would hide the failed
        // download from the emptiness check below.
        if (!self::hasContent($html)) {
            return false;
        }

        if (function_exists('tidy_parse_string')) {
            $tidy = tidy_parse_string($html, array(), 'UTF8');
            if ($tidy !== false) {
                $tidy->cleanRepair();
                $html = $tidy->value;
            }
        }

        if (!self::hasContent($html)) {
            return false;
        }

        $readability = new Readability($html, $this->getUrl());
        $readability->convertLinksToFootnotes = CONVERT_LINKS_FOOTNOTES;
        $readability->revertForcedParagraphElements = REVERT_FORCED_PARAGRAPH_ELEMENTS;

        if (!$readability->init()) {
            return false;
        }

        return array(
            'title'   => $readability->articleTitle->innerHTML,
            'content' => $readability->articleContent->innerHTML,
        );
    }

    /**
     * Downloads $url with Tools::getFile() and passes the result through
     * Encoding::toUTF8().
     *
     * @param string $url
     *
     * @return mixed whatever Encoding::toUTF8() returns for the download
     */
    private function download($url)
    {
        return Encoding::toUTF8(Tools::getFile($url));
    }

    /**
     * Tells whether a download result is a non-empty string.
     *
     * @param mixed $html
     *
     * @return bool
     */
    private static function hasContent($html)
    {
        return is_string($html) && strlen($html) > 0;
    }
}