diff options
author | Thomas Citharel <tcit@tcit.fr> | 2014-12-19 21:08:44 +0100 |
---|---|---|
committer | Thomas Citharel <tcit@tcit.fr> | 2014-12-19 21:08:44 +0100 |
commit | 3dcd85c075998ecdf2d54c5661c42e24080067d5 (patch) | |
tree | b939cdbf86c1be64ef64d8afb3efd0d55e8a945e /inc/3rdparty/libraries/MOBIClass/OnlineArticle.php | |
parent | 44b95cb81deae35f58e0058910afde2f2ffb9a60 (diff) | |
parent | e9a64ef8a9776becbe4c621ee4bd762f24b1bc3e (diff) | |
download | wallabag-3dcd85c075998ecdf2d54c5661c42e24080067d5.tar.gz wallabag-3dcd85c075998ecdf2d54c5661c42e24080067d5.tar.zst wallabag-3dcd85c075998ecdf2d54c5661c42e24080067d5.zip |
merge
Diffstat (limited to 'inc/3rdparty/libraries/MOBIClass/OnlineArticle.php')
-rw-r--r-- | inc/3rdparty/libraries/MOBIClass/OnlineArticle.php | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/MOBIClass/OnlineArticle.php b/inc/3rdparty/libraries/MOBIClass/OnlineArticle.php new file mode 100644 index 00000000..ec3182fe --- /dev/null +++ b/inc/3rdparty/libraries/MOBIClass/OnlineArticle.php | |||
@@ -0,0 +1,116 @@ | |||
1 | <?php | ||
2 | |||
/**
 * Content provider that turns a web page into content for a MOBI file.
 *
 * The page is downloaded, reduced to its article with Readability and parsed
 * into a DOM document. Every <img> element then has its "src" cleared and,
 * when its image could be downloaded, receives a "recindex" attribute holding
 * the index of the matching entry in the image list.
 *
 * @author Sander
 */
class OnlineArticle extends ContentProvider {
	/** @var string Article HTML as serialised by DOMDocument::saveHTML() */
	private $text;
	/** @var array Downloaded images (FileRecord objects), keyed by record index */
	private $images;
	/** @var array Metadata hashtable; the constructor fills in "title" and "author" */
	private $metadata = array();
	/** @var int Record index that the next downloaded image will receive */
	private $imgCounter = 0;

	/**
	 * Download the page at $url and prepare its text, images and metadata.
	 *
	 * @param string $url Address of the article; "http://" is prepended when
	 *                    it does not already start with http:// or https://
	 * @throws RuntimeException When the extracted article cannot be parsed as HTML
	 */
	public function __construct($url) {
		if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;

		$data = Http::Request($url);
		$html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
		$r = new Readability($html, $url);
		$r->init();

		$this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));

		// The host name is used as the author. Fall back to the URL itself
		// when parse_url() cannot isolate a host.
		$parts = parse_url($url);
		$this->metadata["author"] = isset($parts["host"]) ? $parts["host"] : $url;

		// Wrap the extracted fragment in a full document that declares UTF-8,
		// adding a <body> only when the fragment does not start with one.
		$article = $r->getContent()->innerHTML;
		$head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
		if(substr($article, 0, 5) == "<body"){
			$article = $head.$article."</html>";
		}else{
			$article = $head."<body>".$article."</body></html>";
		}

		// Collect libxml parse messages internally instead of emitting PHP
		// warnings, and restore the previous setting afterwards.
		$doc = new DOMDocument();
		$previous = libxml_use_internal_errors(true);
		$loaded = $doc->loadHTML($article);
		libxml_clear_errors();
		libxml_use_internal_errors($previous);
		if(!$loaded){
			// Throw rather than die(): the caller decides how to report it.
			throw new RuntimeException("Unable to parse the article HTML retrieved from ".$url);
		}
		$doc->normalizeDocument();

		$this->images = $this->handleImages($doc, $url);
		$this->text = $doc->saveHTML();
	}

	/**
	 * Get the text data to be integrated in the MOBI file
	 * @return string
	 */
	public function getTextData(){
		return $this->text;
	}

	/**
	 * Get the images (an array of FileRecord objects wrapping the downloaded
	 * image data). Array entry 0 will correspond to image record 0.
	 * @return array
	 */
	public function getImages(){
		return $this->images;
	}

	/**
	 * Get the metadata in the form of a hashtable (for example, title or author).
	 * @return array
	 */
	public function getMetaData(){
		return $this->metadata;
	}

	/**
	 * Download the images referenced by the document and rewrite its <img>
	 * elements to point at them by record index.
	 *
	 * Every <img> gets an empty "src". An image that was downloaded (or was
	 * already downloaded for an identical URL) also gets a "recindex"
	 * attribute. An image that is skipped or fails to download gets none.
	 *
	 * @param DOMDocument $dom Document to rewrite in place
	 * @param string $url Address of the article, used to resolve relative image URLs
	 * @return array FileRecord objects keyed by record index
	 */
	private function handleImages($dom, $url){
		$images = array();
		// Maps an absolute image URL to the record index it was stored under,
		// so that a repeated image is downloaded only once.
		$savedImages = array();

		foreach($dom->getElementsByTagName('img') as $img) {
			$src = $this->resolveImageUrl($img->getAttribute("src"), $url);

			// The image is referenced through "recindex" (set below), so the
			// original URL is cleared from the markup.
			$img->setAttribute("src", "");
			if($src === null){
				continue;
			}

			if(isset($savedImages[$src])){
				$img->setAttribute("recindex", $savedImages[$src]);
				continue;
			}

			$image = ImageHandler::DownloadImage($src);
			if($image === false){
				// A false result is treated as a failed download.
				continue;
			}

			$images[$this->imgCounter] = new FileRecord(new Record($image));
			$img->setAttribute("recindex", $this->imgCounter);
			$savedImages[$src] = $this->imgCounter;
			$this->imgCounter++;
		}

		return $images;
	}

	/**
	 * Turn the "src" of an <img> into an absolute URL that can be downloaded.
	 *
	 * @param string $src Raw value of the "src" attribute
	 * @param string $url Address of the article the image belongs to
	 * @return string|null Absolute URL, or null when the image must be skipped
	 *                     (empty or malformed src, or a scheme other than http/https)
	 */
	private function resolveImageUrl($src, $url){
		$src = trim($src);
		if($src === ""){
			// Resolving an empty src would yield the article URL itself.
			return null;
		}

		// Protocol-relative URL ("//host/path"): reuse the article's scheme so
		// that the downloader receives a complete URL.
		if(substr($src, 0, 2) === "//"){
			$base = parse_url($url);
			$scheme = isset($base["scheme"]) ? $base["scheme"] : "http";
			$src = $scheme.":".$src;
		}

		$parsed = parse_url($src);
		if($parsed === false){
			return null;
		}

		// The src comes from a remote page: only fetch over http(s), never
		// data:, file: or other schemes.
		if(isset($parsed["scheme"]) && !in_array(strtolower($parsed["scheme"]), array("http", "https"), true)){
			return null;
		}

		if(isset($parsed["host"])){
			return $src;
		}

		// Relative URL: a leading "/" replaces the article's path, anything
		// else is joined onto it.
		// NOTE(review): http_build_url() and the HTTP_URL_* flags are not
		// defined in this file - presumably pecl_http or a bundled
		// replacement provides them; confirm.
		$flags = (substr($src, 0, 1) === "/") ? HTTP_URL_REPLACE : HTTP_URL_JOIN_PATH;
		return http_build_url($url, $parsed, $flags);
	}
}
116 | ?> | ||