]> git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/libraries/MOBIClass/OnlineArticle.php
phpepub via composer
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / MOBIClass / OnlineArticle.php
1 <?php
2
/**
 * Content provider that builds its text, images and metadata from an article
 * fetched over HTTP.
 *
 * The page at the given URL is downloaded, reduced to its main content with
 * Readability, and every <img> in that content is downloaded and rewritten to
 * point at an image record through a "recindex" attribute.
 *
 * @author Sander
 */
class OnlineArticle extends ContentProvider {
    /** @var string Article HTML as serialised by DOMDocument::saveHTML() */
    private $text;
    /** @var array Image records keyed by record index */
    private $images;
    /** @var array Metadata hashtable; the constructor fills "title" and "author" */
    private $metadata = array();
    /** @var int Record index that will be given to the next downloaded image */
    private $imgCounter = 0;

    /**
     * Download the article at $url and prepare its text, images and metadata.
     *
     * @param string $url Article address; "http://" is prepended when the
     *                    value does not already start with http:// or https://
     * @throws RuntimeException when the extracted article cannot be loaded
     *                          into a DOMDocument
     */
    public function __construct($url) {
        if (!preg_match('!^https?://!i', $url)) {
            $url = 'http://'.$url;
        }

        $data = Http::Request($url);
        $html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
        $r = new Readability($html, $url);
        $r->init();

        $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));

        // The host name doubles as the author. parse_url() yields null when
        // the URL has no host and false when it is seriously malformed; in
        // both cases the author is stored as null rather than reading a
        // missing array offset.
        $host = parse_url($url, PHP_URL_HOST);
        $this->metadata["author"] = is_string($host) ? $host : null;

        // Wrap the extracted fragment in a full document that declares UTF-8
        // so DOMDocument does not have to guess the encoding.
        $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
        $article = $r->getContent()->innerHTML;
        if (substr($article, 0, 5) == "<body") {
            $article = $head.$article."</html>";
        } else {
            $article = $head."<body>".$article."</body></html>";
        }

        // Collect libxml parse warnings internally instead of muting every
        // error with "@", and restore the caller's setting afterwards.
        $doc = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($article);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) {
            throw new RuntimeException("Unable to parse the article HTML retrieved from ".$url);
        }
        $doc->normalizeDocument();

        $this->images = $this->handleImages($doc, $url);
        $this->text = $doc->saveHTML();
    }

    /**
     * Get the text data to be integrated in the MOBI file
     * @return string
     */
    public function getTextData(){
        return $this->text;
    }

    /**
     * Get the images. Each entry is a FileRecord wrapping the data returned
     * by ImageHandler::DownloadImage(); array entry 0 will correspond to
     * image record 0.
     * @return array
     */
    public function getImages(){
        return $this->images;
    }

    /**
     * Get the metadata in the form of a hashtable (for example, title or author).
     * @return array
     */
    public function getMetaData(){
        return $this->metadata;
    }

    /**
     * Download every image referenced by the document and rewrite the <img>
     * elements in place: "src" is emptied and, when the download succeeded,
     * "recindex" is set to the index of the matching image record.
     *
     * @param DOMDocument $dom Document whose <img> elements are rewritten
     * @param string $url Address of the article, used as the base for
     *                    resolving image sources that carry no host
     * @return array Image records keyed by record index
     */
    private function handleImages($dom, $url){
        $images = array();
        // Maps a resolved image URL to its record index so that an image used
        // several times is downloaded only once.
        $savedImages = array();

        foreach ($dom->getElementsByTagName('img') as $img) {
            $src = $img->getAttribute("src");
            $img->setAttribute("src", "");

            $parsed = parse_url($src);
            if ($parsed === false) {
                // parse_url() returns false for seriously malformed URLs;
                // such a source cannot be resolved, so nothing is downloaded.
                continue;
            }

            if (!isset($parsed["host"])) {
                // A leading "/" marks a path relative to the site root, which
                // replaces the article path; any other path is joined onto it.
                $flags = (substr($src, 0, 1) == "/") ? HTTP_URL_REPLACE : HTTP_URL_JOIN_PATH;
                $src = http_build_url($url, $parsed, $flags);
            }

            if (isset($savedImages[$src])) {
                $img->setAttribute("recindex", $savedImages[$src]);
                continue;
            }

            $image = ImageHandler::DownloadImage($src);
            if ($image !== false) {
                $images[$this->imgCounter] = new FileRecord(new Record($image));

                $img->setAttribute("recindex", $this->imgCounter);
                $savedImages[$src] = $this->imgCounter;
                $this->imgCounter++;
            }
        }

        return $images;
    }
}
116 ?>