]>
Commit | Line | Data |
---|---|---|
4188f38a | 1 | <?php |
2 | ||
/**
 * Content provider that turns an article found on the web into the text,
 * images and metadata needed for a MOBI file.
 *
 * The page is downloaded, reduced to its main article with Readability, and
 * every <img> element is rewritten to point at a downloaded image record.
 *
 * @author Sander
 */
class OnlineArticle extends ContentProvider {
    /** @var string HTML of the extracted article, as produced by DOMDocument::saveHTML() */
    private $text;
    /** @var array Image records, keyed by the record index used in the "recindex" attribute */
    private $images;
    /** @var array Metadata hashtable ("title" and "author" are filled in by the constructor) */
    private $metadata = array();
    /** @var int Index that will be given to the next downloaded image */
    private $imgCounter = 0;

    /**
     * Download the page at $url and extract the article from it.
     *
     * @param string $url Address of the article; "http://" is prepended when
     *                    no http(s) scheme is present.
     * @throws RuntimeException When the extracted article cannot be parsed as HTML.
     */
    public function __construct($url) {
        if (!preg_match('!^https?://!i', $url)) {
            $url = 'http://'.$url;
        }

        $data = Http::Request($url);
        $html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

        $r = new Readability($html, $url);
        $r->init();

        if (!isset($this->metadata["title"])) {
            $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
        }
        if (!isset($this->metadata["author"])) {
            // The host name of the site stands in for the author. parse_url()
            // may fail or return no host, so fall back to the URL itself.
            $parts = parse_url($url);
            $this->metadata["author"] = (is_array($parts) && isset($parts["host"])) ? $parts["host"] : $url;
        }

        // Wrap the article in a full document that declares UTF-8, so that
        // DOMDocument does not have to guess the encoding.
        $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
        $article = $r->getContent()->innerHTML;
        if (substr($article, 0, 5) === "<body") {
            $article = $head.$article."</html>";
        } else {
            $article = $head."<body>".$article."</body></html>";
        }

        // Collect libxml parse warnings internally instead of silencing them
        // with "@", and restore the previous setting afterwards.
        $doc = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($article);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) {
            throw new RuntimeException('Could not parse the article extracted from '.$url);
        }
        $doc->normalizeDocument();

        $this->images = $this->handleImages($doc, $url);
        $this->text = $doc->saveHTML();
    }

    /**
     * Get the text data to be integrated in the MOBI file
     * @return string
     */
    public function getTextData() {
        return $this->text;
    }

    /**
     * Get the images. Array entry 0 will correspond to image record 0.
     * @return array
     */
    public function getImages() {
        return $this->images;
    }

    /**
     * Get the metadata in the form of a hashtable (for example, title or author).
     * @return array
     */
    public function getMetaData() {
        return $this->metadata;
    }

    /**
     * Download every image referenced by the document and rewrite the <img>
     * elements to refer to the downloaded records.
     *
     * Each element gets its "src" cleared. When the image could be downloaded
     * (or was already downloaded for an earlier element with the same resolved
     * address) a "recindex" attribute with the record index is set.
     *
     * @param DOMDocument $dom Document whose <img> elements are rewritten in place
     * @param string      $url Address of the page, used to resolve relative sources
     * @return array Downloaded image records, keyed by record index
     */
    private function handleImages($dom, $url) {
        $images = array();
        $savedImages = array();

        $page = parse_url($url);
        $pageScheme = (is_array($page) && isset($page["scheme"])) ? $page["scheme"] : "http";

        foreach ($dom->getElementsByTagName('img') as $img) {
            $src = trim($img->getAttribute("src"));
            $img->setAttribute("src", "");

            // Nothing to download for an empty source; resolving it against
            // the page would only fetch the page itself.
            if ($src === "") {
                continue;
            }

            $parsed = parse_url($src);
            if ($parsed === false) {
                continue;
            }

            if (!isset($parsed["host"])) {
                // Relative source: a leading "/" replaces the page's path,
                // anything else is joined onto it.
                $flags = (substr($src, 0, 1) === "/") ? HTTP_URL_REPLACE : HTTP_URL_JOIN_PATH;
                $src = http_build_url($url, $parsed, $flags);
            } else if (!isset($parsed["scheme"]) && substr($src, 0, 2) === "//") {
                // Protocol-relative source ("//host/path"): reuse the page's scheme.
                $src = $pageScheme.":".$src;
            }

            if (isset($savedImages[$src])) {
                $img->setAttribute("recindex", $savedImages[$src]);
                continue;
            }

            $image = ImageHandler::DownloadImage($src);
            if ($image === false) {
                continue;
            }

            $images[$this->imgCounter] = new FileRecord(new Record($image));
            $img->setAttribute("recindex", $this->imgCounter);
            $savedImages[$src] = $this->imgCounter;
            $this->imgCounter++;
        }

        return $images;
    }
}
116 | ?> |