aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/libraries/MOBIClass/OnlineArticle.php
blob: ec3182fed5f660d500778047676f9f3118f4a145 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
<?php

/**
 * Description of OnlineArticle
 *
 * @author Sander
 */
class OnlineArticle extends ContentProvider {
	/** @var string HTML of the extracted article, as serialised by DOMDocument::saveHTML() */
	private $text;
	/** @var array image records, keyed by the record index referenced from the text */
	private $images;
	/** @var array metadata hashtable; this class fills in "title" and "author" */
	private $metadata = array();
	/** @var int index that will be given to the next successfully downloaded image */
	private $imgCounter = 0;

	/**
	 * Download the page at $url, extract the readable article from it and
	 * prepare its text, images and metadata.
	 *
	 * @param string $url address of the article; "http://" is prepended when
	 *                    it does not already start with http:// or https://
	 * @throws RuntimeException when the extracted article cannot be loaded
	 *                          into a DOMDocument
	 */
	public function  __construct($url) {
		if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;

		$data = Http::Request($url);
		$html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");
		$r = new Readability($html, $url);
		$r->init();
		if(!isset($this->metadata["title"])){
			$this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
		}
		if(!isset($this->metadata["author"])){
			// parse_url() returns false on seriously malformed URLs and may
			// omit "host"; fall back to the URL itself instead of reading an
			// undefined index.
			$parts = parse_url($url);
			if(is_array($parts) && isset($parts["host"])){
				$this->metadata["author"] = $parts["host"];
			}else{
				$this->metadata["author"] = $url;
			}
		}

		// Wrap the extracted fragment in a full document that declares UTF-8,
		// so DOMDocument does not guess the encoding.
		$article = $r->getContent()->innerHTML;
		if(substr($article, 0, 5) == "<body"){
			$article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>".$article."</html>";
		}else{
			$article = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head><body>".$article."</body></html>";
		}

		$doc = new DOMDocument();
		// Collect libxml parse warnings internally instead of silencing every
		// error with "@", and restore the caller's setting afterwards.
		$previous = libxml_use_internal_errors(true);
		$loaded = $doc->loadHTML($article);
		libxml_clear_errors();
		libxml_use_internal_errors($previous);
		if(!$loaded){
			// Raise an error the caller can handle rather than terminating the
			// script and echoing the raw remote HTML.
			throw new RuntimeException("Unable to parse the article HTML of ".$url);
		}
		$doc->normalizeDocument();

		$this->images = $this->handleImages($doc, $url);
		$this->text = $doc->saveHTML();
	}

	/**
	 * Get the text data to be integrated in the MOBI file
	 * @return string
	 */
	public function getTextData(){
		return $this->text;
	}
	/**
	 * Get the images (an array containing the jpeg data). Array entry 0 will
	 * correspond to image record 0.
	 * @return array
	 */
	public function getImages(){
		return $this->images;
	}
	/**
	 * Get the metadata in the form of a hashtable (for example, title or author).
	 * @return array
	 */
	public function getMetaData(){
		return $this->metadata;
	}
	/**
	 * Download every image referenced by the document and rewrite the <img>
	 * elements so they point at image records instead of URLs.
	 *
	 * Each <img> gets its "src" emptied. When the image could be downloaded
	 * (or was already downloaded under the same resolved URL) a "recindex"
	 * attribute holding the record index is added. Images without a usable
	 * source, or whose download fails, get no "recindex".
	 *
	 * @param DOMDocument $dom document whose <img> elements are rewritten in place
	 * @param string $url address of the page, used to resolve relative sources
	 * @return array downloaded images as FileRecord objects, keyed by record index
	 */
	private function handleImages($dom, $url){
		$images = array();
		// Resolved URL => record index, so a repeated image is downloaded once.
		$savedImages = array();

		$base = parse_url($url);
		$scheme = (is_array($base) && isset($base["scheme"])) ? $base["scheme"] : "http";

		$imgElements = $dom->getElementsByTagName('img');
		foreach($imgElements as $img) {
			$src = trim($img->getAttribute("src"));
			$img->setAttribute("src", "");

			if($src === ""){
				// An empty source would resolve to the page URL itself;
				// there is nothing to download.
				continue;
			}

			if(substr($src, 0, 2) == "//"){
				// Protocol-relative source: it already names a host, so it
				// only needs the page's scheme to become downloadable.
				$src = $scheme.":".$src;
			}

			$parsed = parse_url($src);
			if($parsed === false){
				// Malformed source that cannot be resolved against the page.
				continue;
			}

			if(!isset($parsed["host"])){
				// Root-relative sources replace the page's path, other
				// relative sources are joined onto it.
				// NOTE(review): with HTTP_URL_REPLACE the page URL's query
				// presumably survives when the source has none - confirm.
				if(substr($src, 0, 1) == "/"){
					$src = http_build_url($url, $parsed, HTTP_URL_REPLACE);
				}else{
					$src = http_build_url($url, $parsed, HTTP_URL_JOIN_PATH);
				}
			}

			if(isset($savedImages[$src])){
				$img->setAttribute("recindex", $savedImages[$src]);
			}else{
				$image = ImageHandler::DownloadImage($src);

				if($image !== false){
					$images[$this->imgCounter] = new FileRecord(new Record($image));

					$img->setAttribute("recindex", $this->imgCounter);
					$savedImages[$src] = $this->imgCounter;
					$this->imgCounter++;
				}
			}
		}

		return $images;
	}
}
?>