aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/libraries/send2kindle/readability.php
diff options
context:
space:
mode:
Diffstat (limited to 'inc/3rdparty/libraries/send2kindle/readability.php')
-rw-r--r--inc/3rdparty/libraries/send2kindle/readability.php249
1 files changed, 249 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/send2kindle/readability.php b/inc/3rdparty/libraries/send2kindle/readability.php
new file mode 100644
index 00000000..18c2dbde
--- /dev/null
+++ b/inc/3rdparty/libraries/send2kindle/readability.php
@@ -0,0 +1,249 @@
1<?php
2/**
3 * PHP Readability
4 *
5 * Readability PHP 版本,详见
6 * http://code.google.com/p/arc90labs-readability/
7 *
8 * ChangeLog:
9 *
10 * [+] 2011-02-17 初始化版本
11 *
12 * @author mingcheng<i.feelinglucky#gmail.com>
13 * @date 2011-02-17
14 * @link http://www.gracecode.com/
15 */
16
// Version number of this PHP Readability port.
define('READABILITY_VERSION', 0.12);
18
/**
 * Extracts the main readable content of an HTML page.
 *
 * The markup is loaded into a DOMDocument, every parent of a <p> element is
 * scored, and the best-scoring parent is returned (with junk tags and
 * attributes stripped) together with the page title.
 */
class Readability2 {
    // Name of the attribute used to store a node's computed content score.
    const ATTR_CONTENT_SCORE = "contentScore";

    // Charset announced to the DOM parser; the markup handed to it is
    // entity-encoded so it is safe to declare UTF-8.
    const DOM_DEFAULT_CHARSET = "utf-8";

    // Text of the placeholder element returned when no content box is found.
    const MESSAGE_CAN_NOT_GET = "Sorry, readability was unable to parse this page for content. \n
        If you feel like it should have been able to,
        please let me know by mail: lucky[at]gracecode.com";

    // Pattern marking a class name or id as unlikely to hold the article.
    const PATTERN_NEGATIVE = "/(comment|meta|footer|footnote)/i";

    // DOM document built from the source markup.
    protected $DOM = null;

    // Raw source exactly as passed to the constructor.
    protected $source = "";

    // Parent elements of the paragraphs visited during scoring.
    private $parentNodes = array();

    // Cached result of getTopBox(), so repeated getContent() calls do not
    // re-score nodes that already carry a score attribute.
    private $topBox = null;

    // Tags removed from the extracted content.
    private $junkTags = array("style", "form", "iframe", "script", "button", "input", "textarea");

    // Attributes removed from every element of the extracted content.
    private $junkAttrs = array("style", "class", "onclick", "onmouseover", "align", "border", "margin");


    /**
     * Parses the given HTML source into the internal DOM document.
     *
     * @param string $source     HTML markup to analyse
     * @param string $input_char Encoding of $source; defaults to utf-8
     */
    public function __construct($source, $input_char = "utf-8") {
        $this->source = $source;

        // Turn every non-ASCII character into a numeric entity so the parser
        // receives plain ASCII. This replaces the 'HTML-ENTITIES' pseudo
        // encoding of mb_convert_encoding(), deprecated since PHP 8.2.
        $source = mb_convert_encoding($source, 'UTF-8', $input_char);
        $source = mb_encode_numericentity($source, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        // Pre-process the markup (strip charset declarations, fonts, ...).
        $source = $this->preparSource($source);

        $this->DOM = new DOMDocument('1.0', $input_char);

        // Real-world HTML produces many parser warnings; buffer them instead
        // of printing them, and restore the caller's setting afterwards.
        $previousSetting = libxml_use_internal_errors(true);
        try {
            // The leading processing instruction forces the parser's charset.
            $loaded = $this->DOM->loadHTML(
                '<?xml encoding="' . self::DOM_DEFAULT_CHARSET . '">' . $source
            );

            if ($loaded) {
                // Remove the processing-instruction hack again. Iterate over
                // a snapshot because childNodes is a live list.
                foreach (iterator_to_array($this->DOM->childNodes) as $item) {
                    if ($item->nodeType == XML_PI_NODE) {
                        $this->DOM->removeChild($item);
                    }
                }

                $this->DOM->encoding = self::DOM_DEFAULT_CHARSET;
            }
        } catch (Exception $e) {
            // Deliberately best-effort: on failure the document stays empty
            // and getContent() yields the MESSAGE_CAN_NOT_GET placeholder.
        }
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }


    /**
     * Pre-processes the HTML so the DOM parser can handle it reliably.
     *
     * @param string $string HTML markup
     * @return string the cleaned-up markup
     */
    private function preparSource($string) {
        // Drop the first charset declaration; the charset is forced through
        // the processing instruction added in the constructor.
        $string = preg_replace("/charset=([\w\-]+);?/", "", $string, 1);

        // Replace doubled-up <br> tags (including the "<br />" spelling)
        // with a paragraph break, and remove <font> tags.
        $string = preg_replace("/<br\s*\/?>\s*<br\s*\/?>/i", "</p><p>", $string);
        $string = preg_replace("/<\/?font[^>]*>/i", "", $string);

        return trim($string);
    }


    /**
     * Removes every $TagName element below $RootNode.
     *
     * @param DOMDocument|DOMElement $RootNode node to clean
     * @param string                 $TagName  tag name to remove
     * @return DOMDocument|DOMElement the same $RootNode
     */
    private function removeJunkTag($RootNode, $TagName) {
        // getElementsByTagName() returns a live list: removing a node shifts
        // the remaining ones, so an index-based walk would skip every second
        // match. Work on a snapshot instead.
        $matches = iterator_to_array($RootNode->getElementsByTagName($TagName));

        foreach ($matches as $match) {
            if ($match->parentNode) {
                $match->parentNode->removeChild($match);
            }
        }

        return $RootNode;
    }

    /**
     * Removes the attribute $Attr from every element below $RootNode.
     *
     * @param DOMDocument|DOMElement $RootNode node to clean
     * @param string                 $Attr     attribute name to remove
     * @return DOMDocument|DOMElement the same $RootNode
     */
    private function removeJunkAttr($RootNode, $Attr) {
        foreach ($RootNode->getElementsByTagName("*") as $element) {
            $element->removeAttribute($Attr);
        }

        return $RootNode;
    }

    /**
     * Scores a class name or id: -50 when it looks like page furniture,
     * +25 when it matches $positivePattern, 0 otherwise.
     *
     * @param string $label           class attribute or id value
     * @param string $positivePattern regular expression for "content" names
     * @return int score adjustment
     */
    private function labelBonus($label, $positivePattern) {
        if (preg_match(self::PATTERN_NEGATIVE, $label)) {
            return -50;
        }
        if (preg_match($positivePattern, $label)) {
            return 25;
        }
        return 0;
    }

    /**
     * Finds the element that most likely holds the main page content.
     * The scoring approach comes from
     * http://code.google.com/p/arc90labs-readability/
     *
     * @return DOMNode best-scoring paragraph parent, or a placeholder <div>
     *                 containing MESSAGE_CAN_NOT_GET when nothing scores > 0
     */
    private function getTopBox() {
        // Scores are stored as attributes on the DOM, so scoring a second
        // time would add to the existing values; reuse the first result.
        if ($this->topBox !== null) {
            return $this->topBox;
        }

        $this->parentNodes = array();

        // Study all the paragraphs and find the chunk that has the best score.
        foreach ($this->DOM->getElementsByTagName("p") as $paragraph) {
            $parentNode = $paragraph->parentNode;
            if (!($parentNode instanceof DOMElement)) {
                continue; // nothing to read attributes from or score
            }

            $contentScore = intval($parentNode->getAttribute(self::ATTR_CONTENT_SCORE));

            // Look for a special class name.
            $contentScore += $this->labelBonus(
                $parentNode->getAttribute("class"),
                "/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i"
            );

            // Look for a special ID.
            $contentScore += $this->labelBonus(
                $parentNode->getAttribute("id"),
                "/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i"
            );

            // Paragraphs longer than 10 bytes contribute their text length
            // (in bytes) to the parent's score.
            $textLength = strlen($paragraph->nodeValue);
            if ($textLength > 10) {
                $contentScore += $textLength;
            }

            // Store the running score on the parent element.
            $parentNode->setAttribute(self::ATTR_CONTENT_SCORE, $contentScore);

            // Remember the parent as a candidate for the selection below.
            $this->parentNodes[] = $parentNode;
        }

        // Start from the placeholder (score 0) and keep the first candidate
        // with the highest strictly positive score.
        $best      = $this->DOM->createElement('div', self::MESSAGE_CAN_NOT_GET);
        $bestScore = 0;
        foreach ($this->parentNodes as $candidate) {
            $score = intval($candidate->getAttribute(self::ATTR_CONTENT_SCORE));
            if ($score > $bestScore) {
                $best      = $candidate;
                $bestScore = $score;
            }
        }

        $this->topBox = $best;
        return $best;
    }


    /**
     * Returns the page's first <title> element.
     *
     * @return DOMNode|null the title node (read its text via ->nodeValue),
     *                      or null when the page has no <title>
     */
    public function getTitle() {
        $titles = $this->DOM->getElementsByTagName("title");
        return $titles->item(0);
    }


    /**
     * Returns the page title and the cleaned-up main content.
     *
     * @return array|false array with the keys 'title' (string) and 'content'
     *                     (HTML string), or false when no DOM is available
     */
    public function getContent() {
        if (!$this->DOM) return false;

        $titleNode  = $this->getTitle();
        $contentBox = $this->getTopBox();

        // Copy the content into a fresh document so the clean-up below does
        // not modify the parsed page.
        $target = new DOMDocument;
        $target->appendChild($target->importNode($contentBox, true));

        // Strip unwanted tags.
        foreach ($this->junkTags as $tag) {
            $target = $this->removeJunkTag($target, $tag);
        }

        // Strip unwanted attributes.
        foreach ($this->junkAttrs as $attr) {
            $target = $this->removeJunkAttr($target, $attr);
        }

        return array(
            'title'   => $titleNode ? $titleNode->nodeValue : "",
            'content' => $target->saveHTML()
        );
    }

    function __destruct() { }
}