]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - readability.php
18c2dbde9dc71189da7dd371a7a6d374b70377b4
5 * Readability PHP 版本,详见
6 * http://code.google.com/p/arc90labs-readability/
10 * [+] 2011-02-17 初始化版本
12 * @author mingcheng<i.feelinglucky#gmail.com>
14 * @link http://www.gracecode.com/
17 define("READABILITY_VERSION", 0.12);
21 const ATTR_CONTENT_SCORE
= "contentScore";
23 // DOM 解析类目前只支持 UTF-8 编码
24 const DOM_DEFAULT_CHARSET
= "utf-8";
27 const MESSAGE_CAN_NOT_GET
= "Sorry, readability was unable to parse this page for content. \n
28 If you feel like it should have been able to,
29 please let me know by mail: lucky[at]gracecode.com";
32 protected $DOM = null;
35 protected $source = "";
38 private $parentNodes = array();
41 private $junkTags = Array("style", "form", "iframe", "script", "button", "input", "textarea");
44 private $junkAttrs = Array("style", "class", "onclick", "onmouseover", "align", "border", "margin");
49 * @param $input_char 字符串的编码。默认 utf-8,可以省略
51 function __construct($source, $input_char = "utf-8") {
52 $this->source
= $source;
54 // DOM 解析类只能处理 UTF-8 格式的字符
55 $source = mb_convert_encoding($source, 'HTML-ENTITIES', $input_char);
57 // 预处理 HTML 标签,剔除冗余的标签等
58 $source = $this->preparSource($source);
61 $this->DOM
= new DOMDocument('1.0', $input_char);
63 //libxml_use_internal_errors(true);
65 if (!@$this->DOM
->loadHTML('<?xml encoding="'.Readability2
::DOM_DEFAULT_CHARSET
.'">'.$source)) {
66 throw new Exception("Parse HTML Error!");
69 foreach ($this->DOM
->childNodes
as $item) {
70 if ($item->nodeType
== XML_PI_NODE
) {
71 $this->DOM
->removeChild($item); // remove hack
76 $this->DOM
->encoding
= Readability2
::DOM_DEFAULT_CHARSET
;
77 } catch (Exception
$e) {
84 * 预处理 HTML 标签,使其能够准确被 DOM 解析类处理
88 private function preparSource($string) {
89 // 剔除多余的 HTML 编码标记,避免解析出错
90 preg_match("/charset=([\w|\-]+);?/", $string, $match);
91 if (isset($match[1])) {
92 $string = preg_replace("/charset=([\w|\-]+);?/", "", $string, 1);
95 // Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
96 $string = preg_replace("/<br\/?>[ \r\n\s]*<br\/?>/i", "</p><p>", $string);
97 $string = preg_replace("/<\/?font[^>]*>/i", "", $string);
104 * 删除 DOM 元素中所有的 $TagName 标签
106 * @return DOMDocument
108 private function removeJunkTag($RootNode, $TagName) {
109 $Tags = $RootNode->getElementsByTagName($TagName);
112 while($Tag = $Tags->item($i++
)) {
113 $parentNode = $Tag->parentNode
;
114 $parentNode->removeChild($Tag);
123 private function removeJunkAttr($RootNode, $Attr) {
124 $Tags = $RootNode->getElementsByTagName("*");
127 while($Tag = $Tags->item($i++
)) {
128 $Tag->removeAttribute($Attr);
136 * 判定算法来自:http://code.google.com/p/arc90labs-readability/
140 private function getTopBox() {
142 $allParagraphs = $this->DOM
->getElementsByTagName("p");
144 // Study all the paragraphs and find the chunk that has the best score.
145 // A score is determined by things like: Number of <p>'s, commas, special classes, etc.
147 while($paragraph = $allParagraphs->item($i++
)) {
148 $parentNode = $paragraph->parentNode
;
149 $contentScore = intval($parentNode->getAttribute(Readability2
::ATTR_CONTENT_SCORE
));
150 $className = $parentNode->getAttribute("class");
151 $id = $parentNode->getAttribute("id");
153 // Look for a special classname
154 if (preg_match("/(comment|meta|footer|footnote)/i", $className)) {
156 } else if(preg_match(
157 "/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/i",
162 // Look for a special ID
163 if (preg_match("/(comment|meta|footer|footnote)/i", $id)) {
165 } else if (preg_match(
166 "/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i",
171 // Score the paragraph by its text length (not a flat point, and not a comma count):
172 // a paragraph whose text is longer than 10 bytes adds its byte length to the parent's score.
173 if (strlen($paragraph->nodeValue
) > 10) {
174 $contentScore +
= strlen($paragraph->nodeValue
);
178 $parentNode->setAttribute(Readability2
::ATTR_CONTENT_SCORE
, $contentScore);
181 array_push($this->parentNodes
, $parentNode);
184 $topBox = $this->DOM
->createElement('div', Readability2
::MESSAGE_CAN_NOT_GET
);
185 // Assignment from index for performance.
186 // See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
187 for ($i = 0, $len = sizeof($this->parentNodes
); $i < $len; $i++
) {
188 $parentNode = $this->parentNodes
[$i];
189 $contentScore = intval($parentNode->getAttribute(Readability2
::ATTR_CONTENT_SCORE
));
190 $orgContentScore = intval($topBox->getAttribute(Readability2
::ATTR_CONTENT_SCORE
));
192 if ($contentScore && $contentScore > $orgContentScore) {
193 $topBox = $parentNode;
197 // 此时,$topBox 应为已经判定后的页面内容主元素
207 public function getTitle() {
208 $title = $this->DOM
->getElementsByTagName("title");
209 return $title->item(0);
214 * 获取页面的主要内容(Readability 以后的内容)
218 public function getContent() {
219 if (!$this->DOM
) return false;
222 $ContentTitle = $this->getTitle();
225 $ContentBox = $this->getTopBox();
227 // 复制内容到新的 DOMDocument
228 $Target = new DOMDocument
;
229 $Target->appendChild($Target->importNode($ContentBox, true));
232 foreach ($this->junkTags
as $tag) {
233 $Target = $this->removeJunkTag($Target, $tag);
237 foreach ($this->junkAttrs
as $attr) {
238 $Target = $this->removeJunkAttr($Target, $attr);
243 'title' => $ContentTitle ? $ContentTitle->nodeValue
: "",
244 'content' => $Target->saveHTML()
248 function __destruct() { }