From 1a268ba710b2cdb4ede98af3368c43d66c4c5e53 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolas=20L=C5=93uillet?= Para 1 Para 2 Para 1 Para 2 The story begins...
+
,
. */
+blockquote:before, blockquote:after, q:before, q:after { content: ""; }
+blockquote, q { quotes: "" ""; }
+
+/* Remove annoying border on linked images. */
+a img { border: none; }
+
+
+body {
+
+ margin: 10px;
+}
+
diff --git a/css/style.css b/css/style.css
new file mode 100644
index 00000000..41a61780
--- /dev/null
+++ b/css/style.css
@@ -0,0 +1,35 @@
+body {
+ color: #222222;
+ font: 20px/1.3em Palatino,Georgia,serif;
+ background-color: #e6e6e6;
+}
+
+a, a:hover, a:visited {
+ color: #000;
+}
+header {
+ text-align: center;
+}
+
+#main {
+ margin: 0 auto;
+}
+
+#main ul#links {
+ padding: 0;
+ list-style-type: none;
+ text-align: center;
+}
+
+#main ul#links li {
+ display: inline;
+ padding: 15px;
+}
+
+#main a.tool {
+ text-decoration: none;
+}
+
+footer {
+ text-align: right;
+}
\ No newline at end of file
diff --git a/css/typography.css b/css/typography.css
new file mode 100755
index 00000000..e41db096
--- /dev/null
+++ b/css/typography.css
@@ -0,0 +1,85 @@
+
+body {
+ font:1em/1.625em "lucida grande","lucida sans unicode", sans-serif; background-color:#FFFEF0;
+ font-size-adjust:none;
+ font-style:normal;
+ font-variant:normal;
+ font-weight:normal;
+ padding: 15px;
+ margin: 15px auto;
+}
+
+article {
+ border: 3px solid grey;
+ max-width:700px;
+ margin: 15px auto;
+ padding: 15px;
+}
+
+footer {
+ border: 1px solid black;
+ padding: 15px;
+ margin: 15px auto;
+}
+
+p { padding:0 0 0.8125em 0; color:#111; font-weight:300;}
+
+p + p { text-indent:1.625em;}
+
+img { display: block; margin: 0.5em 0.8125em 0.8125em 0; padding: 0; }
+
+p > img { display: inline-block; margin: 0; }
+
+h1,h2{ font-weight:normal; color: #333; font-family:Georgia, serif; }
+h3,h4,h5,h6 { font-weight: normal; color: #333; font-family:Georgia, serif; }
+
+
+h1 { font-size: 2.125em; margin-bottom: 0.765em; line-height: 1.5em;}
+h2 { font-size: 1.9em; margin-bottom: 0.855em; }
+h3 { font-size: 1.7em; margin-bottom: 0.956em; }
+h4 { font-size: 1.4em; margin-bottom: 1.161em; }
+h5,h6 { font-size: 1.313em; margin-bottom: 1.238em; }
+
+
+
+ul{list-style-position:outside;}
+li ul,
+li ol { margin:0 1.625em; }
+ul, ol { margin: 0 0 1.625em 0; }
+
+
+dl { margin: 0 0 1.625em 0; }
+dl dt { font-weight: bold; }
+dl dd { margin-left: 1.625em; }
+
+a { color:#005AF2; text-decoration:none; }
+a:hover { text-decoration: underline; }
+
+
+table { margin-bottom:1.625em; border-collapse: collapse; }
+th { font-weight:bold; }
+tr,th,td { margin:0; padding:0 1.625em 0 1em; height:26px; }
+tfoot { font-style: italic; }
+caption { text-align:center; font-family:Georgia, serif; }
+
+
+abbr, acronym { border-bottom:1px dotted #000; }
+address { margin-top:1.625em; font-style: italic; }
+del {color:#000;}
+
+
+blockquote { padding:1em 1em 1.625em 1em; font-family:georgia,serif;font-style: italic; }
+blockquote:before { content:"\201C";font-size:3em;margin-left:-.625em; font-family:georgia,serif;color:#aaa;line-height:0;}/* From Tripoli */
+blockquote > p {padding:0; margin:0; }
+
+strong { font-weight: bold; }
+em, dfn { font-style: italic; }
+dfn { font-weight: bold; }
+pre, code { margin: 1.625em 0; white-space: pre; }
+pre, code, tt { font: 1em monospace; line-height: 1.5; }
+tt { display: block; margin: 1.625em 0; }
+hr { margin-bottom:1.625em; }
+
+.oldbook { font-family:"Warnock Pro","Goudy Old Style","Book Antiqua","Palatino",Georgia,serif; }
+.note { font-family:Georgia, "Times New Roman", Times, serif; font-style:italic; font-size:0.9em; margin:0.1em; color:#333; }
+.mono { font-family:"Courier New", Courier, monospace; }
diff --git a/inc/Encoding.php b/inc/Encoding.php
new file mode 100755
index 00000000..ac107af9
--- /dev/null
+++ b/inc/Encoding.php
@@ -0,0 +1,262 @@
+
+ * @package Encoding
+ * @version 1.1
+ * @link http://www.framework2.com.ar/dzone/forceUTF8-es/
+ * @example http://www.framework2.com.ar/dzone/forceUTF8-es/
+ */
+
+class Encoding {
+
+ protected static $win1252ToUtf8 = array(
+ 128 => "\xe2\x82\xac",
+
+ 130 => "\xe2\x80\x9a",
+ 131 => "\xc6\x92",
+ 132 => "\xe2\x80\x9e",
+ 133 => "\xe2\x80\xa6",
+ 134 => "\xe2\x80\xa0",
+ 135 => "\xe2\x80\xa1",
+ 136 => "\xcb\x86",
+ 137 => "\xe2\x80\xb0",
+ 138 => "\xc5\xa0",
+ 139 => "\xe2\x80\xb9",
+ 140 => "\xc5\x92",
+
+ 142 => "\xc5\xbd",
+
+
+ 145 => "\xe2\x80\x98",
+ 146 => "\xe2\x80\x99",
+ 147 => "\xe2\x80\x9c",
+ 148 => "\xe2\x80\x9d",
+ 149 => "\xe2\x80\xa2",
+ 150 => "\xe2\x80\x93",
+ 151 => "\xe2\x80\x94",
+ 152 => "\xcb\x9c",
+ 153 => "\xe2\x84\xa2",
+ 154 => "\xc5\xa1",
+ 155 => "\xe2\x80\xba",
+ 156 => "\xc5\x93",
+
+ 158 => "\xc5\xbe",
+ 159 => "\xc5\xb8"
+ );
+
+ protected static $brokenUtf8ToUtf8 = array(
+ "\xc2\x80" => "\xe2\x82\xac",
+
+ "\xc2\x82" => "\xe2\x80\x9a",
+ "\xc2\x83" => "\xc6\x92",
+ "\xc2\x84" => "\xe2\x80\x9e",
+ "\xc2\x85" => "\xe2\x80\xa6",
+ "\xc2\x86" => "\xe2\x80\xa0",
+ "\xc2\x87" => "\xe2\x80\xa1",
+ "\xc2\x88" => "\xcb\x86",
+ "\xc2\x89" => "\xe2\x80\xb0",
+ "\xc2\x8a" => "\xc5\xa0",
+ "\xc2\x8b" => "\xe2\x80\xb9",
+ "\xc2\x8c" => "\xc5\x92",
+
+ "\xc2\x8e" => "\xc5\xbd",
+
+
+ "\xc2\x91" => "\xe2\x80\x98",
+ "\xc2\x92" => "\xe2\x80\x99",
+ "\xc2\x93" => "\xe2\x80\x9c",
+ "\xc2\x94" => "\xe2\x80\x9d",
+ "\xc2\x95" => "\xe2\x80\xa2",
+ "\xc2\x96" => "\xe2\x80\x93",
+ "\xc2\x97" => "\xe2\x80\x94",
+ "\xc2\x98" => "\xcb\x9c",
+ "\xc2\x99" => "\xe2\x84\xa2",
+ "\xc2\x9a" => "\xc5\xa1",
+ "\xc2\x9b" => "\xe2\x80\xba",
+ "\xc2\x9c" => "\xc5\x93",
+
+ "\xc2\x9e" => "\xc5\xbe",
+ "\xc2\x9f" => "\xc5\xb8"
+ );
+
+ protected static $utf8ToWin1252 = array(
+ "\xe2\x82\xac" => "\x80",
+
+ "\xe2\x80\x9a" => "\x82",
+ "\xc6\x92" => "\x83",
+ "\xe2\x80\x9e" => "\x84",
+ "\xe2\x80\xa6" => "\x85",
+ "\xe2\x80\xa0" => "\x86",
+ "\xe2\x80\xa1" => "\x87",
+ "\xcb\x86" => "\x88",
+ "\xe2\x80\xb0" => "\x89",
+ "\xc5\xa0" => "\x8a",
+ "\xe2\x80\xb9" => "\x8b",
+ "\xc5\x92" => "\x8c",
+
+ "\xc5\xbd" => "\x8e",
+
+
+ "\xe2\x80\x98" => "\x91",
+ "\xe2\x80\x99" => "\x92",
+ "\xe2\x80\x9c" => "\x93",
+ "\xe2\x80\x9d" => "\x94",
+ "\xe2\x80\xa2" => "\x95",
+ "\xe2\x80\x93" => "\x96",
+ "\xe2\x80\x94" => "\x97",
+ "\xcb\x9c" => "\x98",
+ "\xe2\x84\xa2" => "\x99",
+ "\xc5\xa1" => "\x9a",
+ "\xe2\x80\xba" => "\x9b",
+ "\xc5\x93" => "\x9c",
+
+ "\xc5\xbe" => "\x9e",
+ "\xc5\xb8" => "\x9f"
+ );
+
+ static function toUTF8($text){
+     /**
+      * Function Encoding::toUTF8
+      *
+      * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
+      *
+      * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
+      *
+      * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
+      *
+      * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
+      *    are followed by any of these: ("group B")
+      *    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
+      *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
+      *    The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
+      *    is also a valid unicode character, and will be left unchanged.
+      *
+      * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B,
+      * 3) when any of these: ðñòó are followed by THREE chars from group B.
+      *
+      * Arrays are converted element by element (recursively); values that are neither
+      * arrays nor strings are returned untouched.
+      *
+      * @name toUTF8
+      * @param string $text Any string.
+      * @return string The same string, UTF8 encoded
+      *
+      */
+
+     if (is_array($text)) {
+         foreach ($text as $k => $v) {
+             $text[$k] = self::toUTF8($v);
+         }
+         return $text;
+     }
+     if (!is_string($text)) {
+         return $text;
+     }
+
+     $max = strlen($text);
+     $buf = "";
+     for ($i = 0; $i < $max; $i++) {
+         // Square-bracket offsets: the old $text{$i} form is a parse error on PHP 8.
+         $c1 = $text[$i];
+         $o1 = ord($c1);
+
+         if ($o1 < 0x80) { // plain ASCII, it doesn't need conversion
+             $buf .= $c1;
+             continue;
+         }
+
+         if ($o1 >= 0xc0) {
+             // Looks like a UTF-8 lead byte: work out how many continuation bytes it announces.
+             if ($o1 <= 0xdf) {
+                 $tail = 1; // 2-byte sequence
+             } elseif ($o1 <= 0xef) {
+                 $tail = 2; // 3-byte sequence
+             } elseif ($o1 <= 0xf7) {
+                 $tail = 3; // 4-byte sequence
+             } else {
+                 $tail = 0; // 0xf8-0xff never starts a sequence; convert it
+             }
+
+             // The sequence is accepted only if every announced continuation byte
+             // exists and lies in 0x80-0xbf.
+             $isUtf8 = $tail > 0 && $i + $tail < $max;
+             for ($k = 1; $isUtf8 && $k <= $tail; $k++) {
+                 $ok = ord($text[$i + $k]);
+                 $isUtf8 = ($ok >= 0x80 && $ok <= 0xbf);
+             }
+
+             if ($isUtf8) { // yeah, almost sure it's UTF8 already
+                 // Copy the WHOLE sequence and skip all of its continuation bytes.
+                 // (The 4-byte case used to copy only 3 bytes, corrupting the character.)
+                 $buf .= substr($text, $i, $tail + 1);
+                 $i += $tail;
+                 continue;
+             }
+         } elseif (isset(self::$win1252ToUtf8[$o1])) { // 0x80-0xbf found in Windows-1252 special cases
+             $buf .= self::$win1252ToUtf8[$o1];
+             continue;
+         }
+
+         // Not valid UTF8: treat the byte as ISO 8859-1 and emit its 2-byte UTF-8 form.
+         // Integer shifts replace chr(ord() / 64), which handed a float to chr().
+         $buf .= chr(0xc0 | ($o1 >> 6)) . chr(0x80 | ($o1 & 0x3f));
+     }
+     return $buf;
+ }
+
+ static function toWin1252($text) {
+     // Arrays are converted value by value, keeping their keys.
+     if (is_array($text)) {
+         foreach ($text as $key => $value) {
+             $text[$key] = self::toWin1252($value);
+         }
+         return $text;
+     }
+
+     // Anything that is not a string is handed back untouched.
+     if (!is_string($text)) {
+         return $text;
+     }
+
+     // Normalise to UTF-8, swap the characters listed in $utf8ToWin1252 for their
+     // single-byte form, then let utf8_decode() turn the rest into single bytes.
+     $utf8 = self::toUTF8($text);
+     $search = array_keys(self::$utf8ToWin1252);
+     $replace = array_values(self::$utf8ToWin1252);
+     return utf8_decode(str_replace($search, $replace, $utf8));
+ }
+
+ static function toISO8859($text) {
+     // Alias: delegates to toWin1252() unchanged.
+     $converted = self::toWin1252($text);
+     return $converted;
+ }
+
+ static function toLatin1($text) {
+     // Alias: delegates to toWin1252() unchanged.
+     $converted = self::toWin1252($text);
+     return $converted;
+ }
+
+ static function fixUTF8($text){
+     // Arrays are repaired value by value, keeping their keys.
+     if (is_array($text)) {
+         foreach ($text as $key => $value) {
+             $text[$key] = self::fixUTF8($value);
+         }
+         return $text;
+     }
+
+     $search = array_keys(self::$utf8ToWin1252);
+     $replace = array_values(self::$utf8ToWin1252);
+
+     // Repeat the decode/re-encode round trip until the text stops changing.
+     $previous = "";
+     while ($previous != $text) {
+         $previous = $text;
+         $text = self::toUTF8(utf8_decode(str_replace($search, $replace, $text)));
+     }
+
+     // One more round trip is always applied, even when the loop never ran.
+     return self::toUTF8(utf8_decode(str_replace($search, $replace, $text)));
+ }
+
+ static function UTF8FixWin1252Chars($text){
+     // Use this when a UTF-8 string was produced from Windows-1252 input that was
+     // treated as ISO8859-1 (so the Windows-1252 chars from 80 to 9F were ignored).
+     // Each sequence listed in $brokenUtf8ToUtf8 is swapped for its mapped value.
+     // See: http://en.wikipedia.org/wiki/Windows-1252
+
+     $broken = array_keys(self::$brokenUtf8ToUtf8);
+     $fixed = array_values(self::$brokenUtf8ToUtf8);
+     return str_replace($broken, $fixed, $text);
+ }
+
+ static function removeBOM($str=""){
+     // Strip a leading UTF-8 byte order mark (EF BB BF) if the string starts with one.
+     $bom = "\xef\xbb\xbf";
+     if (substr($str, 0, 3) == $bom) {
+         return substr($str, 3);
+     }
+     return $str;
+ }
+}
\ No newline at end of file
diff --git a/inc/JSLikeHTMLElement.php b/inc/JSLikeHTMLElement.php
new file mode 100755
index 00000000..0557205f
--- /dev/null
+++ b/inc/JSLikeHTMLElement.php
@@ -0,0 +1,110 @@
+registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+* $doc->loadHTML('
Chapter 2
]*>[ \n\r\t]*){2,}/i',
+ 'replaceFonts' => '/<(\/?)font[^>]*>/i',
+ // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
+ 'normalize' => '/\s{2,}/',
+ 'killBreaks' => '/(
(\s| ?)*){1,}/',
+ 'video' => '/http:\/\/(www\.)?(youtube|vimeo|dailymotion)\.com/i',
+ 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+ );
+
+ /**
+ * Create instance of Readability
+ * @param string UTF-8 encoded string
+ * @param string (optional) URL associated with HTML (used for footnotes)
+ */
+ function __construct($html, $url=null)
+ {
+ /* Turn all double br's into p's */
+ $html = preg_replace($this->regexps['replaceBrs'], '
', $html); + $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); + $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); + $this->dom = new DOMDocument(); + $this->dom->preserveWhiteSpace = false; + $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); + if (trim($html) == '') $html = ''; + @$this->dom->loadHTML($html); + $this->url = $url; + } + + /** + * Get article title element + * @return DOMElement + */ + public function getTitle() { + return $this->articleTitle; + } + + /** + * Get article content element + * @return DOMElement + */ + public function getContent() { + return $this->articleContent; + } + + /** + * Runs readability. + * + * Workflow: + * 1. Prep the document by removing script tags, css, etc. + * 2. Build readability's DOM tree. + * 3. Grab the article content from the current dom tree. + * 4. Replace the current DOM tree with the new one. + * 5. Read peacefully. + * + * @return boolean true if we found content, false otherwise + **/ + public function init() + { + if (!isset($this->dom->documentElement)) return false; + $this->removeScripts($this->dom); + //die($this->getInnerHTML($this->dom->documentElement)); + + // Assume successful outcome + $this->success = true; + + $bodyElems = $this->dom->getElementsByTagName('body'); + if ($bodyElems->length > 0) { + if ($this->bodyCache == null) { + $this->bodyCache = $bodyElems->item(0)->innerHTML; + } + if ($this->body == null) { + $this->body = $bodyElems->item(0); + } + } + + $this->prepDocument(); + + //die($this->dom->documentElement->parentNode->nodeType); + //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement)); + //die($this->getInnerHTML($this->dom->documentElement)); + + /* Build readability's DOM tree */ + $overlay = $this->dom->createElement('div'); + $innerDiv = $this->dom->createElement('div'); + $articleTitle = $this->getArticleTitle(); + $articleContent = $this->grabArticle(); + + if 
(!$articleContent) { + $this->success = false; + $articleContent = $this->dom->createElement('div'); + $articleContent->setAttribute('id', 'readability-content'); + $articleContent->innerHTML = '
Sorry, Readability was unable to parse this page for content.
'; + } + + $overlay->setAttribute('id', 'readOverlay'); + $innerDiv->setAttribute('id', 'readInner'); + + /* Glue the structure of our document together. */ + $innerDiv->appendChild($articleTitle); + $innerDiv->appendChild($articleContent); + $overlay->appendChild($innerDiv); + + /* Clear the old HTML, insert the new content. */ + $this->body->innerHTML = ''; + $this->body->appendChild($overlay); + //document.body.insertBefore(overlay, document.body.firstChild); + $this->body->removeAttribute('style'); + + $this->postProcessContent($articleContent); + + // Set title and content instance variables + $this->articleTitle = $articleTitle; + $this->articleContent = $articleContent; + + return $this->success; + } + + /** + * Debug + */ + protected function dbg($msg) { + if ($this->debug) echo '* ',$msg, '').replace(readability.regexps.replaceFonts, '<$1span>'); + // We do this in the constructor for PHP as that's when we have raw HTML - before parsing it into a DOM tree. + // Manipulating innerHTML as it's done in JS is not possible in PHP. + } + + /** + * For easier reading, convert this document to have footnotes at the bottom rather than inline links. + * @see http://www.roughtype.com/archives/2010/05/experiments_in.php + * + * @return void + **/ + public function addFootnotes($articleContent) { + $footnotesWrapper = $this->dom->createElement('div'); + $footnotesWrapper->setAttribute('id', 'readability-footnotes'); + $footnotesWrapper->innerHTML = '
tags, etc.
+ *
+ * @param DOMElement
+ * @return void
+ */
+ function prepArticle($articleContent) {
+     // Cleans the extracted article element in place: strips inline styles, collapses
+     // break runs, removes junk elements and empty paragraphs.
+     // @param DOMElement $articleContent
+     // @return void
+     $this->cleanStyles($articleContent);
+     $this->killBreaks($articleContent);
+     if ($this->revertForcedParagraphElements) {
+         $this->revertReadabilityStyledElements($articleContent);
+     }
+
+     /* Clean out junk from the article content */
+     $this->cleanConditionally($articleContent, 'form');
+     $this->clean($articleContent, 'object');
+     $this->clean($articleContent, 'h1');
+
+     /**
+      * If there is only one h2, they are probably using it
+      * as a header and not a subheader, so remove it since we already have a header.
+      ***/
+     if ($articleContent->getElementsByTagName('h2')->length == 1) {
+         $this->clean($articleContent, 'h2');
+     }
+     $this->clean($articleContent, 'iframe');
+
+     $this->cleanHeaders($articleContent);
+
+     /* Do these last as the previous stuff may have removed junk that will affect these */
+     $this->cleanConditionally($articleContent, 'table');
+     $this->cleanConditionally($articleContent, 'ul');
+     $this->cleanConditionally($articleContent, 'div');
+
+     /* Remove extra paragraphs (walk backwards because removal shortens the list) */
+     $articleParagraphs = $articleContent->getElementsByTagName('p');
+     for ($i = $articleParagraphs->length-1; $i >= 0; $i--)
+     {
+         $paragraph = $articleParagraphs->item($i);
+         $imgCount = $paragraph->getElementsByTagName('img')->length;
+         $embedCount = $paragraph->getElementsByTagName('embed')->length;
+         $objectCount = $paragraph->getElementsByTagName('object')->length;
+
+         if ($imgCount === 0 && $embedCount === 0 && $objectCount === 0 && $this->getInnerText($paragraph, false) == '')
+         {
+             $paragraph->parentNode->removeChild($paragraph);
+         }
+     }
+
+     /* Drop any <br> that sits directly in front of a <p>. */
+     try {
+         $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $articleContent->innerHTML);
+         //articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
+     }
+     catch (Exception $e) {
+         $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
+     }
+ }
+
+ /**
+ * Initialize a node with the readability object. Also checks the
+ * className/id for special names to add to its score.
+ *
+ * @param Element
+ * @return void
+ **/
+ protected function initializeNode($node) {
+     // Attach a fresh "readability" attribute; its value is the node's content score.
+     $scoreAttr = $this->dom->createAttribute('readability');
+     $scoreAttr->value = 0;
+     $node->setAttributeNode($scoreAttr);
+
+     // Starting adjustment per tag name (tags not listed start at 0).
+     $tagAdjustments = array(
+         'DIV' => 5,
+         'PRE' => 3,
+         'TD' => 3,
+         'BLOCKQUOTE' => 3,
+         'ADDRESS' => -3,
+         'OL' => -3,
+         'UL' => -3,
+         'DL' => -3,
+         'DD' => -3,
+         'DT' => -3,
+         'LI' => -3,
+         'FORM' => -3,
+         'H1' => -5,
+         'H2' => -5,
+         'H3' => -5,
+         'H4' => -5,
+         'H5' => -5,
+         'H6' => -5,
+         'TH' => -5,
+     );
+
+     // Upper-cased so the lookup does not depend on how the parser reports tag names.
+     $tag = strtoupper($node->tagName);
+     if (isset($tagAdjustments[$tag])) {
+         $scoreAttr->value += $tagAdjustments[$tag];
+     }
+
+     // Finally add the class/id based weight.
+     $scoreAttr->value += $this->getClassWeight($node);
+ }
+
+ /***
+ * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
+ * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+ *
+ * Phases, in order: (1) prune unlikely nodes and turn block-free divs into p's, (2) score
+ * paragraphs and credit their parent/grandparent, (3) pick the top candidate after scaling by
+ * link density, (4) collect the top candidate and related siblings into a new div, (5) clean it
+ * with prepArticle(). If the result has fewer than 250 characters of text, the body is restored
+ * from the cache and the method calls itself again with one more flag removed.
+ *
+ * @param DOMDocument|DOMElement|null $page node to search; defaults to $this->dom when empty
+ * @return DOMElement|false the article div, or false once every flag has been removed without success
+ **/
+ protected function grabArticle($page=null) {
+ $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
+ if (!$page) $page = $this->dom;
+ $allElements = $page->getElementsByTagName('*');
+ /**
+ * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
+ * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
+ *
+ * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
+ * TODO: Shouldn't this be a reverse traversal?
+ **/
+ $node = null;
+ $nodesToScore = array();
+ for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
+ //for ($nodeIndex=$targetList->length-1; $nodeIndex >= 0; $nodeIndex--) {
+ //$node = $targetList->item($nodeIndex);
+ $tagName = strtoupper($node->tagName);
+ /* Remove unlikely candidates */
+ if ($stripUnlikelyCandidates) {
+ $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
+ if (
+ preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+ !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
+ $tagName != 'BODY'
+ )
+ {
+ $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
+ //$nodesToRemove[] = $node;
+ $node->parentNode->removeChild($node);
+ $nodeIndex--; // step back so the element that slid into this index is not skipped (presumably the list is live)
+ continue;
+ }
+ }
+
+ if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
+ $nodesToScore[] = $node;
+ }
+
+ /* Turn all divs that don't have children block level elements into p's */
+ if ($tagName == 'DIV') {
+ if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+ //$this->dbg('Altering div to p');
+ $newNode = $this->dom->createElement('p');
+ try {
+ $newNode->innerHTML = $node->innerHTML;
+ //$nodesToReplace[] = array('new'=>$newNode, 'old'=>$node);
+ $node->parentNode->replaceChild($newNode, $node);
+ $nodeIndex--; // revisit this index on the next pass
+ $nodesToScore[] = $node; // or $newNode?
+ }
+ catch(Exception $e) {
+ $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
+ }
+ }
+ else
+ {
+ // EXPERIMENTAL
+ // TODO: change these p elements back to text nodes after processing
+ for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
+ $childNode = $node->childNodes->item($i);
+ if ($childNode->nodeType == 3) { // XML_TEXT_NODE
+ //$this->dbg('replacing text node with a p tag with the same content.');
+ $p = $this->dom->createElement('p');
+ $p->innerHTML = $childNode->nodeValue;
+ $p->setAttribute('style', 'display: inline;');
+ $p->setAttribute('class', 'readability-styled');
+ $childNode->parentNode->replaceChild($p, $childNode);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Loop through all paragraphs, and assign a score to them based on how content-y they look.
+ * Then add their score to their parent node.
+ *
+ * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+ **/
+ $candidates = array();
+ for ($pt=0; $pt < count($nodesToScore); $pt++) {
+ $parentNode = $nodesToScore[$pt]->parentNode;
+ // $grandParentNode = $parentNode ? $parentNode->parentNode : null;
+ $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
+ $innerText = $this->getInnerText($nodesToScore[$pt]);
+
+ if (!$parentNode || !isset($parentNode->tagName)) {
+ continue;
+ }
+
+ /* If this paragraph is less than 25 characters, don't even count it. */
+ if(strlen($innerText) < 25) {
+ continue;
+ }
+
+ /* Initialize readability data for the parent. */
+ if (!$parentNode->hasAttribute('readability'))
+ {
+ $this->initializeNode($parentNode);
+ $candidates[] = $parentNode;
+ }
+
+ /* Initialize readability data for the grandparent. */
+ if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
+ {
+ $this->initializeNode($grandParentNode);
+ $candidates[] = $grandParentNode;
+ }
+
+ $contentScore = 0;
+
+ /* Add a point for the paragraph itself as a base. */
+ $contentScore++;
+
+ /* Add points for any commas within this paragraph */
+ $contentScore += count(explode(',', $innerText)); // explode() yields (number of commas + 1) pieces
+
+ /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
+ $contentScore += min(floor(strlen($innerText) / 100), 3);
+
+ /* Add the score to the parent. The grandparent gets half. */
+ $parentNode->getAttributeNode('readability')->value += $contentScore;
+
+ if ($grandParentNode) {
+ $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
+ }
+ }
+
+ /**
+ * After we've calculated scores, loop through all of the possible candidate nodes we found
+ * and find the one with the highest score.
+ **/
+ $topCandidate = null;
+ for ($c=0, $cl=count($candidates); $c < $cl; $c++)
+ {
+ /**
+ * Scale the final candidates score based on link density. Good content should have a
+ * relatively small link density (5% or less) and be mostly unaffected by this operation.
+ **/
+ $readability = $candidates[$c]->getAttributeNode('readability');
+ $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
+
+ $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
+
+ if (!$topCandidate || $readability->value > (int)$topCandidate->getAttribute('readability')) {
+ $topCandidate = $candidates[$c];
+ }
+ }
+
+ /**
+ * If we still have no top candidate, just use the body as a last resort.
+ * We also have to copy the body node so it is something we can modify.
+ **/
+ if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
+ {
+ $topCandidate = $this->dom->createElement('div');
+ if ($page instanceof DOMDocument) {
+ if (!isset($page->documentElement)) {
+ // we don't have a body either? what a mess! :)
+ } else {
+ $topCandidate->innerHTML = $page->documentElement->innerHTML;
+ $page->documentElement->innerHTML = '';
+ $page->documentElement->appendChild($topCandidate);
+ }
+ } else {
+ $topCandidate->innerHTML = $page->innerHTML;
+ $page->innerHTML = '';
+ $page->appendChild($topCandidate);
+ }
+ $this->initializeNode($topCandidate);
+ }
+
+ /**
+ * Now that we have the top candidate, look through its siblings for content that might also be related.
+ * Things like preambles, content split by ads that we removed, etc.
+ **/
+ $articleContent = $this->dom->createElement('div');
+ $articleContent->setAttribute('id', 'readability-content');
+ $siblingScoreThreshold = max(10, ((int)$topCandidate->getAttribute('readability')) * 0.2);
+ $siblingNodes = $topCandidate->parentNode->childNodes;
+ if (!isset($siblingNodes)) {
+ $siblingNodes = new stdClass;
+ $siblingNodes->length = 0;
+ }
+
+ for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
+ {
+ $siblingNode = $siblingNodes->item($s);
+ $append = false;
+
+ $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+
+ //dbg('Sibling has score ' . ($siblingNode->readability ? siblingNode.readability.contentScore : 'Unknown'));
+
+ if ($siblingNode === $topCandidate)
+ // or if ($siblingNode->isSameNode($topCandidate))
+ {
+ $append = true;
+ }
+
+ $contentBonus = 0;
+ /* Give a bonus if sibling nodes and top candidates have the exact same classname */
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
+ $contentBonus += ((int)$topCandidate->getAttribute('readability')) * 0.2;
+ }
+
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((int)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
+ {
+ $append = true;
+ }
+
+ if (strtoupper($siblingNode->nodeName) == 'P') {
+ $linkDensity = $this->getLinkDensity($siblingNode);
+ $nodeContent = $this->getInnerText($siblingNode);
+ $nodeLength = strlen($nodeContent);
+
+ if ($nodeLength > 80 && $linkDensity < 0.25)
+ {
+ $append = true;
+ }
+ else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
+ {
+ $append = true;
+ }
+ }
+
+ /* Look for a special classname */
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('class') && $siblingNode->getAttribute('class') != '')
+ {
+ if (preg_match($this->regexps['okMaybeItsACandidate'], $siblingNode->getAttribute('class'))) {
+ $append = true;
+ }
+ }
+
+ /* Look for a special ID */
+ if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('id') && $siblingNode->getAttribute('id') != '')
+ {
+ if (preg_match($this->regexps['okMaybeItsACandidate'], $siblingNode->getAttribute('id'))) {
+ $append = true;
+ }
+ }
+
+
+ if ($append)
+ {
+ $this->dbg('Appending node: ' . $siblingNode->nodeName);
+
+ $nodeToAppend = null;
+ $sibNodeName = strtoupper($siblingNode->nodeName);
+ if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
+ /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
+
+ $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
+ $nodeToAppend = $this->dom->createElement('div');
+ try {
+ $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
+ $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+ }
+ catch(Exception $e)
+ {
+ $this->dbg('Could not alter siblingNode to div, reverting back to original.');
+ $nodeToAppend = $siblingNode;
+ $s--;
+ $sl--;
+ }
+ } else {
+ $nodeToAppend = $siblingNode;
+ $s--; // the sibling itself is about to be moved out of the list, so compensate index and length
+ $sl--;
+ }
+
+ /* To ensure a node does not interfere with readability styles, remove its classnames */
+ $nodeToAppend->removeAttribute('class');
+
+ /* Append sibling and subtract from our list because it removes the node when you append to another node */
+ $articleContent->appendChild($nodeToAppend);
+ }
+ }
+
+ /**
+ * So we have all of the content that we need. Now we clean it up for presentation.
+ **/
+ $this->prepArticle($articleContent);
+
+ /**
+ * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+ * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+ * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+ * finding the -right- content.
+ **/
+ if (strlen($this->getInnerText($articleContent, false)) < 250)
+ {
+ // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+ // in the meantime, we check and create an empty element if it's not there.
+ if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
+ $this->body->innerHTML = $this->bodyCache;
+
+ if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+ $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+ return $this->grabArticle($this->body);
+ }
+ else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+ $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
+ return $this->grabArticle($this->body);
+ }
+ else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+ $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+ return $this->grabArticle($this->body);
+ }
+ else {
+ return false;
+ }
+ }
+ return $articleContent;
+ }
+
+ /**
+ * Remove script tags from document
+ *
+ * @param DOMElement
+ * @return void
+ */
+ public function removeScripts($doc) {
+     // Detach every <script> element, always taking the last one still in the list,
+     // until none is left.
+     $scriptList = $doc->getElementsByTagName('script');
+     $index = $scriptList->length - 1;
+     while ($index >= 0) {
+         $script = $scriptList->item($index);
+         $script->parentNode->removeChild($script);
+         $index--;
+     }
+ }
+
+ /**
+ * Get the inner text of a node.
+ * This also strips out any excess whitespace to be found.
+ *
+ * @param DOMElement $
+ * @param boolean $normalizeSpaces (default: true)
+ * @return string
+ **/
+ public function getInnerText($e, $normalizeSpaces=true) {
+     // Nothing to return when the node exposes no text at all.
+     if (!isset($e->textContent) || $e->textContent == '') {
+         return '';
+     }
+
+     $trimmed = trim($e->textContent);
+
+     // Optionally squeeze whitespace runs matched by the 'normalize' regexp into one space.
+     return $normalizeSpaces
+         ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
+         : $trimmed;
+ }
+
+ /**
+ * Get the number of times a string $s appears in the node $e.
+ *
+ * @param DOMElement $e
+ * @param string - what to count. Default is ","
+ * @return number (integer)
+ **/
+ public function getCharCount($e, $s=',') {
+     // Count occurrences of $s in the node's (whitespace-normalised) inner text.
+     $innerText = $this->getInnerText($e);
+     return substr_count($innerText, $s);
+ }
+
+ /**
+ * Remove the style attribute on every $e and under.
+ *
+ * @param DOMElement $e
+ * @return void
+ */
+ public function cleanStyles($e) {
+     // Only objects can be searched for descendants; anything else is ignored.
+     if (is_object($e)) {
+         $descendants = $e->getElementsByTagName('*');
+         foreach ($descendants as $descendant) {
+             $descendant->removeAttribute('style');
+         }
+     }
+ }
+
+ /**
+  * Get the density of links as a fraction of the content.
+  * This is the byte length of the text found inside <a> elements divided by
+  * the byte length of all text in the node. A node without text yields 0.
+  *
+  * @param DOMElement $e
+  * @return number (float, or integer 0 when the node has no text)
+  */
+ public function getLinkDensity($e) {
+     $anchors = $e->getElementsByTagName('a');
+     $totalLength = strlen($this->getInnerText($e));
+
+     // Avoid dividing by zero for nodes that hold no text at all.
+     if ($totalLength === 0) {
+         return 0;
+     }
+
+     $anchorLength = 0;
+     foreach ($anchors as $anchor) {
+         $anchorLength += strlen($this->getInnerText($anchor));
+     }
+
+     return $anchorLength / $totalLength;
+ }
+
+ /**
+  * Get an element's class/id weight. The class and id attributes are each
+  * tested against the 'negative' and 'positive' patterns: a negative match
+  * subtracts 25, a positive match adds 25. Returns 0 straight away when the
+  * FLAG_WEIGHT_CLASSES flag is not active.
+  *
+  * @param DOMElement $e
+  * @return number (Integer)
+  */
+ public function getClassWeight($e) {
+     if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+         return 0;
+     }
+
+     $score = 0;
+
+     /* The class name and the ID are judged by exactly the same rules. */
+     foreach (array('class', 'id') as $attribute) {
+         if (!$e->hasAttribute($attribute)) {
+             continue;
+         }
+         $value = $e->getAttribute($attribute);
+         if ($value == '') {
+             continue;
+         }
+         if (preg_match($this->regexps['negative'], $value)) {
+             $score -= 25;
+         }
+         if (preg_match($this->regexps['positive'], $value)) {
+             $score += 25;
+         }
+     }
+
+     return $score;
+ }
+
+ /**
+  * Remove extraneous break tags from a node.
+  *
+  * Every match of the 'killBreaks' pattern (presumably a run of consecutive
+  * break tags - confirm against the regexps table) is collapsed into one
+  * <br /> tag. The node must expose a readable and writable innerHTML property.
+  *
+  * @param DOMElement $node element whose innerHTML is rewritten in place
+  * @return void
+  */
+ public function killBreaks($node) {
+     $html = $node->innerHTML;
+     // Replace with a literal break tag so a single line break survives in the markup.
+     $html = preg_replace($this->regexps['killBreaks'], '<br />', $html);
+     $node->innerHTML = $html;
+ }
+
+ /**
+  * Remove every descendant of $e whose tag name is $tag.
+  * For 'object' and 'embed' tags an element is kept when any of its attribute
+  * values, or its inner HTML, matches the 'video' pattern.
+  * (Unless it's a youtube/vimeo video. People love movies.)
+  *
+  * @param DOMElement $e
+  * @param string $tag
+  * @return void
+  */
+ public function clean($e, $tag) {
+     $candidates = $e->getElementsByTagName($tag);
+     $isEmbed = ($tag == 'object' || $tag == 'embed');
+
+     /* Go backwards so removals do not disturb the indexes still to be visited. */
+     for ($index = $candidates->length - 1; $index >= 0; $index--) {
+         $candidate = $candidates->item($index);
+
+         /* Allow youtube and vimeo videos through as people usually want to see those. */
+         if ($isEmbed) {
+             $joinedValues = '';
+             foreach ($candidate->attributes as $attribute) {
+                 $joinedValues .= $attribute->value . '|';
+             }
+
+             /* Check the element's own attributes first, then whatever it contains. */
+             $isVideo = preg_match($this->regexps['video'], $joinedValues)
+                 || preg_match($this->regexps['video'], $candidate->innerHTML);
+             if ($isVideo) {
+                 continue;
+             }
+         }
+
+         $candidate->parentNode->removeChild($candidate);
+     }
+ }
+
+ /**
+  * Clean an element of all tags of type "tag" if they look fishy.
+  * "Fishy" is an algorithm based on content length, classnames,
+  * link density, number of images & embeds, etc.
+  * Does nothing unless the FLAG_CLEAN_CONDITIONALLY flag is active.
+  *
+  * @param DOMElement $e
+  * @param string $tag
+  * @return void
+  */
+ public function cleanConditionally($e, $tag) {
+     if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+         return;
+     }
+
+     $candidates = $e->getElementsByTagName($tag);
+
+     /**
+      * Gather counts for other typical elements embedded within.
+      * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
+      *
+      * TODO: Consider taking into account original contentScore here.
+      */
+     for ($index = $candidates->length - 1; $index >= 0; $index--) {
+         $node = $candidates->item($index);
+         $weight = $this->getClassWeight($node);
+         $hasScore = $node->hasAttribute('readability');
+         $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
+
+         $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));
+
+         /* A negative combined score is enough to drop the element outright. */
+         if ($weight + $contentScore < 0) {
+             $node->parentNode->removeChild($node);
+             continue;
+         }
+
+         /* Elements with plenty of commas are left alone. */
+         if ($this->getCharCount($node, ',') >= 10) {
+             continue;
+         }
+
+         /**
+          * If there are not very many commas, and the number of
+          * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
+          **/
+         $paragraphs = $node->getElementsByTagName('p')->length;
+         $images = $node->getElementsByTagName('img')->length;
+         /* Offset by 100: list items only count against the element once they outnumber paragraphs by more than 100. */
+         $listItems = $node->getElementsByTagName('li')->length-100;
+         $inputs = $node->getElementsByTagName('input')->length;
+
+         /* Count only the embeds whose src matches the 'video' pattern. */
+         $embedCount = 0;
+         foreach ($node->getElementsByTagName('embed') as $embed) {
+             if (preg_match($this->regexps['video'], $embed->getAttribute('src'))) {
+                 $embedCount++;
+             }
+         }
+
+         $linkDensity = $this->getLinkDensity($node);
+         $contentLength = strlen($this->getInnerText($node));
+
+         $toRemove = ($images > $paragraphs)
+             || ($listItems > $paragraphs && $tag != 'ul' && $tag != 'ol')
+             || ($inputs > floor($paragraphs/3))
+             || ($contentLength < 25 && ($images === 0 || $images > 2))
+             || ($weight < 25 && $linkDensity > 0.2)
+             || ($weight >= 25 && $linkDensity > 0.5)
+             || (($embedCount == 1 && $contentLength < 75) || $embedCount > 1);
+
+         if ($toRemove) {
+             $node->parentNode->removeChild($node);
+         }
+     }
+ }
+
+ /**
+  * Clean out spurious headers from an Element. Only h1 and h2 elements are
+  * examined; one is removed when its class/id weight is negative or its
+  * link density exceeds 0.33.
+  *
+  * @param DOMElement $e
+  * @return void
+  */
+ public function cleanHeaders($e) {
+     foreach (array('h1', 'h2') as $headerTag) {
+         $headers = $e->getElementsByTagName($headerTag);
+         /* Backwards, so removing a header leaves the unvisited indexes intact. */
+         for ($i = $headers->length - 1; $i >= 0; $i--) {
+             $header = $headers->item($i);
+             $isSpurious = $this->getClassWeight($header) < 0
+                 || $this->getLinkDensity($header) > 0.33;
+             if ($isSpurious) {
+                 $header->parentNode->removeChild($header);
+             }
+         }
+     }
+ }
+
+ /**
+  * Tell whether any bit of $flag is currently set in the flags bitmask.
+  *
+  * @param integer $flag bitmask to test
+  * @return boolean
+  */
+ public function flagIsActive($flag) {
+     $sharedBits = $this->flags & $flag;
+     return $sharedBits > 0;
+ }
+
+ /**
+  * Switch on the bits of $flag in the flags bitmask.
+  *
+  * @param integer $flag bitmask to set
+  * @return void
+  */
+ public function addFlag($flag) {
+     $this->flags |= $flag;
+ }
+
+ /**
+  * Switch off the bits of $flag in the flags bitmask.
+  *
+  * @param integer $flag bitmask to clear
+  * @return void
+  */
+ public function removeFlag($flag) {
+     $this->flags &= ~$flag;
+ }
+}
+?>
\ No newline at end of file
diff --git a/inc/index.html b/inc/index.html
new file mode 100755
index 00000000..e69de29b
diff --git a/inc/rain.tpl.class.php b/inc/rain.tpl.class.php
new file mode 100755
index 00000000..ea83b2c1
--- /dev/null
+++ b/inc/rain.tpl.class.php
@@ -0,0 +1,1043 @@
+), stylesheet (), script (