add Twig & refactor poche

author: Nicolas Lœuillet <nicolas.loeuillet@gmail.com> 2013-08-02 22:40:51 +0200
committer: Nicolas Lœuillet <nicolas.loeuillet@gmail.com> 2013-08-02 22:40:51 +0200
commit: a4565e88edbc8e3bd092a475469769c86a4c350c (patch)
tree: a6a3c935b03a23ff87575c8c315cf8ba78fe68c2 /inc/3rdparty
parent: f6c9baab3efeec1d0efa151e276fc08d5b58f9e9 (diff)
download: wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.gz
wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.zst
wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.zip
5 files changed, 3366 insertions, 0 deletions
diff --git a/inc/3rdparty/Encoding.php b/inc/3rdparty/Encoding.php
new file mode 100644
index 00000000..577763b4
--- /dev/null
+++ b/inc/3rdparty/Encoding.php
@@ -0,0 +1,262 @@
+<?php
+/**
+ * @author   "Sebastián Grignoli" <grignoli@framework2.com.ar>
+ * @package  Encoding
+ * @version  1.1
+ * @link     http://www.framework2.com.ar/dzone/forceUTF8-es/
+ * @example  http://www.framework2.com.ar/dzone/forceUTF8-es/
+  */
+class Encoding {
+  protected static $win1252ToUtf8 = array( // Windows-1252 byte value (0x80-0x9F range) => UTF-8 sequence of that character
+        128 => "\xe2\x82\xac",
+        130 => "\xe2\x80\x9a",
+        131 => "\xc6\x92",
+        132 => "\xe2\x80\x9e",
+        133 => "\xe2\x80\xa6",
+        134 => "\xe2\x80\xa0",
+        135 => "\xe2\x80\xa1",
+        136 => "\xcb\x86",
+        137 => "\xe2\x80\xb0",
+        138 => "\xc5\xa0",
+        139 => "\xe2\x80\xb9",
+        140 => "\xc5\x92",
+        142 => "\xc5\xbd",
+        145 => "\xe2\x80\x98",
+        146 => "\xe2\x80\x99",
+        147 => "\xe2\x80\x9c",
+        148 => "\xe2\x80\x9d",
+        149 => "\xe2\x80\xa2",
+        150 => "\xe2\x80\x93",
+        151 => "\xe2\x80\x94",
+        152 => "\xcb\x9c",
+        153 => "\xe2\x84\xa2",
+        154 => "\xc5\xa1",
+        155 => "\xe2\x80\xba",
+        156 => "\xc5\x93",
+        158 => "\xc5\xbe",
+        159 => "\xc5\xb8"
+  );
+    protected static $brokenUtf8ToUtf8 = array( // C1 control char encoded as UTF-8 (Windows-1252 read as ISO 8859-1) => intended character
+        "\xc2\x80" => "\xe2\x82\xac",
+        "\xc2\x82" => "\xe2\x80\x9a",
+        "\xc2\x83" => "\xc6\x92",
+        "\xc2\x84" => "\xe2\x80\x9e",
+        "\xc2\x85" => "\xe2\x80\xa6",
+        "\xc2\x86" => "\xe2\x80\xa0",
+        "\xc2\x87" => "\xe2\x80\xa1",
+        "\xc2\x88" => "\xcb\x86",
+        "\xc2\x89" => "\xe2\x80\xb0",
+        "\xc2\x8a" => "\xc5\xa0",
+        "\xc2\x8b" => "\xe2\x80\xb9",
+        "\xc2\x8c" => "\xc5\x92",
+        "\xc2\x8e" => "\xc5\xbd",
+        "\xc2\x91" => "\xe2\x80\x98",
+        "\xc2\x92" => "\xe2\x80\x99",
+        "\xc2\x93" => "\xe2\x80\x9c",
+        "\xc2\x94" => "\xe2\x80\x9d",
+        "\xc2\x95" => "\xe2\x80\xa2",
+        "\xc2\x96" => "\xe2\x80\x93",
+        "\xc2\x97" => "\xe2\x80\x94",
+        "\xc2\x98" => "\xcb\x9c",
+        "\xc2\x99" => "\xe2\x84\xa2",
+        "\xc2\x9a" => "\xc5\xa1",
+        "\xc2\x9b" => "\xe2\x80\xba",
+        "\xc2\x9c" => "\xc5\x93",
+        "\xc2\x9e" => "\xc5\xbe",
+        "\xc2\x9f" => "\xc5\xb8"
+  );
+  protected static $utf8ToWin1252 = array( // inverse of $win1252ToUtf8: UTF-8 sequence => single Windows-1252 byte
+       "\xe2\x82\xac" => "\x80",
+       "\xe2\x80\x9a" => "\x82",
+       "\xc6\x92"     => "\x83",
+       "\xe2\x80\x9e" => "\x84",
+       "\xe2\x80\xa6" => "\x85",
+       "\xe2\x80\xa0" => "\x86",
+       "\xe2\x80\xa1" => "\x87",
+       "\xcb\x86"     => "\x88",
+       "\xe2\x80\xb0" => "\x89",
+       "\xc5\xa0"     => "\x8a",
+       "\xe2\x80\xb9" => "\x8b",
+       "\xc5\x92"     => "\x8c",
+       "\xc5\xbd"     => "\x8e",
+       "\xe2\x80\x98" => "\x91",
+       "\xe2\x80\x99" => "\x92",
+       "\xe2\x80\x9c" => "\x93",
+       "\xe2\x80\x9d" => "\x94",
+       "\xe2\x80\xa2" => "\x95",
+       "\xe2\x80\x93" => "\x96",
+       "\xe2\x80\x94" => "\x97",
+       "\xcb\x9c"     => "\x98",
+       "\xe2\x84\xa2" => "\x99",
+       "\xc5\xa1"     => "\x9a",
+       "\xe2\x80\xba" => "\x9b",
+       "\xc5\x93"     => "\x9c",
+       "\xc5\xbe"     => "\x9e",
+       "\xc5\xb8"     => "\x9f"
+    );
+  static function toUTF8($text){
+  /**
+   * Function Encoding::toUTF8
+   *
+   * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8.
+   *
+   * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1.
+   *
+   * It may fail to convert characters to UTF-8 if they fall into one of these scenarios:
+   *
+   * 1) when any of these characters:   ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
+   *    are followed by any of these:  ("group B")
+   *                                    ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶•¸¹º»¼½¾¿
+   * For example:   %ABREPRESENT%C9%BB. «REPRESENTÉ»
+   * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB)
+   * is also a valid unicode character, and will be left unchanged.
+   *
+   * 2) when any of these: àáâãäåæçèéêëìíîï  are followed by TWO chars from group B,
+   * 3) when any of these: ðñòó  are followed by THREE chars from group B.
+   *
+   * @name toUTF8
+   * @param mixed $text  A string, or an array of values (converted recursively). Any other type is returned untouched.
+   * @return mixed  The same value, UTF8 encoded
+   *
+   */
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::toUTF8($v);
+      }
+      return $text;
+    } elseif(is_string($text)) {
+      $max = strlen($text);
+      $buf = "";
+      for($i = 0; $i < $max; $i++){
+          $c1 = $text[$i]; // square brackets: the curly-brace offset syntax is gone in PHP 8
+          if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already
+            $c2 = $i+1 >= $max? "\x00" : $text[$i+1]; // look-ahead bytes; "\x00" stands in past the end of the string
+            $c3 = $i+2 >= $max? "\x00" : $text[$i+2];
+            $c4 = $i+3 >= $max? "\x00" : $text[$i+3];
+              if($c1 >= "\xc0" && $c1 <= "\xdf"){ //looks like 2 bytes UTF8
+                  if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+                      $buf .= $c1 . $c2;
+                      $i++;
+                  } else { //not valid UTF8.  Convert it.
+                      $cc1 = (chr(ord($c1) >> 6) | "\xc0"); // >> 6 equals the integer part of / 64, without passing a float to chr()
+                      $cc2 = ($c1 & "\x3f") | "\x80";
+                      $buf .= $cc1 . $cc2;
+                  }
+              } elseif($c1 >= "\xe0" && $c1 <= "\xef"){ //looks like 3 bytes UTF8
+                  if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+                      $buf .= $c1 . $c2 . $c3;
+                      $i = $i + 2;
+                  } else { //not valid UTF8.  Convert it.
+                      $cc1 = (chr(ord($c1) >> 6) | "\xc0");
+                      $cc2 = ($c1 & "\x3f") | "\x80";
+                      $buf .= $cc1 . $cc2;
+                  }
+              } elseif($c1 >= "\xf0" && $c1 <= "\xf7"){ //looks like 4 bytes UTF8
+                  if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already
+                      $buf .= $c1 . $c2 . $c3 . $c4; // copy all four bytes of the sequence...
+                      $i = $i + 3;                   // ...and skip the three continuation bytes
+                  } else { //not valid UTF8.  Convert it.
+                      $cc1 = (chr(ord($c1) >> 6) | "\xc0");
+                      $cc2 = ($c1 & "\x3f") | "\x80";
+                      $buf .= $cc1 . $cc2;
+                  }
+              } else { //doesn't look like UTF8, but should be converted
+                      $cc1 = (chr(ord($c1) >> 6) | "\xc0");
+                      $cc2 = (($c1 & "\x3f") | "\x80");
+                      $buf .= $cc1 . $cc2;
+              }
+          } elseif(($c1 & "\xc0") == "\x80"){ // needs conversion
+                if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases
+                    $buf .= self::$win1252ToUtf8[ord($c1)];
+                } else {
+                  $cc1 = (chr(ord($c1) >> 6) | "\xc0");
+                  $cc2 = (($c1 & "\x3f") | "\x80");
+                  $buf .= $cc1 . $cc2;
+                }
+          } else { // it doesn't need conversion
+              $buf .= $c1;
+          }
+      }
+      return $buf;
+    } else {
+      return $text;
+    }
+  }
+  // Normalises $text to UTF-8 first, then converts it to Windows-1252; arrays are converted recursively.
+  static function toWin1252($text) {
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::toWin1252($v);
+      }
+      return $text;
+    } elseif(is_string($text)) {
+      return utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)));
+    } else {
+      return $text;
+    }
+  }
+  static function toISO8859($text) { // alias of toWin1252()
+    return self::toWin1252($text);
+  }
+  static function toLatin1($text) { // alias of toWin1252()
+    return self::toWin1252($text);
+  }
+  // Repairs garbled UTF-8: each pass decodes one UTF-8 layer and re-encodes it with toUTF8(), until the text is stable.
+  static function fixUTF8($text){
+    if(is_array($text)) {
+      foreach($text as $k => $v) {
+        $text[$k] = self::fixUTF8($v);
+      }
+      return $text;
+    }
+    $last = "";
+    while($last <> $text){
+      $last = $text;
+      $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
+    }
+    $text = self::toUTF8(utf8_decode(str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), $text)));
+    return $text;
+  }
+  static function UTF8FixWin1252Chars($text){
+    // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1
+    // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it.
+    // See: http://en.wikipedia.org/wiki/Windows-1252
+    return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text);
+  }
+  static function removeBOM($str=""){ // strips a leading UTF-8 byte order mark (EF BB BF), if present
+    if(substr($str, 0,3) == pack("CCC",0xef,0xbb,0xbf)) {
+      $str=substr($str, 3);
+    }
+    return $str;
+  }
+}
+\ No newline at end of file
diff --git a/inc/3rdparty/JSLikeHTMLElement.php b/inc/3rdparty/JSLikeHTMLElement.php
new file mode 100644
index 00000000..238ba8a8
--- /dev/null
+++ b/inc/3rdparty/JSLikeHTMLElement.php
@@ -0,0 +1,109 @@
+<?php
+/**
+* JavaScript-like HTML DOM Element
+*
+* This class extends PHP's DOMElement to allow
+* users to get and set the innerHTML property of
+* HTML elements in the same way it's done in 
+* JavaScript.
+*
+* Example usage:
+* @code
+* require_once 'JSLikeHTMLElement.php';
+* header('Content-Type: text/plain');
+* $doc = new DOMDocument();
+* $doc->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+* $doc->loadHTML('<div><p>Para 1</p><p>Para 2</p></div>');
+* $elem = $doc->getElementsByTagName('div')->item(0);
+* 
+* // print innerHTML
+* echo $elem->innerHTML; // prints '<p>Para 1</p><p>Para 2</p>'
+* echo "\n\n";
+* 
+* // set innerHTML
+* $elem->innerHTML = '<a href="http://fivefilters.org">FiveFilters.org</a>';
+* echo $elem->innerHTML; // prints '<a href="http://fivefilters.org">FiveFilters.org</a>'
+* echo "\n\n";
+* 
+* // print document (with our changes)
+* echo $doc->saveXML();
+* @endcode
+*
+* @author Keyvan Minoukadeh - http://www.keyvan.net - keyvan@keyvan.net
+* @see http://fivefilters.org (the project this was written for)
+*/
+class JSLikeHTMLElement extends DOMElement
+{
+        /**
+        * Magic setter that emulates assigning to innerHTML in JavaScript:
+        * @code
+        * $div->innerHTML = '<h2>Chapter 2</h2><p>The story begins...</p>';
+        * @endcode
+        * Existing children are removed first. Any other property name raises an E_USER_NOTICE.
+        * @param string $name  property being written
+        * @param string $value markup to use as the new content
+        */
+        public function __set($name, $value) {
+                if ($name != 'innerHTML') {
+                        $trace = debug_backtrace();
+                        trigger_error('Undefined property via __set(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
+                        return;
+                }
+                // empty the element, walking the child list from the end
+                $index = $this->childNodes->length;
+                while ($index-- > 0) {
+                        $this->removeChild($this->childNodes->item($index));
+                }
+                if ($value == '') {
+                        return; // nothing to insert, the element stays empty
+                }
+                $fragment = $this->ownerDocument->createDocumentFragment();
+                // appendXML() expects well-formed markup (XHTML); @ suppresses PHP warnings
+                if (@$fragment->appendXML($value)) {
+                        if ($fragment->hasChildNodes()) $this->appendChild($fragment);
+                        return;
+                }
+                // $value is probably ill-formed: fall back to the lenient HTML parser
+                $scratchDoc = new DOMDocument();
+                $value = mb_convert_encoding($value, 'HTML-ENTITIES', 'UTF-8');
+                // The made-up <htmlfragment> wrapper triggers a warning, but so would the bad HTML
+                // we have at this point, so the warning is suppressed. The wrapper lets us find our
+                // nodes again, because loadHTML() wraps a fragment in <html><body> tags which we
+                // don't want to keep. Despite the warning, loadHTML() returns true on success.
+                if (!@$scratchDoc->loadHTML('<htmlfragment>'.$value.'</htmlfragment>')) {
+                        return; // oh well, we tried; the element is left empty
+                }
+                $wrapper = $scratchDoc->getElementsByTagName('htmlfragment')->item(0);
+                foreach ($wrapper->childNodes as $child) {
+                        // nodes parsed in the scratch document are imported into ours before being attached
+                        $this->appendChild($this->ownerDocument->importNode($child, true));
+                }
+        }
+        /**
+        * Magic getter that emulates reading innerHTML in JavaScript:
+        * @code
+        * $string = $div->innerHTML;
+        * @endcode
+        * Any other property name raises an E_USER_NOTICE and yields null.
+        */
+        public function __get($name)
+        {
+                if ($name != 'innerHTML') {
+                        $trace = debug_backtrace();
+                        trigger_error('Undefined property via __get(): '.$name.' in '.$trace[0]['file'].' on line '.$trace[0]['line'], E_USER_NOTICE);
+                        return null;
+                }
+                $serialized = array();
+                foreach ($this->childNodes as $child) {
+                        $serialized[] = $this->ownerDocument->saveXML($child);
+                }
+                return implode('', $serialized);
+        }
+        public function __toString()
+        {
+                return sprintf('[%s]', $this->tagName);
+        }
+}
+\ No newline at end of file
diff --git a/inc/3rdparty/Readability.php b/inc/3rdparty/Readability.php
new file mode 100644
index 00000000..e1e8738b
--- /dev/null
+++ b/inc/3rdparty/Readability.php
@@ -0,0 +1,1137 @@
+<?php
+/** 
+* Arc90's Readability ported to PHP for FiveFilters.org
+* Based on readability.js version 1.7.1 (without multi-page support)
+* Updated to allow HTML5 parsing with html5lib
+* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
+* ------------------------------------------------------
+* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
+* Arc90's project URL: http://lab.arc90.com/experiments/readability/
+* JS Source: http://code.google.com/p/arc90labs-readability
+* Ported by: Keyvan Minoukadeh, http://www.keyvan.net
+* More information: http://fivefilters.org/content-only/
+* License: Apache License, Version 2.0
+* Requires: PHP5
+* Date: 2012-09-19
+* 
+* Differences between the PHP port and the original
+* ------------------------------------------------------
+* Arc90's Readability is designed to run in the browser. It works on the DOM 
+* tree (the parsed HTML) after the page's CSS styles have been applied and 
+* Javascript code executed. This PHP port does not run inside a browser. 
+* We use PHP's ability to parse HTML to build our DOM tree, but we cannot 
+* rely on CSS or Javascript support. As such, the results will not always 
+* match Arc90's Readability. (For example, if a web page contains CSS style 
+* rules or Javascript code which hide certain HTML elements from display, 
+* Arc90's Readability will dismiss those from consideration but our PHP port, 
+* unable to understand CSS or Javascript, will not know any better.)
+* 
+* Another significant difference is that the aim of Arc90's Readability is 
+* to re-present the main content block of a given web page so users can 
+* read it more easily in their browsers. Correct identification, clean up, 
+* and separation of the content block is only a part of this process. 
+* This PHP port is only concerned with this part, it does not include code 
+* that relates to presentation in the browser - Arc90 already do 
+* that extremely well, and for PDF output there's FiveFilters.org's 
+* PDF Newspaper: http://fivefilters.org/pdf-newspaper/.
+* 
+* Finally, this class contains methods that might be useful for developers 
+* working on HTML document fragments. So without deviating too much from 
+* the original code (which I don't want to do because it makes debugging 
+* and updating more difficult), I've tried to make it a little more 
+* developer friendly. You should be able to use the methods here on 
+* existing DOMElement objects without passing an entire HTML document to 
+* be parsed.
+*/
+// This class allows us to do JavaScript-like assignments to innerHTML
+require_once(dirname(__FILE__).'/JSLikeHTMLElement.php');
+// Alternative usage (for testing only!)
+// uncomment the lines below and call Readability.php in your browser 
+// passing it the URL of the page you'd like content from, e.g.:
+// Readability.php?url=http://medialens.org/alerts/09/090615_the_guardian_climate.php
+/*
+if (!isset($_GET['url']) || $_GET['url'] == '') {
+    die('Please pass a URL to the script. E.g. Readability.php?url=bla.com/story.html');
+}
+$url = $_GET['url'];
+if (!preg_match('!^https?://!i', $url)) $url = 'http://'.$url;
+$html = file_get_contents($url);
+$r = new Readability($html, $url);
+$r->init();
+echo $r->articleContent->innerHTML;
+*/
+class Readability
+{
+    public $version = '1.7.1-without-multi-page';
+    public $convertLinksToFootnotes = false;
+    public $revertForcedParagraphElements = true;
+    public $articleTitle;
+    public $articleContent;
+    public $dom;
+    public $url = null; // optional - URL where HTML was retrieved
+    public $debug = false;
+    public $lightClean = true; // preserves more content (experimental) added 2012-09-19
+    protected $body = null; // 
+    protected $bodyCache = null; // Cache the body HTML in case we need to re-use it later
+    protected $flags = 7; // 1 | 2 | 4;   // Start with all flags set.
+    protected $success = false; // indicates whether we were able to extract or not
+    
+    /**
+    * All of the regular expressions in use within readability.
+    * Defined up here so we don't instantiate them repeatedly in loops.
+    **/
+    public $regexps = array(
+        'unlikelyCandidates' => '/combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i',
+        'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
+        'positive' => '/article|body|content|entry|hentry|main|page|attachment|pagination|post|text|blog|story/i',
+        'negative' => '/combx|comment|com-|contact|foot|footer|_nav|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i',
+        'divToPElements' => '/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i',
+        'replaceBrs' => '/(<br[^>]*>[ \n\r\t]*){2,}/i',
+        'replaceFonts' => '/<(\/?)font[^>]*>/i',
+        // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
+        'normalize' => '/\s{2,}/',
+        'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
+        'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
+        'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
+    );  
+    
+    /* constants */
+    const FLAG_STRIP_UNLIKELYS = 1;
+    const FLAG_WEIGHT_CLASSES = 2;
+    const FLAG_CLEAN_CONDITIONALLY = 4;
+    
+    /**
+    * Create instance of Readability
+    * @param string $html UTF-8 encoded string
+    * @param string $url (optional) URL associated with HTML (used for footnotes)
+    * @param string $parser which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
+    */  
+    function __construct($html, $url=null, $parser='libxml')
+    {
+        $this->url = $url;
+        // Fix-ups on the raw markup, done before parsing: runs of <br> become paragraph breaks and <font> tags become <span>
+        $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
+        $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
+        $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
+        if (trim($html) == '') { $html = '<html></html>'; }
+        // html5lib is only attempted when asked for; a falsy result falls back to libxml
+        $this->dom = ($parser == 'html5lib') ? HTML5_Parser::parse($html) : null;
+        if (!$this->dom) {
+            $this->dom = new DOMDocument();
+            $this->dom->preserveWhiteSpace = false;
+            @$this->dom->loadHTML($html); // @ hides the warnings libxml emits for sloppy markup
+        }
+        $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
+    }
+    /**
+    * Get article title element
+    * @return DOMElement
+    */
+    public function getTitle() {
+        return $this->articleTitle; // assigned at the end of init(); null until then
+    }
+    
+    /**
+    * Get article content element
+    * @return DOMElement
+    */
+    public function getContent() {
+        return $this->articleContent; // assigned at the end of init(); null until then
+    }   
+    
+    /**
+    * Runs readability.
+    * 
+    * Workflow:
+    *  1. Prep the document by removing script tags, css, etc.
+    *  2. Build readability's DOM tree.
+    *  3. Grab the article content from the current dom tree.
+    *  4. Replace the current DOM tree with the new one.
+    *  5. Read peacefully.
+    *
+    * @return boolean true if we found content, false otherwise
+    **/
+    public function init()
+    {
+        if (!isset($this->dom->documentElement)) return false; // no root element: nothing to extract from
+        $this->removeScripts($this->dom);
+        //die($this->getInnerHTML($this->dom->documentElement));
+        
+        // Assume successful outcome; reset to false below if no article content is found
+        $this->success = true;
+        $bodyElems = $this->dom->getElementsByTagName('body');
+        if ($bodyElems->length > 0) {
+            if ($this->bodyCache == null) {
+                $this->bodyCache = $bodyElems->item(0)->innerHTML; // keep the original body markup for later re-use
+            }
+            if ($this->body == null) {
+                $this->body = $bodyElems->item(0);
+            }
+        }
+        $this->prepDocument(); // creates a body element if none was found above
+        
+        //die($this->dom->documentElement->parentNode->nodeType);
+        //$this->setInnerHTML($this->dom->documentElement, $this->getInnerHTML($this->dom->documentElement));
+        //die($this->getInnerHTML($this->dom->documentElement));
+        /* Build readability's DOM tree: div#readOverlay > div#readInner > (title, content) */
+        $overlay        = $this->dom->createElement('div');
+        $innerDiv       = $this->dom->createElement('div');
+        $articleTitle   = $this->getArticleTitle();
+        $articleContent = $this->grabArticle();
+        if (!$articleContent) {
+            // extraction failed: report failure and substitute a placeholder message
+            $this->success = false;
+            $articleContent = $this->dom->createElement('div');
+            $articleContent->setAttribute('id', 'readability-content');
+            $articleContent->innerHTML = '<p>Sorry, Readability was unable to parse this page for content.</p>';        
+        }
+        
+        $overlay->setAttribute('id', 'readOverlay');
+        $innerDiv->setAttribute('id', 'readInner');
+        /* Glue the structure of our document together. */
+        $innerDiv->appendChild($articleTitle);
+        $innerDiv->appendChild($articleContent);
+        $overlay->appendChild($innerDiv);
+        
+        /* Clear the old HTML, insert the new content. */
+        $this->body->innerHTML = '';
+        $this->body->appendChild($overlay);
+        //document.body.insertBefore(overlay, document.body.firstChild);
+        $this->body->removeAttribute('style');
+        $this->postProcessContent($articleContent);
+        
+        // Set title and content instance variables (read back through getTitle() / getContent())
+        $this->articleTitle = $articleTitle;
+        $this->articleContent = $articleContent;
+        
+        return $this->success;
+    }
+    
+    /**
+    * Debug
+    */
+    protected function dbg($msg) {
+        if ($this->debug) echo '* ',$msg, "\n"; // prints "* <msg>" plus a newline, only when $this->debug is set
+    }
+    
+    /**
+    * Run any post-process modifications to article content as necessary.
+    *
+    * @param DOMElement
+    * @return void
+    */
+    public function postProcessContent($articleContent) {
+        // footnotes are opt-in, and are never added when the page URL matches wikipedia.org
+        if (!$this->convertLinksToFootnotes || preg_match('/wikipedia\.org/', @$this->url)) return;
+        $this->addFootnotes($articleContent);
+    }
+    
+    /**
+    * Get the article title as an H1.
+    *
+    * @return DOMElement
+    */
+    protected function getArticleTitle() {
+        $curTitle = '';
+        $origTitle = '';
+        try {
+            $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
+        } catch(Exception $e) {}
+        // "Story | Site" or "Story - Site": first keep what precedes the last separator
+        if (preg_match('/ [\|\-] /', $curTitle))
+        {
+            $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
+            // too short to be a headline: keep what follows the first separator instead
+            if (count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
+            }
+        }
+        else if (strpos($curTitle, ': ') !== false)
+        {
+            $curTitle = preg_replace('/.*:(.*)/i', '$1', $origTitle);
+            if (count(explode(' ', $curTitle)) < 3) {
+                $curTitle = preg_replace('/[^:]*[:](.*)/i','$1', $origTitle);
+            }
+        }
+        else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
+        {
+            $hOnes = $this->dom->getElementsByTagName('h1');
+            if($hOnes->length == 1)
+            {
+                $curTitle = $this->getInnerText($hOnes->item(0));
+            }
+        }
+        $curTitle = trim($curTitle);
+        if (count(explode(' ', $curTitle)) <= 4) {
+            $curTitle = $origTitle;
+        }
+        // The title is plain text (NOTE(review): assumes getInnerText() returns text, not markup - confirm),
+        $articleTitle = $this->dom->createElement('h1');
+        if ($curTitle != '') $articleTitle->appendChild($this->dom->createTextNode($curTitle));
+        // so it is attached as a text node: assigning it to innerHTML would parse '<' and '&' as markup.
+        return $articleTitle;
+    }
+    
+    /**
+    * Prepare the HTML document for readability to scrape it.
+    * This includes things like stripping javascript, CSS, and handling terrible markup.
+    * 
+    * @return void
+    **/
+    protected function prepDocument() {
+        // Totally hosed HTML can leave us without a body element;
+        // in that case a new body node is created and appended to the document root.
+        if ($this->body == null) {
+            $newBody = $this->dom->createElement('body');
+            $this->dom->documentElement->appendChild($newBody);
+            $this->body = $newBody;
+        }
+        $this->body->setAttribute('id', 'readabilityBody');
+        // Remove every <style> element of the document, walking the node list backwards
+        $styleTags = $this->dom->getElementsByTagName('style');
+        $index = $styleTags->length;
+        while ($index-- > 0) {
+            $styleTag = $styleTags->item($index);
+            $styleTag->parentNode->removeChild($styleTag);
+        }
+        // The JavaScript original turns double <br>s into <p>s at this point:
+        //document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
+        // In PHP this is done in the constructor instead, because that is when
+        // we still have the raw HTML - before it is parsed into a DOM tree.
+        // Manipulating innerHTML the way the JS version does is not possible here.
+        // Nothing else needs to be prepared.
+    }
+    /**
+    * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
+    * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
+    *
+    * @return void
+    **/
+    public function addFootnotes($articleContent) {
+        $footnotesWrapper = $this->dom->createElement('div');
+        $footnotesWrapper->setAttribute('id', 'readability-footnotes');
+        $footnotesWrapper->innerHTML = '<h3>References</h3>';
+        
+        $articleFootnotes = $this->dom->createElement('ol');
+        $articleFootnotes->setAttribute('id', 'readability-footnotes-list');
+        $footnotesWrapper->appendChild($articleFootnotes);
+        // The reference links inserted below also show up in this list; their class makes the loop skip them.
+        $articleLinks = $articleContent->getElementsByTagName('a');
+        
+        $linkCount = 0;
+        for ($i = 0; $i < $articleLinks->length; $i++)
+        {
+            $articleLink  = $articleLinks->item($i);
+            $footnoteLink = $articleLink->cloneNode(true);
+            $refLink      = $this->dom->createElement('a');
+            $footnote     = $this->dom->createElement('li');
+            $linkDomain   = @parse_url($footnoteLink->getAttribute('href'), PHP_URL_HOST);
+            if (!$linkDomain && isset($this->url)) $linkDomain = @parse_url($this->url, PHP_URL_HOST);
+            //linkDomain   = footnoteLink.host ? footnoteLink.host : document.location.host,
+            $linkText     = $this->getInnerText($articleLink);
+            
+            if ((strpos($articleLink->getAttribute('class'), 'readability-DoNotFootnote') !== false) || preg_match($this->regexps['skipFootnoteLink'], $linkText)) {
+                continue;
+            }
+            
+            $linkCount++;
+            /** Add a superscript reference after the article link */
+            $refLink->setAttribute('href', '#readabilityFootnoteLink-' . $linkCount);
+            $refLink->innerHTML = '<small><sup>[' . $linkCount . ']</sup></small>';
+            $refLink->setAttribute('class', 'readability-DoNotFootnote');
+            $refLink->setAttribute('style', 'color: inherit;');
+            
+            // isSameNode() tests node identity; a loose == on DOM objects does not
+            if ($articleLink->parentNode->lastChild->isSameNode($articleLink)) {
+                $articleLink->parentNode->appendChild($refLink);
+            } else {
+                $articleLink->parentNode->insertBefore($refLink, $articleLink->nextSibling);
+            }
+            $articleLink->setAttribute('style', 'color: inherit; text-decoration: none;');
+            $articleLink->setAttribute('name', 'readabilityLink-' . $linkCount);
+            $footnote->innerHTML = '<small><sup><a href="#readabilityLink-' . $linkCount . '" title="Jump to Link in Article">^</a></sup></small> ';
+            $footnoteLink->innerHTML = htmlspecialchars($footnoteLink->getAttribute('title') != '' ? $footnoteLink->getAttribute('title') : $linkText, ENT_QUOTES, 'UTF-8');
+            $footnoteLink->setAttribute('name', 'readabilityFootnoteLink-' . $linkCount);
+            // Title, link text and host come from the scraped page and are text, so they are escaped before going through innerHTML.
+            $footnote->appendChild($footnoteLink);
+            if ($linkDomain) $footnote->innerHTML = $footnote->innerHTML . '<small> (' . htmlspecialchars($linkDomain, ENT_QUOTES, 'UTF-8') . ')</small>';
+            
+            $articleFootnotes->appendChild($footnote);
+        }
+        if ($linkCount > 0) {
+            $articleContent->appendChild($footnotesWrapper);           
+        }
+    }
+    /**
+    * Reverts P elements with class 'readability-styled'
+    * to text nodes - which is what they were before.
+    *
+    * @param DOMElement
+    * @return void
+    */
+    function revertReadabilityStyledElements($articleContent) {
+        $doc = $articleContent->ownerDocument;
+        $xpath = new DOMXPath($doc);
+        $styled = $xpath->query('.//p[@class="readability-styled"]', $articleContent);
+        // walk the matches from last to first, swapping each paragraph for a text node with the same text
+        $index = $styled->length;
+        while ($index-- > 0) {
+            $paragraph = $styled->item($index);
+            $plainText = $doc->createTextNode($paragraph->textContent);
+            $paragraph->parentNode->replaceChild($plainText, $paragraph);
+        }
+    }
+    
+    /**
+    * Tidy the extracted article element for display: strips inline styles,
+    * collapses breaks, removes forms/objects/iframes/spurious headers and
+    * drops paragraphs that contain neither text nor media.
+    *
+    * @param DOMElement $articleContent the article container to clean in place
+    * @return void
+    */
+    function prepArticle($articleContent) {
+        $this->cleanStyles($articleContent);
+        $this->killBreaks($articleContent);
+        if ($this->revertForcedParagraphElements) {
+            $this->revertReadabilityStyledElements($articleContent);
+        }
+
+        /* Junk removal */
+        $this->cleanConditionally($articleContent, 'form');
+        $this->clean($articleContent, 'object');
+        $this->clean($articleContent, 'h1');
+
+        /**
+        * A lone h2 is most likely being used as the page header rather than
+        * a subheader; we already have a header, so drop it (unless light cleaning).
+        **/
+        if (!$this->lightClean) {
+            $h2Count = $articleContent->getElementsByTagName('h2')->length;
+            if ($h2Count == 1) {
+                $this->clean($articleContent, 'h2');
+            }
+        }
+        $this->clean($articleContent, 'iframe');
+        $this->cleanHeaders($articleContent);
+
+        /* These run last because the earlier passes may have removed junk that affects them */
+        foreach (array('table', 'ul', 'div') as $lateTag) {
+            $this->cleanConditionally($articleContent, $lateTag);
+        }
+
+        /* Remove paragraphs that hold no media and no text */
+        $paragraphs = $articleContent->getElementsByTagName('p');
+        for ($idx = $paragraphs->length - 1; $idx >= 0; $idx--) {
+            $para = $paragraphs->item($idx);
+            $mediaCount = $para->getElementsByTagName('img')->length
+                        + $para->getElementsByTagName('embed')->length
+                        + $para->getElementsByTagName('object')->length
+                        + $para->getElementsByTagName('iframe')->length;
+
+            if ($mediaCount === 0 && $this->getInnerText($para, false) == '') {
+                $para->parentNode->removeChild($para);
+            }
+        }
+
+        /* Strip any <br> that sits directly in front of a <p> */
+        try {
+            $markup = $articleContent->innerHTML;
+            $articleContent->innerHTML = preg_replace('/<br[^>]*>\s*<p/i', '<p', $markup);
+        } catch (Exception $e) {
+            $this->dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " . $e);
+        }
+    }
+    
+    /**
+    * Attach a 'readability' attribute (the content score) to a node.
+    * The score starts at 0, is adjusted by a per-tag bonus or penalty and
+    * then by the class/id weight reported by getClassWeight().
+    *
+    * @param DOMElement $node element to initialise
+    * @return void
+    **/
+    protected function initializeNode($node) {
+        /* Score adjustment per (upper-cased) tag name; tags not listed get none */
+        $tagBonus = array(
+            'DIV'        => 5,
+            'PRE'        => 3,
+            'TD'         => 3,
+            'BLOCKQUOTE' => 3,
+            'ADDRESS'    => -3,
+            'OL'         => -3,
+            'UL'         => -3,
+            'DL'         => -3,
+            'DD'         => -3,
+            'DT'         => -3,
+            'LI'         => -3,
+            'FORM'       => -3,
+            'H1'         => -5,
+            'H2'         => -5,
+            'H3'         => -5,
+            'H4'         => -5,
+            'H5'         => -5,
+            'H6'         => -5,
+            'TH'         => -5
+        );
+
+        $scoreAttr = $this->dom->createAttribute('readability');
+        $scoreAttr->value = 0; // this is our contentScore
+        $node->setAttributeNode($scoreAttr);
+
+        $tag = strtoupper($node->tagName); // upper-cased defensively, as before
+        if (isset($tagBonus[$tag])) {
+            $scoreAttr->value += $tagBonus[$tag];
+        }
+        $scoreAttr->value += $this->getClassWeight($node);
+    }
+    
+    /***
+    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
+    *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
+    *
+    * Stages:
+    *  1. Prepare nodes: drop unlikely candidates (when FLAG_STRIP_UNLIKELYS is active), turn divs without
+    *     block-level children into p elements, and wrap loose text nodes of other divs in styled p elements.
+    *  2. Score every P/TD/PRE and credit the score to its parent (full) and grandparent (half).
+    *  3. Scale candidate scores by link density and pick the highest one.
+    *  4. Collect the top candidate and related siblings into a new 'readability-content' div.
+    *  5. Clean the result with prepArticle(); if fewer than 250 characters of text remain, restore the
+    *     cached body and retry with one more flag switched off.
+    *
+    * Scores are stored in the 'readability' attribute and become fractional once scaled by link density,
+    * so they are read back as floats here. Truncating them to integers could let a lower-scoring
+    * candidate displace a higher-scoring one.
+    *
+    * @param DOMDocument|DOMElement|null $page node to search; defaults to $this->dom
+    * @return DOMElement|false the article container, or false when every retry has been exhausted
+    **/
+    protected function grabArticle($page=null) {
+        $stripUnlikelyCandidates = $this->flagIsActive(self::FLAG_STRIP_UNLIKELYS);
+        if (!$page) $page = $this->dom;
+        $allElements = $page->getElementsByTagName('*');
+        /**
+        * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
+        * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
+        *
+        * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
+        * TODO: Shouldn't this be a reverse traversal?
+        **/
+        $node = null;
+        $nodesToScore = array();
+        for ($nodeIndex = 0; ($node = $allElements->item($nodeIndex)); $nodeIndex++) {
+            $tagName = strtoupper($node->tagName);
+            /* Remove unlikely candidates */
+            if ($stripUnlikelyCandidates) {
+                $unlikelyMatchString = $node->getAttribute('class') . $node->getAttribute('id');
+                if (
+                    preg_match($this->regexps['unlikelyCandidates'], $unlikelyMatchString) &&
+                    !preg_match($this->regexps['okMaybeItsACandidate'], $unlikelyMatchString) &&
+                    $tagName != 'BODY'
+                )
+                {
+                    $this->dbg('Removing unlikely candidate - ' . $unlikelyMatchString);
+                    $node->parentNode->removeChild($node);
+                    /* The element list is live: step back so the element now at this index is not skipped */
+                    $nodeIndex--;
+                    continue;
+                }
+            }
+            if ($tagName == 'P' || $tagName == 'TD' || $tagName == 'PRE') {
+                $nodesToScore[] = $node;
+            }
+            /* Turn all divs that don't have children block level elements into p's */
+            if ($tagName == 'DIV') {
+                if (!preg_match($this->regexps['divToPElements'], $node->innerHTML)) {
+                    $newNode = $this->dom->createElement('p');
+                    try {
+                        $newNode->innerHTML = $node->innerHTML;
+                        $node->parentNode->replaceChild($newNode, $node);
+                        /* Revisit this index so the replacement p is examined (and queued for scoring) next */
+                        $nodeIndex--;
+                        /* The detached div has no parent, so the scoring loop below skips it */
+                        $nodesToScore[] = $node; // or $newNode?
+                    }
+                    catch(Exception $e) {
+                        $this->dbg('Could not alter div to p, reverting back to div.: ' . $e);
+                    }
+                }
+                else
+                {
+                    /* EXPERIMENTAL */
+                    // TODO: change these p elements back to text nodes after processing
+                    for ($i = 0, $il = $node->childNodes->length; $i < $il; $i++) {
+                        $childNode = $node->childNodes->item($i);
+                        if ($childNode->nodeType == 3) { // XML_TEXT_NODE
+                            $p = $this->dom->createElement('p');
+                            $p->innerHTML = $childNode->nodeValue;
+                            $p->setAttribute('style', 'display: inline;');
+                            $p->setAttribute('class', 'readability-styled');
+                            $childNode->parentNode->replaceChild($p, $childNode);
+                        }
+                    }
+                }
+            }
+        }
+
+        /**
+        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
+        * Then add their score to their parent node.
+        *
+        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
+        **/
+        $candidates = array();
+        for ($pt=0; $pt < count($nodesToScore); $pt++) {
+            $parentNode      = $nodesToScore[$pt]->parentNode;
+            $grandParentNode = !$parentNode ? null : (($parentNode->parentNode instanceof DOMElement) ? $parentNode->parentNode : null);
+            $innerText       = $this->getInnerText($nodesToScore[$pt]);
+            if (!$parentNode || !isset($parentNode->tagName)) {
+                continue;
+            }
+            /* If this paragraph is less than 25 characters, don't even count it. */
+            if(strlen($innerText) < 25) {
+                continue;
+            }
+            /* Initialize readability data for the parent. */
+            if (!$parentNode->hasAttribute('readability'))
+            {
+                $this->initializeNode($parentNode);
+                $candidates[] = $parentNode;
+            }
+            /* Initialize readability data for the grandparent. */
+            if ($grandParentNode && !$grandParentNode->hasAttribute('readability') && isset($grandParentNode->tagName))
+            {
+                $this->initializeNode($grandParentNode);
+                $candidates[] = $grandParentNode;
+            }
+            $contentScore = 0;
+            /* Add a point for the paragraph itself as a base. */
+            $contentScore++;
+            /* Add points for any commas within this paragraph (the piece count is commas + 1) */
+            $contentScore += count(explode(',', $innerText));
+
+            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
+            $contentScore += min(floor(strlen($innerText) / 100), 3);
+
+            /* Add the score to the parent. The grandparent gets half. */
+            $parentNode->getAttributeNode('readability')->value += $contentScore;
+            if ($grandParentNode) {
+                $grandParentNode->getAttributeNode('readability')->value += $contentScore/2;
+            }
+        }
+        /**
+        * After we've calculated scores, loop through all of the possible candidate nodes we found
+        * and find the one with the highest score.
+        **/
+        $topCandidate = null;
+        for ($c=0, $cl=count($candidates); $c < $cl; $c++)
+        {
+            /**
+            * Scale the final candidates score based on link density. Good content should have a
+            * relatively small link density (5% or less) and be mostly unaffected by this operation.
+            **/
+            $readability = $candidates[$c]->getAttributeNode('readability');
+            $readability->value = $readability->value * (1-$this->getLinkDensity($candidates[$c]));
+            $this->dbg('Candidate: ' . $candidates[$c]->tagName . ' (' . $candidates[$c]->getAttribute('class') . ':' . $candidates[$c]->getAttribute('id') . ') with score ' . $readability->value);
+            /* Compare as floats: scores are fractional after scaling, integer truncation would distort the ranking */
+            if (!$topCandidate || (float)$readability->value > (float)$topCandidate->getAttribute('readability')) {
+                $topCandidate = $candidates[$c];
+            }
+        }
+        /**
+        * If we still have no top candidate, just use the body as a last resort.
+        * We also have to copy the body node so it is something we can modify.
+        **/
+        if ($topCandidate === null || strtoupper($topCandidate->tagName) == 'BODY')
+        {
+            $topCandidate = $this->dom->createElement('div');
+            if ($page instanceof DOMDocument) {
+                if (!isset($page->documentElement)) {
+                    // we don't have a body either? what a mess! :)
+                } else {
+                    $topCandidate->innerHTML = $page->documentElement->innerHTML;
+                    $page->documentElement->innerHTML = '';
+                    $page->documentElement->appendChild($topCandidate);
+                }
+            } else {
+                $topCandidate->innerHTML = $page->innerHTML;
+                $page->innerHTML = '';
+                $page->appendChild($topCandidate);
+            }
+            $this->initializeNode($topCandidate);
+        }
+        /**
+        * Now that we have the top candidate, look through its siblings for content that might also be related.
+        * Things like preambles, content split by ads that we removed, etc.
+        **/
+        $articleContent        = $this->dom->createElement('div');
+        $articleContent->setAttribute('id', 'readability-content');
+        $topScore              = (float)$topCandidate->getAttribute('readability');
+        $siblingScoreThreshold = max(10, $topScore * 0.2);
+        /* The top candidate may be unattached (no document element above); avoid reading a property of null */
+        $topParent             = $topCandidate->parentNode;
+        $siblingNodes          = $topParent ? $topParent->childNodes : null;
+        if (!isset($siblingNodes)) {
+            $siblingNodes = new stdClass;
+            $siblingNodes->length = 0;
+        }
+        for ($s=0, $sl=$siblingNodes->length; $s < $sl; $s++)
+        {
+            $siblingNode = $siblingNodes->item($s);
+            $append      = false;
+            $this->dbg('Looking at sibling node: ' . $siblingNode->nodeName . (($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability')) ? (' with score ' . $siblingNode->getAttribute('readability')) : ''));
+            if ($siblingNode === $topCandidate)
+            // or if ($siblingNode->isSameNode($topCandidate))
+            {
+                $append = true;
+            }
+            $contentBonus = 0;
+            /* Give a bonus if sibling nodes and top candidates have the example same classname */
+            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->getAttribute('class') == $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') != '') {
+                $contentBonus += $topScore * 0.2;
+            }
+            if ($siblingNode->nodeType === XML_ELEMENT_NODE && $siblingNode->hasAttribute('readability') && (((float)$siblingNode->getAttribute('readability')) + $contentBonus) >= $siblingScoreThreshold)
+            {
+                $append = true;
+            }
+
+            if (strtoupper($siblingNode->nodeName) == 'P') {
+                $linkDensity = $this->getLinkDensity($siblingNode);
+                $nodeContent = $this->getInnerText($siblingNode);
+                $nodeLength  = strlen($nodeContent);
+
+                if ($nodeLength > 80 && $linkDensity < 0.25)
+                {
+                    $append = true;
+                }
+                else if ($nodeLength < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent))
+                {
+                    $append = true;
+                }
+            }
+            if ($append)
+            {
+                $this->dbg('Appending node: ' . $siblingNode->nodeName);
+                $nodeToAppend = null;
+                $sibNodeName = strtoupper($siblingNode->nodeName);
+                if ($sibNodeName != 'DIV' && $sibNodeName != 'P') {
+                    /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
+
+                    $this->dbg('Altering siblingNode of ' . $sibNodeName . ' to div.');
+                    $nodeToAppend = $this->dom->createElement('div');
+                    try {
+                        $nodeToAppend->setAttribute('id', $siblingNode->getAttribute('id'));
+                        $nodeToAppend->innerHTML = $siblingNode->innerHTML;
+                    }
+                    catch(Exception $e)
+                    {
+                        $this->dbg('Could not alter siblingNode to div, reverting back to original.');
+                        $nodeToAppend = $siblingNode;
+                        $s--;
+                        $sl--;
+                    }
+                } else {
+                    /* The sibling itself is moved, which shrinks the live sibling list by one */
+                    $nodeToAppend = $siblingNode;
+                    $s--;
+                    $sl--;
+                }
+
+                /* To ensure a node does not interfere with readability styles, remove its classnames */
+                $nodeToAppend->removeAttribute('class');
+                /* Append sibling and subtract from our list because it removes the node when you append to another node */
+                $articleContent->appendChild($nodeToAppend);
+            }
+        }
+        /**
+        * So we have all of the content that we need. Now we clean it up for presentation.
+        **/
+        $this->prepArticle($articleContent);
+        /**
+        * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
+        * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
+        * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
+        * finding the -right- content.
+        **/
+        if (strlen($this->getInnerText($articleContent, false)) < 250)
+        {
+            // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
+            // in the meantime, we check and create an empty element if it's not there.
+            if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body');
+            $this->body->innerHTML = $this->bodyCache;
+
+            /* Relax one flag per retry, in this fixed order */
+            if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
+                $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
+                return $this->grabArticle($this->body);
+            }
+            else if ($this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+                $this->removeFlag(self::FLAG_WEIGHT_CLASSES);
+                return $this->grabArticle($this->body);
+            }
+            else if ($this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+                $this->removeFlag(self::FLAG_CLEAN_CONDITIONALLY);
+                return $this->grabArticle($this->body);
+            }
+            else {
+                return false;
+            }
+        }
+        return $articleContent;
+    }
+    
+    /**
+    * Detach every <script> element found under the given document or element.
+    *
+    * @param DOMDocument|DOMElement $doc node to search
+    * @return void
+    */
+    public function removeScripts($doc) {
+        $scriptTags = $doc->getElementsByTagName('script');
+        /* Go from the end of the list towards the start, removing as we go */
+        $position = $scriptTags->length;
+        while (--$position >= 0) {
+            $script = $scriptTags->item($position);
+            $script->parentNode->removeChild($script);
+        }
+    }
+    
+    /**
+    * Return the trimmed text content of a node, optionally replacing
+    * whatever the 'normalize' regexp matches with a single space.
+    *
+    * @param DOMElement $e node to read
+    * @param boolean $normalizeSpaces (default: true)
+    * @return string empty string when the node has no text content
+    **/
+    public function getInnerText($e, $normalizeSpaces=true) {
+        $raw = isset($e->textContent) ? $e->textContent : null;
+        if ($raw === null || $raw == '') {
+            return '';
+        }
+        $trimmed = trim($raw);
+        return $normalizeSpaces
+            ? preg_replace($this->regexps['normalize'], ' ', $trimmed)
+            : $trimmed;
+    }
+    /**
+    * Count the occurrences of the string $s in the inner text of node $e.
+    *
+    * @param DOMElement $e
+    * @param string $s what to count. Default is ","
+    * @return number (integer)
+    **/
+    public function getCharCount($e, $s=',') {
+        $text = $this->getInnerText($e);
+        return substr_count($text, $s);
+    }
+    /**
+    * Strip the style attribute from every element below $e.
+    * Does nothing when $e is not an object.
+    *
+    * @param DOMElement $e
+    * @return void
+    */
+    public function cleanStyles($e) {
+        if (!is_object($e)) {
+            return;
+        }
+        $descendants = $e->getElementsByTagName('*');
+        for ($n = 0, $total = $descendants->length; $n < $total; $n++) {
+            $descendants->item($n)->removeAttribute('style');
+        }
+    }
+    
+    /**
+    * Compute the link density of a node: the length of the text found
+    * inside <a> elements divided by the length of all text in the node.
+    *
+    * @param DOMElement $e
+    * @return number 0 when the node has no text, otherwise the ratio
+    */
+    public function getLinkDensity($e) {
+        $totalLength = strlen($this->getInnerText($e));
+        if ($totalLength <= 0) {
+            return 0;
+        }
+        $anchorLength = 0;
+        foreach ($e->getElementsByTagName('a') as $anchor) {
+            $anchorLength += strlen($this->getInnerText($anchor));
+        }
+        return $anchorLength / $totalLength;
+    }
+    
+    /**
+    * Weigh an element by its class and id attributes. Each non-empty
+    * attribute costs 25 points when it matches the 'negative' regexp and
+    * earns 25 points when it matches the 'positive' regexp.
+    * Always 0 when FLAG_WEIGHT_CLASSES is not active.
+    *
+    * @param DOMElement $e
+    * @return number (Integer)
+    */
+    public function getClassWeight($e) {
+        if (!$this->flagIsActive(self::FLAG_WEIGHT_CLASSES)) {
+            return 0;
+        }
+        $total = 0;
+        /* The class name and the ID are judged by the same rules */
+        foreach (array('class', 'id') as $attrName) {
+            if (!$e->hasAttribute($attrName)) {
+                continue;
+            }
+            $attrValue = $e->getAttribute($attrName);
+            if ($attrValue == '') {
+                continue;
+            }
+            if (preg_match($this->regexps['negative'], $attrValue)) {
+                $total -= 25;
+            }
+            if (preg_match($this->regexps['positive'], $attrValue)) {
+                $total += 25;
+            }
+        }
+        return $total;
+    }
+    /**
+    * Replace whatever the 'killBreaks' regexp matches in the node's
+    * markup with a single <br />.
+    *
+    * @param DOMElement $node
+    * @return void
+    */
+    public function killBreaks($node) {
+        $node->innerHTML = preg_replace($this->regexps['killBreaks'], '<br />', $node->innerHTML);
+    }
+    /**
+    * Remove every descendant element of type $tag from $e.
+    * For iframe/object/embed, elements whose attribute values or inner
+    * markup match the 'video' regexp are kept (youtube/vimeo and the like).
+    *
+    * @param DOMElement $e
+    * @param string $tag
+    * @return void
+    */
+    public function clean($e, $tag) {
+        $found      = $e->getElementsByTagName($tag);
+        $mayBeVideo = ($tag == 'iframe' || $tag == 'object' || $tag == 'embed');
+
+        for ($pos = $found->length - 1; $pos >= 0; $pos--) {
+            $target = $found->item($pos);
+            if ($mayBeVideo) {
+                /* Join all attribute values so a single regexp test covers them */
+                $joinedValues = '';
+                foreach ($target->attributes as $attribute) {
+                    $joinedValues .= $attribute->value . '|';
+                }
+                /* Attributes are checked first, then the markup inside the element */
+                $isVideo = preg_match($this->regexps['video'], $joinedValues)
+                        || preg_match($this->regexps['video'], $target->innerHTML);
+                if ($isVideo) {
+                    continue;
+                }
+            }
+            $target->parentNode->removeChild($target);
+        }
+    }
+    
+    /**
+    * Remove descendants of type $tag that look "fishy". An element goes
+    * when its class weight plus stored content score is negative, or when
+    * it has fewer than 10 commas and trips one of the heuristics below
+    * (images vs. paragraphs, list items, inputs, content length, link
+    * density, embeds). Does nothing unless FLAG_CLEAN_CONDITIONALLY is active.
+    *
+    * @param DOMElement $e
+    * @param string $tag
+    * @return void
+    */
+    public function cleanConditionally($e, $tag) {
+        if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) {
+            return;
+        }
+        $tagsList = $e->getElementsByTagName($tag);
+        /**
+        * Walk the list backwards so nodes can be removed during the traversal.
+        *
+        * TODO: Consider taking into account original contentScore here.
+        */
+        for ($i = $tagsList->length - 1; $i >= 0; $i--) {
+            $node         = $tagsList->item($i);
+            $weight       = $this->getClassWeight($node);
+            $hasScore     = $node->hasAttribute('readability');
+            $contentScore = $hasScore ? (int)$node->getAttribute('readability') : 0;
+
+            $this->dbg('Cleaning Conditionally ' . $node->tagName . ' (' . $node->getAttribute('class') . ':' . $node->getAttribute('id') . ')' . ($hasScore ? (' with score ' . $node->getAttribute('readability')) : ''));
+            if ($weight + $contentScore < 0) {
+                $node->parentNode->removeChild($node);
+                continue;
+            }
+            /* Elements with plenty of commas are left alone */
+            if ($this->getCharCount($node, ',') >= 10) {
+                continue;
+            }
+
+            /* Gather counts for other typical elements embedded within */
+            $p     = $node->getElementsByTagName('p')->length;
+            $img   = $node->getElementsByTagName('img')->length;
+            $li    = $node->getElementsByTagName('li')->length-100;
+            $input = $node->getElementsByTagName('input')->length;
+            $a     = $node->getElementsByTagName('a')->length;
+
+            /* Only embeds/iframes whose src matches the video regexp are counted */
+            $embedCount = 0;
+            foreach (array('embed', 'iframe') as $embedTag) {
+                foreach ($node->getElementsByTagName($embedTag) as $embedded) {
+                    if (preg_match($this->regexps['video'], $embedded->getAttribute('src'))) {
+                        $embedCount++;
+                    }
+                }
+            }
+            $linkDensity   = $this->getLinkDensity($node);
+            $contentLength = strlen($this->getInnerText($node));
+
+            /* First matching rule decides; $reason stays null when the element is kept */
+            $reason = null;
+            if ($this->lightClean) {
+                $this->dbg('Light clean...');
+                if (($img > $p) && ($img > 4)) {
+                    $reason = ' more than 4 images and more image elements than paragraph elements';
+                } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                    $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
+                } elseif ($input > floor($p/3)) {
+                    $reason = ' too many <input> elements';
+                } elseif ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
+                    $reason = ' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images';
+                } elseif ($weight < 25 && $linkDensity > 0.2) {
+                    $reason = ' weight smaller than 25 and link density above 0.2';
+                } elseif ($a > 2 && ($weight >= 25 && $linkDensity > 0.5)) {
+                    $reason = ' more than 2 links and weight above 25 but link density greater than 0.5';
+                } elseif ($embedCount > 3) {
+                    $reason = ' more than 3 embeds';
+                }
+            } else {
+                $this->dbg('Standard clean...');
+                if ($img > $p) {
+                    $reason = ' more image elements than paragraph elements';
+                } elseif ($li > $p && $tag != 'ul' && $tag != 'ol') {
+                    $reason = ' too many <li> elements, and parent is not <ul> or <ol>';
+                } elseif ($input > floor($p/3)) {
+                    $reason = ' too many <input> elements';
+                } elseif ($contentLength < 25 && ($img === 0 || $img > 2)) {
+                    $reason = ' content length less than 25 chars and 0 images, or more than 2 images';
+                } elseif ($weight < 25 && $linkDensity > 0.2) {
+                    $reason = ' weight smaller than 25 and link density above 0.2';
+                } elseif ($weight >= 25 && $linkDensity > 0.5) {
+                    $reason = ' weight above 25 but link density greater than 0.5';
+                } elseif (($embedCount == 1 && $contentLength < 75) || $embedCount > 1) {
+                    $reason = ' 1 embed and content length smaller than 75 chars, or more than one embed';
+                }
+            }
+            if ($reason !== null) {
+                $this->dbg($reason);
+                $node->parentNode->removeChild($node);
+            }
+        }
+    }
+    /**
+    * Remove spurious h1 and h2 headers from an element: those with a
+    * negative class weight or a link density above 0.33.
+    *
+    * @param DOMElement $e
+    * @return void
+    */
+    public function cleanHeaders($e) {
+        foreach (array('h1', 'h2') as $headingTag) {
+            $headings = $e->getElementsByTagName($headingTag);
+            for ($pos = $headings->length - 1; $pos >= 0; $pos--) {
+                $heading  = $headings->item($pos);
+                $spurious = $this->getClassWeight($heading) < 0
+                         || $this->getLinkDensity($heading) > 0.33;
+                if ($spurious) {
+                    $heading->parentNode->removeChild($heading);
+                }
+            }
+        }
+    }
+    /**
+    * Tell whether the given flag bit(s) are set in $this->flags.
+    *
+    * @param int $flag
+    * @return boolean
+    */
+    public function flagIsActive($flag) {
+        $masked = $this->flags & $flag;
+        return $masked > 0;
+    }
+    
+    /**
+    * Switch the given flag bit(s) on in $this->flags.
+    *
+    * @param int $flag
+    * @return void
+    */
+    public function addFlag($flag) {
+        $this->flags |= $flag;
+    }
+    
+    /**
+    * Switch the given flag bit(s) off in $this->flags.
+    *
+    * @param int $flag
+    * @return void
+    */
+    public function removeFlag($flag) {
+        $this->flags &= ~$flag;
+    }
+}
+\ No newline at end of file
diff --git a/inc/3rdparty/Session.class.php b/inc/3rdparty/Session.class.php
new file mode 100644
index 00000000..eff924cc
--- /dev/null
+++ b/inc/3rdparty/Session.class.php
@@ -0,0 +1,136 @@
+<?php
+/**
+ * Session management class
+ * http://www.developpez.net/forums/d51943/php/langage/sessions/
+ * http://sebsauvage.net/wiki/doku.php?id=php:session
+ * http://sebsauvage.net/wiki/doku.php?id=php:shaarli
+ *
+ * Features:
+ * - Everything is stored on server-side (we do not trust client-side data,
+ *   such as cookie expiration)
+ * - IP addresses + user agent are checked on each access to prevent session
+ *   cookie hijacking (such as Firesheep)
+ * - Session expires on user inactivity (Session expiration date is
+ *   automatically updated every time the user accesses a page.)
+ * - A unique secret key is generated on server-side for this session
+ *   (and never sent over the wire) which can be used
+ *   to sign forms (HMAC) (See $_SESSION['uid'] )
+ * - Token management to prevent XSRF attacks.
+ *
+ * TODO:
+ * - log login fail
+ * - prevent brute force (ban IP)
+ *
+ * HOWTOUSE:
+ * - Just call Session::init(); to initialize session and
+ *   check if connected with Session::isLogged()
+ */
+class Session
+{
+    // If the user does not access any page within this time,
+    // his/her session is considered expired (in seconds).
+    public static $inactivity_timeout = 3600;
+    // Holds the single instance created by init().
+    private static $_instance;
+    // Constructor: forces cookie-only sessions and starts the PHP session
+    // when none is active yet. Private: use Session::init().
+    private function __construct()
+    {
+        // Use cookies to store session.
+        ini_set('session.use_cookies', 1);
+        // Force cookies for session  (phpsessionID forbidden in URL)
+        ini_set('session.use_only_cookies', 1);
+        if (!session_id()){
+            // Prevent php to use sessionID in URL if cookies are disabled.
+            ini_set('session.use_trans_sid', false);
+            // session_start() does not accept a session name (its only
+            // parameter is an options array), so the name is set with
+            // session_name() before the session is started.
+            session_name('poche');
+            session_start();
+        }
+    }
+    // Initialize the session (idempotent: the instance is created once).
+    public static function init()
+    {
+        if (!isset(self::$_instance)) {
+            self::$_instance = new Session();
+        }
+    }
+    // Returns a SHA-1 fingerprint built from the client's IP address(es),
+    // user agent and accepted languages.
+    // (Used to prevent session cookie hijacking.)
+    // Headers the client did not send contribute an empty string instead of
+    // raising an "undefined index" notice; the resulting hash is the same as
+    // before for every request that does send them.
+    private static function _allInfos()
+    {
+        $infos = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
+        if (isset($_SERVER['HTTP_X_FORWARDED_FOR'])) {
+            $infos.=$_SERVER['HTTP_X_FORWARDED_FOR'];
+        }
+        if (isset($_SERVER['HTTP_CLIENT_IP'])) {
+            $infos.='_'.$_SERVER['HTTP_CLIENT_IP'];
+        }
+        $infos.='_'.(isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '');
+        $infos.='_'.(isset($_SERVER['HTTP_ACCEPT_LANGUAGE']) ? $_SERVER['HTTP_ACCEPT_LANGUAGE'] : '');
+        return sha1($infos);
+    }
+    // Check that user/password is correct and init some SESSION variables.
+    //
+    // $login / $password            expected credentials
+    // $login_test / $password_test  credentials submitted by the client
+    // $pValues                      extra key => value pairs copied into $_SESSION
+    //
+    // Returns true when the credentials match, false otherwise.
+    public static function login($login,$password,$login_test,$password_test,
+                                 $pValues = array())
+    {
+        // NOTE(review): these values are stored before the credentials are
+        // checked, i.e. also when the login fails. Kept as-is for callers.
+        foreach ($pValues as $key => $value) {
+            $_SESSION[$key] = $value;
+        }
+        // Compare as strings with ===. A loose == treats numeric-looking
+        // strings as numbers, so e.g. '0e1' == '0e2' would be accepted.
+        if ((string)$login===(string)$login_test
+            && (string)$password===(string)$password_test){
+            // generate unique random number to sign forms (HMAC)
+            $_SESSION['uid'] = sha1(uniqid('',true).'_'.mt_rand());
+            $_SESSION['info']=self::_allInfos();
+            $_SESSION['username']=$login;
+            // Set session expiration.
+            $_SESSION['expires_on']=time()+self::$inactivity_timeout;
+            return true;
+        }
+        return false;
+    }
+    // Force logout
+    public static function logout()
+    {
+        unset($_SESSION['uid'],$_SESSION['info'],$_SESSION['expires_on'],$_SESSION['tokens'], $_SESSION['login'], $_SESSION['pass']);
+    }
+    // Make sure user is logged in.
+    // Returns false (and logs out) when there is no login, when the client
+    // fingerprint changed, or when the inactivity timeout has elapsed.
+    public static function isLogged()
+    {
+        // A session missing any of the three login keys is treated as not
+        // logged in rather than read with undefined indexes.
+        if (!isset($_SESSION['uid'], $_SESSION['info'], $_SESSION['expires_on'])
+            || $_SESSION['info']!==self::_allInfos()
+            || time()>=$_SESSION['expires_on']){
+            self::logout();
+            return false;
+        }
+        // User accessed a page : Update his/her session expiration date.
+        $_SESSION['expires_on']=time()+self::$inactivity_timeout;
+        return true;
+    }
+    // Returns a new token and remembers it in $_SESSION['tokens'].
+    public static function getToken()
+    {
+        if (!isset($_SESSION['tokens'])){
+            $_SESSION['tokens']=array();
+        }
+        // We generate a random string and store it on the server side.
+        $rnd = sha1(uniqid('',true).'_'.mt_rand());
+        $_SESSION['tokens'][$rnd]=1;
+        return $rnd;
+    }
+    // Tells if a token is ok. Using this function will destroy the token.
+    // return true if token is ok.
+    public static function isToken($token)
+    {
+        if (isset($_SESSION['tokens'][$token]))
+        {
+            unset($_SESSION['tokens'][$token]); // Token is used: destroy it.
+            return true; // Token is ok.
+        }
+        return false; // Wrong token, or already used.
+    }
+}
+\ No newline at end of file
diff --git a/inc/3rdparty/simple_html_dom.php b/inc/3rdparty/simple_html_dom.php
new file mode 100644
index 00000000..43b94e57
--- /dev/null
+++ b/inc/3rdparty/simple_html_dom.php
@@ -0,0 +1,1722 @@
+<?php
+/**
+ * Website: http://sourceforge.net/projects/simplehtmldom/
+ * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
+ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
+ * Contributions by:
+ *       Yousuke Kumakura (Attribute filters)
+ *       Vadim Voituk (Negative indexes supports of "find" method)
+ *       Antcs (Constructor with automatically load contents either text or file/url)
+ *
+ * all affected sections have comments starting with "PaperG"
+ *
+ * Paperg - Added case insensitive testing of the value of the selector.
+ * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
+ *  This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls, so it will not reflect the REAL position of the tag in the source;
+ *  it will almost always be smaller by some amount.
+ *  We use this to determine how far into the file the tag in question is.  This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from,
+ *  but for most purposes, it's a really good estimation.
+ * Paperg - Added the forceTagsClosed to the dom constructor.  Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
+ * Allow the user to tell us how much they trust the html.
+ * Paperg add the text and plaintext to the selectors for the find syntax.  plaintext implies text in the innertext of a node.  text implies that the tag is a text node.
+ * This allows for us to find tags based on the text they contain.
+ * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
+ * Paperg: added parse_charset so that we know about the character set of the source document.
+ *  NOTE:  If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
+ *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
+ *
+ * Found infinite loop in the case of broken html in restore_noise.  Rewrote to protect from that.
+ * PaperG (John Schlick) Added get_display_size for "IMG" tags.
+ *
+ * Licensed under The MIT License
+ * Redistributions of files must retain the above copyright notice.
+ *
+ * @author S.C. Chen <me578022@gmail.com>
+ * @author John Schlick
+ * @author Rus Carroll
+ * @version 1.5 ($Rev: 202 $)
+ * @package PlaceLocalInclude
+ * @subpackage simple_html_dom
+ */
+/**
+ * All of the Defines for the classes below.
+ * @author S.C. Chen <me578022@gmail.com>
+ */
+// Node types (the default value of simple_html_dom_node::$nodetype is HDOM_TYPE_TEXT).
+define('HDOM_TYPE_ELEMENT', 1);
+define('HDOM_TYPE_COMMENT', 2);
+define('HDOM_TYPE_TEXT',        3);
+define('HDOM_TYPE_ENDTAG',  4);
+define('HDOM_TYPE_ROOT',        5);
+define('HDOM_TYPE_UNKNOWN', 6);
+// Attribute quoting styles, read back by makeup() when a tag is re-rendered.
+// Note that HDOM_QUOTE_NO is 3, not 2.
+define('HDOM_QUOTE_DOUBLE', 0);
+define('HDOM_QUOTE_SINGLE', 1);
+define('HDOM_QUOTE_NO',  3);
+// Keys of the per-node "info" array (simple_html_dom_node::$_).
+define('HDOM_INFO_BEGIN',   0);
+define('HDOM_INFO_END',  1);
+define('HDOM_INFO_QUOTE',   2);
+define('HDOM_INFO_SPACE',   3);
+define('HDOM_INFO_TEXT',        4);
+define('HDOM_INFO_INNER',   5);
+define('HDOM_INFO_OUTER',   6);
+define('HDOM_INFO_ENDSPACE',7);
+// Default argument values for file_get_html() and str_get_html().
+define('DEFAULT_TARGET_CHARSET', 'UTF-8');
+define('DEFAULT_BR_TEXT', "\r\n");
+define('DEFAULT_SPAN_TEXT', " ");
+// file_get_html() and str_get_html() return false for content whose strlen() exceeds this.
+define('MAX_FILE_SIZE', 600000);
+// helper functions
+// -----------------------------------------------------------------------------
+// get html dom from file
+// $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
+function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+{
+        // Fetches $url with file_get_contents() and parses it into a simple_html_dom.
+        // Returns the dom object, or false when nothing could be read, the content
+        // is empty, or it is longer than MAX_FILE_SIZE.
+        //
+        // $url, $use_include_path, $context  passed straight to file_get_contents()
+        // $offset                            start offset; negative values mean "from the beginning"
+        // $maxLen                            maximum number of bytes to read; negative means "no limit"
+        // remaining parameters               forwarded to the simple_html_dom constructor / load()
+
+        // We DO force the tags to be terminated.
+        $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
+        // The -1 default dates from when file_get_contents() ignored negative offsets.
+        // Since PHP 7.1 a negative offset seeks from the END of the stream (and fails on
+        // remote files), so negative values are normalised to 0 to keep reading from the start.
+        if ($offset < 0)
+        {
+                $offset = 0;
+        }
+        // file_get_contents() rejects a negative length, so $maxLen is only passed
+        // when the caller actually asked for a limit.
+        if ($maxLen >= 0)
+        {
+                $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
+        }
+        else
+        {
+                $contents = file_get_contents($url, $use_include_path, $context, $offset);
+        }
+        // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
+        //$contents = retrieve_url_contents($url);
+        if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
+        {
+                // Release the unused dom, as str_get_html() does on failure.
+                $dom->clear();
+                return false;
+        }
+        // The second parameter can force the selectors to all be lowercase.
+        $dom->load($contents, $lowercase, $stripRN);
+        return $dom;
+}
+// get html dom from string
+function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+{
+        // Parses the HTML in $str and returns the resulting simple_html_dom,
+        // or false when $str is empty or longer than MAX_FILE_SIZE.
+        $document = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
+        if (!empty($str) && strlen($str) <= MAX_FILE_SIZE)
+        {
+                $document->load($str, $lowercase, $stripRN);
+                return $document;
+        }
+        // Unusable input: release the dom before reporting failure.
+        $document->clear();
+        return false;
+}
+// dump html dom tree
+function dump_html_tree($node, $show_attr=true, $deep=0)
+{
+        // Prints the tree below $node by delegating to $node->dump().
+        // $show_attr  whether attributes are printed next to each tag
+        // $deep       indentation level of the first line
+        // Both options are forwarded; previously $node itself was passed as
+        // $show_attr and the two options were ignored.
+        $node->dump($show_attr, $deep);
+}
+/**
+ * simple html dom node
+ * PaperG - added ability for "find" routine to lowercase the value of the selector.
+ * PaperG - added $tag_start to track the start position of the tag in the total byte index
+ *
+ * @package PlaceLocalInclude
+ */
+class simple_html_dom_node
+{
+        // Node type, one of the HDOM_TYPE_* constants; defaults to a text node.
+        public $nodetype = HDOM_TYPE_TEXT;
+        // Tag name; 'text' by default.
+        public $tag = 'text';
+        // Attributes, keyed by attribute name.
+        public $attr = array();
+        // Child nodes as returned by children(), first_child() and last_child().
+        public $children = array();
+        // Nodes iterated when rendering this node (see innertext() and text()).
+        public $nodes = array();
+        // Parent node, or null when there is none.
+        public $parent = null;
+        // The "info" array - see HDOM_INFO_... for what each element contains.
+        public $_ = array();
+        // Start position of the tag (see the file header for its accuracy caveats).
+        public $tag_start = 0;
+        // The dom object this node was created for; set by the constructor, reset by clear().
+        private $dom = null;
+        function __construct($dom)
+        {
+                // Register the new node in the dom's node list and remember the dom.
+                $dom->nodes[] = $this;
+                $this->dom = $dom;
+        }
+        function __destruct()
+        {
+                // Drop the references this node holds (see clear()).
+                $this->clear();
+        }
+        function __toString()
+        {
+                // Using the node as a string yields its outer text.
+                return $this->outertext();
+        }
+        // clean up memory due to php5 circular references memory leak...
+        function clear()
+        {
+                // Null out every reference to other nodes and to the dom in one go.
+                $this->dom = $this->nodes = $this->parent = $this->children = null;
+        }
+        // dump node's tree
+        function dump($show_attr=true, $deep=0)
+        {
+                // One output line per node: indentation, tag name and, when
+                // requested, the attribute list.
+                $line = str_repeat('    ', $deep).$this->tag;
+                if ($show_attr && count($this->attr)>0)
+                {
+                        $line .= '(';
+                        foreach ($this->attr as $name=>$unused)
+                        {
+                                $line .= "[$name]=>\"".$this->$name.'", ';
+                        }
+                        $line .= ')';
+                }
+                echo $line."\n";
+                // Nothing below this node: done.
+                if (!$this->nodes)
+                {
+                        return;
+                }
+                // Print every sub-node one indentation level deeper.
+                foreach ($this->nodes as $child)
+                {
+                        $child->dump($show_attr, $deep+1);
+                }
+        }
+        // Debugging function to dump a single dom node with a bunch of information about it.
+        function dump_node($echo=true)
+        {
+                // Builds a one-line description of this node: tag, attributes, the
+                // "info" array, text, inner info, child/node counts and tag_start.
+                // $echo  true: print the line and return nothing; false: return it.
+                $string = $this->tag;
+                if (count($this->attr)>0)
+                {
+                        $string .= '(';
+                        foreach ($this->attr as $k=>$v)
+                        {
+                                $string .= "[$k]=>\"".$this->$k.'", ';
+                        }
+                        $string .= ')';
+                }
+                if (count($this->_)>0)
+                {
+                        $string .= ' $_ (';
+                        foreach ($this->_ as $k=>$v)
+                        {
+                                // Entries of the info array may themselves be arrays; expand one level.
+                                if (is_array($v))
+                                {
+                                        $string .= "[$k]=>(";
+                                        foreach ($v as $k2=>$v2)
+                                        {
+                                                $string .= "[$k2]=>\"".$v2.'", ';
+                                        }
+                                        $string .= ")";
+                                } else {
+                                        $string .= "[$k]=>\"".$v.'", ';
+                                }
+                        }
+                        $string .= ")";
+                }
+                if (isset($this->text))
+                {
+                        $string .= " text: (" . $this->text . ")";
+                }
+                $string .= " HDOM_INNER_INFO: '";
+                // Read the inner info from this node; the previous code looked at an
+                // undefined $node variable and therefore always reported NULL.
+                if (isset($this->_[HDOM_INFO_INNER]))
+                {
+                        $string .= $this->_[HDOM_INFO_INNER] . "'";
+                }
+                else
+                {
+                        $string .= ' NULL ';
+                }
+                $string .= " children: " . count($this->children);
+                $string .= " nodes: " . count($this->nodes);
+                $string .= " tag_start: " . $this->tag_start;
+                $string .= "\n";
+                if ($echo)
+                {
+                        echo $string;
+                        return;
+                }
+                else
+                {
+                        return $string;
+                }
+        }
+        // returns the parent of node
+        // If a node is passed in, it will reset the parent of the current node to that one.
+        function parent($parent=null)
+        {
+                // No argument: plain getter.
+                if ($parent === null)
+                {
+                        return $this->parent;
+                }
+                // Original author's caveat: this probably does not work properly,
+                // because the node is not first removed from the nodes/children
+                // lists of its current parent.
+                $this->parent = $parent;
+                $this->parent->nodes[] = $this;
+                $this->parent->children[] = $this;
+                return $this->parent;
+        }
+        // verify that node has children
+        function has_child()
+        {
+                // True as soon as the children list is non-empty.
+                if (empty($this->children))
+                {
+                        return false;
+                }
+                return true;
+        }
+        // returns children of node
+        function children($idx=-1)
+        {
+                // A specific index was requested: return that child, or null if absent.
+                if ($idx!==-1)
+                {
+                        return isset($this->children[$idx]) ? $this->children[$idx] : null;
+                }
+                // Default (-1): return the whole list.
+                return $this->children;
+        }
+        // returns the first child of node
+        function first_child()
+        {
+                // No children: nothing to return.
+                if (count($this->children)<=0)
+                {
+                        return null;
+                }
+                return $this->children[0];
+        }
+        // returns the last child of node
+        function last_child()
+        {
+                $total = count($this->children);
+                // No children: nothing to return.
+                if ($total<=0)
+                {
+                        return null;
+                }
+                return $this->children[$total-1];
+        }
+        // returns the next sibling of node
+        function next_sibling()
+        {
+                if ($this->parent===null)
+                {
+                        return null;
+                }
+                // Find this node's position in the parent's children list.
+                $total = count($this->parent->children);
+                for ($pos=0; $pos<$total; ++$pos)
+                {
+                        if ($this===$this->parent->children[$pos])
+                        {
+                                break;
+                        }
+                }
+                // The sibling is the entry right after it, if there is one.
+                $next = $pos+1;
+                return ($next<$total) ? $this->parent->children[$next] : null;
+        }
+        // returns the previous sibling of node
+        function prev_sibling()
+        {
+                // Returns the child that precedes this node in its parent's children
+                // list, or null when there is no parent, no preceding child, or this
+                // node is not in that list at all.
+                if ($this->parent===null) return null;
+                $idx = 0;
+                $count = count($this->parent->children);
+                while ($idx<$count && $this!==$this->parent->children[$idx])
+                        ++$idx;
+                // Not found: without this check the search index ends at $count and
+                // the parent's LAST child would be returned as the "previous sibling".
+                if ($idx>=$count) return null;
+                if (--$idx<0) return null;
+                return $this->parent->children[$idx];
+        }
+        // function to locate a specific ancestor tag in the path to the root.
+        function find_ancestor_tag($tag)
+        {
+                global $debug_object;
+                if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+                // Walk from this node (included) up the parent chain and return the
+                // first node whose tag equals $tag; null when the chain is exhausted.
+                for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent)
+                {
+                        if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $candidate->tag); }
+                        if ($candidate->tag == $tag)
+                        {
+                                return $candidate;
+                        }
+                }
+                return null;
+        }
+        // get dom node's inner html
+        function innertext()
+        {
+                // An explicitly stored inner text wins.
+                if (isset($this->_[HDOM_INFO_INNER]))
+                {
+                        return $this->_[HDOM_INFO_INNER];
+                }
+                // Nodes carrying raw text return it with the noise restored.
+                if (isset($this->_[HDOM_INFO_TEXT]))
+                {
+                        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+                }
+                // Otherwise the inner html is the concatenated outer text of all sub-nodes.
+                $pieces = array();
+                foreach ($this->nodes as $child)
+                {
+                        $pieces[] = $child->outertext();
+                }
+                return implode('', $pieces);
+        }
+        // get dom node's outer text (with tag)
+        function outertext()
+        {
+                // Renders this node including its own tag. The order of the steps matters:
+                // debug log, root shortcut, dom callback, stored outer/raw text, and only
+                // then begin tag + content + end tag.
+                global $debug_object;
+                if (is_object($debug_object))
+                {
+                        $text = '';
+                        if ($this->tag == 'text')
+                        {
+                                if (!empty($this->text))
+                                {
+                                        $text = " with text: " . $this->text;
+                                }
+                        }
+                        $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
+                }
+                // The root node has no tag of its own: its outer text is its inner text.
+                if ($this->tag==='root') return $this->innertext();
+                // trigger callback
+                // (invoked with this node before anything is rendered)
+                if ($this->dom && $this->dom->callback!==null)
+                {
+                        call_user_func_array($this->dom->callback, array($this));
+                }
+                // A stored outer text, then stored raw text, take precedence over rendering.
+                if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
+                if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+                // render begin tag
+                // (taken from the dom node stored at index HDOM_INFO_BEGIN)
+                if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
+                {
+                        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
+                } else {
+                        $ret = "";
+                }
+                // render inner text
+                if (isset($this->_[HDOM_INFO_INNER]))
+                {
+                        // If it's a br tag...  don't return the HDOM_INNER_INFO that we may or may not have added.
+                        if ($this->tag != "br")
+                        {
+                                $ret .= $this->_[HDOM_INFO_INNER];
+                        }
+                } else {
+                        if ($this->nodes)
+                        {
+                                foreach ($this->nodes as $n)
+                                {
+                                        $ret .= $this->convert_text($n->outertext());
+                                }
+                        }
+                }
+                // render end tag
+                // (only when HDOM_INFO_END is set and non-zero)
+                if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
+                        $ret .= '</'.$this->tag.'>';
+                return $ret;
+        }
+        // get dom node's plain text
+        function text()
+        {
+                // An explicitly stored inner text wins.
+                if (isset($this->_[HDOM_INFO_INNER]))
+                {
+                        return $this->_[HDOM_INFO_INNER];
+                }
+                // Text nodes return their raw text; comments and unknown nodes have none.
+                $type = $this->nodetype;
+                if ($type == HDOM_TYPE_TEXT)
+                {
+                        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+                }
+                if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN)
+                {
+                        return '';
+                }
+                // Script and style elements contribute no plain text.
+                if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0)
+                {
+                        return '';
+                }
+                $plain = '';
+                // In rare cases (observed by the original authors for some span and p
+                // elements) $this->nodes is NULL; such a node yields an empty string.
+                if (is_null($this->nodes))
+                {
+                        return $plain;
+                }
+                foreach ($this->nodes as $child)
+                {
+                        $plain .= $this->convert_text($child->text());
+                }
+                // Spans get the dom's default span text appended so that consecutive
+                // spans do not run into each other in the plain text.
+                if ($this->tag == "span")
+                {
+                        $plain .= $this->dom->default_span_text;
+                }
+                return $plain;
+        }
+        function xmltext()
+        {
+                // Inner text with the CDATA markers stripped: the opening marker is
+                // matched case-insensitively, the closing one exactly.
+                $withoutOpening = str_ireplace('<![CDATA[', '', $this->innertext());
+                return str_replace(']]>', '', $withoutOpening);
+        }
+        // build node's text with tag
+        function makeup()
+        {
+                // Builds the opening tag of this node ("<tag attr=... >"), or returns the
+                // raw text for nodes that carry HDOM_INFO_TEXT.
+                // text, comment, unknown
+                if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
+                $ret = '<'.$this->tag;
+                // $i is the position of the attribute; it indexes the per-attribute
+                // HDOM_INFO_SPACE and HDOM_INFO_QUOTE entries, so it is advanced even
+                // for attributes that are skipped.
+                $i = -1;
+                foreach ($this->attr as $key=>$val)
+                {
+                        ++$i;
+                        // skip removed attribute
+                        if ($val===null || $val===false)
+                                continue;
+                        // Whitespace recorded in front of the attribute.
+                        $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
+                        //no value attr: nowrap, checked selected...
+                        if ($val===true)
+                                $ret .= $key;
+                        else {
+                                // Re-use the quoting style recorded for this attribute.
+                                switch ($this->_[HDOM_INFO_QUOTE][$i])
+                                {
+                                        case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
+                                        case HDOM_QUOTE_SINGLE: $quote = '\''; break;
+                                        default: $quote = '';
+                                }
+                                // key, recorded space, '=', recorded space, quoted value
+                                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
+                        }
+                }
+                $ret = $this->dom->restore_noise($ret);
+                return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
+        }
+        // find elements by css selector
+        //PaperG - added ability for find to lowercase the value of the selector.
+        function find($selector, $idx=null, $lowercase=false)
+        {
+                // $selector   selector string, parsed by parse_selector()
+                // $idx        null: return every match as an array; otherwise return the
+                //             match at that position (negative counts from the end) or null
+                // $lowercase  passed on to seek() for case-insensitive value tests
+                $selectors = $this->parse_selector($selector);
+                if (($count=count($selectors))===0) return array();
+                // Indexes (into $this->dom->nodes) of every node matched so far.
+                $found_keys = array();
+                // find each selector
+                for ($c=0; $c<$count; ++$c)
+                {
+                        // The change on the below line was documented on the sourceforge code tracker id 2788009
+                        // used to be: if (($levle=count($selectors[0]))===0) return array();
+                        if (($levle=count($selectors[$c]))===0) return array();
+                        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
+                        // The search starts from this node.
+                        $head = array($this->_[HDOM_INFO_BEGIN]=>1);
+                        // handle descendant selectors, no recursive!
+                        // (each level is matched against the nodes kept by the previous level)
+                        for ($l=0; $l<$levle; ++$l)
+                        {
+                                $ret = array();
+                                foreach ($head as $k=>$v)
+                                {
+                                        $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
+                                        //PaperG - Pass this optional parameter on to the seek function.
+                                        $n->seek($selectors[$c][$l], $ret, $lowercase);
+                                }
+                                $head = $ret;
+                        }
+                        // Merge the matches of this selector, ignoring duplicates.
+                        foreach ($head as $k=>$v)
+                        {
+                                if (!isset($found_keys[$k]))
+                                        $found_keys[$k] = 1;
+                        }
+                }
+                // sort keys
+                ksort($found_keys);
+                $found = array();
+                foreach ($found_keys as $k=>$v)
+                        $found[] = $this->dom->nodes[$k];
+                // return nth-element or array
+                if (is_null($idx)) return $found;
+                else if ($idx<0) $idx = count($found) + $idx;
+                return (isset($found[$idx])) ? $found[$idx] : null;
+        }
+        // seek for given conditions
+        // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
+        protected function seek($selector, &$ret, $lowercase=false)
+        {
+                // $selector   one parsed selector: array(tag, key, val, exp, no_key)
+                // $ret        output, by reference: matching node indexes are added as keys
+                // $lowercase  when true, values are lowercased before being compared
+                global $debug_object;
+                if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+                list($tag, $key, $val, $exp, $no_key) = $selector;
+                // xpath index
+                // (a numeric key selects the key-th child whose tag matches)
+                if ($tag && $key && is_numeric($key))
+                {
+                        $count = 0;
+                        foreach ($this->children as $c)
+                        {
+                                if ($tag==='*' || $tag===$c->tag) {
+                                        if (++$count==$key) {
+                                                $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
+                                                return;
+                                        }
+                                }
+                        }
+                        return;
+                }
+                // Upper bound of the scan. When this node has no usable HDOM_INFO_END,
+                // it is derived from the closest ancestor that has one.
+                $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
+                if ($end==0) {
+                        $parent = $this->parent;
+                        while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
+                                $end -= 1;
+                                $parent = $parent->parent;
+                        }
+                        // NOTE(review): if the loop above ends with $parent === null, this reads a property of null - confirm whether that can happen.
+                        $end += $parent->_[HDOM_INFO_END];
+                }
+                // Test every dom node between this node's begin index and $end.
+                for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
+                        $node = $this->dom->nodes[$i];
+                        $pass = true;
+                        // '*' without a key only keeps direct children of this node.
+                        if ($tag==='*' && !$key) {
+                                if (in_array($node, $this->children, true))
+                                        $ret[$i] = 1;
+                                continue;
+                        }
+                        // compare tag
+                        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
+                        // compare key
+                        // ($no_key inverts the test: the attribute must be absent)
+                        if ($pass && $key) {
+                                if ($no_key) {
+                                        if (isset($node->attr[$key])) $pass=false;
+                                } else {
+                                        if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
+                                }
+                        }
+                        // compare value
+                        if ($pass && $key && $val  && $val!=='*') {
+                                // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
+                                if ($key == "plaintext") {
+                                        // $node->plaintext actually returns $node->text();
+                                        $nodeKeyValue = $node->text();
+                                } else {
+                                        // this is a normal search, we want the value of that attribute of the tag.
+                                        $nodeKeyValue = $node->attr[$key];
+                                }
+                                if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
+                                //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
+                                if ($lowercase) {
+                                        $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
+                                } else {
+                                        $check = $this->match($exp, $val, $nodeKeyValue);
+                                }
+                                if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}
+                                // handle multiple class
+                                // (the whole attribute did not match: try each space-separated class name)
+                                if (!$check && strcasecmp($key, 'class')===0) {
+                                        foreach (explode(' ',$node->attr[$key]) as $k) {
+                                                // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
+                                                if (!empty($k)) {
+                                                        if ($lowercase) {
+                                                                $check = $this->match($exp, strtolower($val), strtolower($k));
+                                                        } else {
+                                                                $check = $this->match($exp, $val, $k);
+                                                        }
+                                                        if ($check) break;
+                                                }
+                                        }
+                                }
+                                if (!$check) $pass = false;
+                        }
+                        if ($pass) $ret[$i] = 1;
+                        unset($node);
+                }
+                // It's passed by reference so this is actually what this function returns.
+                if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}
+        }
+        /**
+         * Test a node's attribute value against the value given in a selector.
+         *
+         * @param string $exp     Selector operator: '=', '!=', '^=', '$=' or '*='.
+         * @param string $pattern Value taken from the selector.
+         * @param string $value   Value found on the node.
+         * @return bool|int The strict comparison result for '=' and '!=', the preg_match()
+         *                  result for '^=', '$=' and '*=', and false for any other operator.
+         */
+        protected function match($exp, $pattern, $value) {
+                global $debug_object;
+                if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}
+                switch ($exp) {
+                        case '=':
+                                return ($value===$pattern);
+                        case '!=':
+                                return ($value!==$pattern);
+                        case '^=':
+                                // Prefix match; the selector value is treated as literal text.
+                                return preg_match("/^".preg_quote($pattern,'/')."/", $value);
+                        case '$=':
+                                // Suffix match; the selector value is treated as literal text.
+                                return preg_match("/".preg_quote($pattern,'/')."$/", $value);
+                        case '*=':
+                                // A value that starts with '/' is used as a complete regular expression.
+                                // isset() keeps an empty or null pattern from raising an offset notice.
+                                if (isset($pattern[0]) && $pattern[0]=='/') {
+                                        return preg_match($pattern, $value);
+                                }
+                                // Otherwise the value is embedded, unquoted, in a case-insensitive expression.
+                                return preg_match("/".$pattern."/i", $value);
+                }
+                return false;
+        }
+        /**
+         * Split a CSS-like selector string into its parts.
+         *
+         * @param string $selector_string Selector text; ',' separates alternative selectors.
+         * @return array One entry per comma separated selector. Each entry is a list of
+         *               array($tag, $key, $val, $exp, $no_key) tuples, one per matched step.
+         */
+        protected function parse_selector($selector_string) {
+                global $debug_object;
+                if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}
+                // Pattern of CSS selectors, modified from mootools.
+                // Paperg: the colon is allowed in tag and attribute names, so that <tag attr:ibute="something"> is found the way google writes it.
+                // Note: to read such an attribute you MUST use getAttribute(), since $dom->x:y fails the php syntax check.
+                // Notice the \[ that starts the attribute part and the @? that follows it: an attribute specifier
+                // may begin with an @ sign that is NOT captured by the expression.
+                // Further study is required to determine if this should be documented or removed.
+                // The hyphens inside the character classes are escaped: PCRE2 (PHP 7.3 and later) rejects
+                // "[\w-:]" with "invalid range in character class", while older PCRE read the hyphen literally.
+                $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
+                preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
+                if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}
+                $selectors = array();
+                $result = array();
+                foreach ($matches as $m) {
+                        $m[0] = trim($m[0]);
+                        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
+                        // for browser generated xpath
+                        if ($m[1]==='tbody') continue;
+                        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
+                        // #id and .class are shorthands for an attribute test.
+                        if (!empty($m[2])) {$key='id'; $val=$m[2];}
+                        if (!empty($m[3])) {$key='class'; $val=$m[3];}
+                        // [key], [key<op>value]
+                        if (!empty($m[4])) {$key=$m[4];}
+                        if (!empty($m[5])) {$exp=$m[5];}
+                        if (!empty($m[6])) {$val=$m[6];}
+                        // convert to lowercase
+                        if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
+                        // elements that do NOT have the specified attribute
+                        if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
+                        $result[] = array($tag, $key, $val, $exp, $no_key);
+                        // A comma ends the current selector and starts the next one.
+                        if (trim($m[7])===',') {
+                                $selectors[] = $result;
+                                $result = array();
+                        }
+                }
+                if (count($result)>0)
+                        $selectors[] = $result;
+                return $selectors;
+        }
+        /**
+         * Magic getter for attributes and the virtual text properties.
+         *
+         * @param string $name Attribute name, or one of outertext, innertext, plaintext, xmltext.
+         * @return mixed The charset-converted attribute value when the attribute is set, the
+         *               requested text form for a virtual property, otherwise a boolean telling
+         *               whether the attribute key exists at all.
+         */
+        function __get($name) {
+                // A set attribute takes precedence over the virtual properties.
+                if (isset($this->attr[$name])) {
+                        $raw = $this->attr[$name];
+                        return $this->convert_text($raw);
+                }
+                if ($name == 'outertext') {
+                        return $this->outertext();
+                }
+                if ($name == 'innertext') {
+                        return $this->innertext();
+                }
+                if ($name == 'plaintext') {
+                        return $this->text();
+                }
+                if ($name == 'xmltext') {
+                        return $this->xmltext();
+                }
+                // Not a virtual property: report whether the key is present (its value may be null).
+                return array_key_exists($name, $this->attr);
+        }
+        /**
+         * Magic setter for the virtual text properties and for attributes.
+         *
+         * @param string $name  'outertext', 'innertext' or an attribute name.
+         * @param mixed  $value New value.
+         * @return mixed The assigned value for the two virtual properties, nothing for attributes.
+         */
+        function __set($name, $value) {
+                if ($name == 'outertext') {
+                        return $this->_[HDOM_INFO_OUTER] = $value;
+                }
+                if ($name == 'innertext') {
+                        // A node that already carries a text entry keeps using that slot.
+                        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
+                        return $this->_[$slot] = $value;
+                }
+                // An attribute that is not set yet also gets spacing and quoting entries.
+                $isNewAttribute = !isset($this->attr[$name]);
+                if ($isNewAttribute) {
+                        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
+                        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
+                }
+                $this->attr[$name] = $value;
+        }
+        /**
+         * Magic isset(): the three text properties always exist, attributes exist when their key is present.
+         *
+         * @param string $name Property or attribute name.
+         * @return bool
+         */
+        function __isset($name) {
+                if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
+                        return true;
+                }
+                // Attributes without a value (nowrap, checked, selected...) still count as present.
+                if (array_key_exists($name, $this->attr)) {
+                        return true;
+                }
+                return isset($this->attr[$name]);
+        }
+        /**
+         * Magic unset(): removes an attribute, but only when it is currently set to a non-null value.
+         *
+         * @param string $name Attribute name.
+         */
+        function __unset($name) {
+                if (!isset($this->attr[$name])) {
+                        return;
+                }
+                unset($this->attr[$name]);
+        }
+        /**
+         * PaperG - Convert text from the document charset to the target charset when the two differ.
+         *
+         * Both charsets are read from the owning dom object. A UTF-8 byte order mark at the start
+         * or the end of the result is removed when the target charset is UTF-8.
+         *
+         * @param string $text Text to convert.
+         * @return string|false The converted text; whatever iconv() returns when a conversion runs.
+         */
+        function convert_text($text)
+        {
+                global $debug_object;
+                if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}
+                $from = "";
+                $to = "";
+                if ($this->dom)
+                {
+                        $from = strtoupper($this->dom->_charset);
+                        $to = strtoupper($this->dom->_target_charset);
+                }
+                if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $from . " target charaset: " . $to);}
+                $result = $text;
+                $charsetsDiffer = !empty($from) && !empty($to) && (strcasecmp($from, $to) != 0);
+                if ($charsetsDiffer)
+                {
+                        // The reported encoding may be wrong: text that already is UTF-8 is left alone.
+                        $alreadyUtf8 = (strcasecmp($to, 'UTF-8') == 0) && ($this->is_utf8($text));
+                        if (!$alreadyUtf8)
+                        {
+                                $result = iconv($from, $to, $text);
+                        }
+                }
+                // Make sure no BOM survives in any utf-8 text we output.
+                if ($to === 'UTF-8')
+                {
+                        $bom = "\xef\xbb\xbf";
+                        if (substr($result, 0, 3) === $bom)
+                        {
+                                $result = substr($result, 3);
+                        }
+                        if (substr($result, -3) === $bom)
+                        {
+                                $result = substr($result, 0, -3);
+                        }
+                }
+                return $result;
+        }
+        /**
+        * Returns true if $str looks like UTF-8 and false otherwise.
+        *
+        * Every byte with the high bit set must be a lead byte followed by the right number of
+        * continuation bytes (0x80-0xBF). The check is structural only: lead bytes for 5 and 6
+        * byte sequences are accepted and overlong encodings are not rejected.
+        *
+        * @param mixed $str String to be tested
+        * @return boolean
+        */
+        static function is_utf8($str)
+        {
+                $c=0; $b=0;
+                $bits=0;
+                $len=strlen($str);
+                for($i=0; $i<$len; $i++)
+                {
+                        $c=ord($str[$i]);
+                        // 0x80 (128) is the first non-ASCII byte, so the comparison has to include it;
+                        // a lone 0x80 (the CP1252 euro sign) is not valid UTF-8.
+                        if($c >= 128)
+                        {
+                                // Work out the sequence length announced by the lead byte.
+                                if(($c >= 254)) return false;
+                                elseif($c >= 252) $bits=6;
+                                elseif($c >= 248) $bits=5;
+                                elseif($c >= 240) $bits=4;
+                                elseif($c >= 224) $bits=3;
+                                elseif($c >= 192) $bits=2;
+                                else return false; // 0x80-0xBF: continuation byte without a lead byte
+                                // The sequence must not run past the end of the string.
+                                if(($i+$bits) > $len) return false;
+                                while($bits > 1)
+                                {
+                                        $i++;
+                                        $b=ord($str[$i]);
+                                        if($b < 128 || $b > 191) return false;
+                                        $bits--;
+                                }
+                        }
+                }
+                return true;
+        }
+        /*
+        function is_utf8($string)
+        {
+                //this is buggy
+                return (utf8_encode(utf8_decode($string)) == $string);
+        }
+        */
+        /**
+         * Function to try a few tricks to determine the displayed size of an img on the page.
+         * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
+         *
+         * The width/height attributes of the tag are used first; a dimension without such an
+         * attribute falls back to an integer pixel value ("NNpx") from the inline style attribute.
+         *
+         * @author John Schlick
+         * @version April 19 2012
+         * @return array|false an array containing the 'height' and 'width' of the image on the page (each -1 if we can't figure it out), or false when this node is not an img.
+         */
+        function get_display_size()
+        {
+                if ($this->tag !== 'img')
+                {
+                        return false;
+                }
+                $size = array('width' => -1, 'height' => -1);
+                $dimensions = array('width', 'height');
+                // See if there is a height or width attribute in the tag itself.
+                foreach ($dimensions as $dimension) {
+                        if (isset($this->attr[$dimension]))
+                        {
+                                $size[$dimension] = $this->attr[$dimension];
+                        }
+                }
+                // Now look for an inline style.
+                if (isset($this->attr['style']))
+                {
+                        // Thanks to user gnarf from stackoverflow for this regular expression.
+                        $attributes = array();
+                        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
+                        foreach ($matches as $match) {
+                                // [^;]+ is greedy, so the captured value keeps any whitespace in front of the ';'.
+                                // Trim it, otherwise "100px ;" would fail the px test below.
+                                $attributes[$match[1]] = trim($match[2]);
+                        }
+                        // The style is only consulted for a dimension the tag attributes did not provide.
+                        foreach ($dimensions as $dimension) {
+                                if (!isset($attributes[$dimension]) || $size[$dimension] != -1)
+                                {
+                                        continue;
+                                }
+                                // check that the last two characters are px (pixels)
+                                if (strtolower(substr($attributes[$dimension], -2)) != 'px')
+                                {
+                                        continue;
+                                }
+                                $proposed = substr($attributes[$dimension], 0, -2);
+                                // Now make sure that it's an integer and not something stupid.
+                                // filter_var() returns the integer itself, so compare against false to keep "0".
+                                if (filter_var($proposed, FILTER_VALIDATE_INT) !== false)
+                                {
+                                        $size[$dimension] = $proposed;
+                                }
+                        }
+                }
+                // Future enhancement:
+                // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
+                // Far future enhancement
+                // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
+                // Note that in this case, the class or id will have the img subselector for it to apply to the image.
+                // ridiculously far future development
+                // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
+                $result = array('height' => $size['height'],
+                                                'width' => $size['width']);
+                return $result;
+        }
+        // camel naming conventions: DOM-style aliases that forward to the methods of this class.
+        // Attribute access goes through the magic __get/__set/__isset handlers.
+        function getAllAttributes() {return $this->attr;}
+        function getAttribute($name) {return $this->__get($name);}
+        function setAttribute($name, $value) {$this->__set($name, $value);}
+        function hasAttribute($name) {return $this->__isset($name);}
+        // Removal is done by setting the attribute to null through __set; the key itself is not unset here.
+        function removeAttribute($name) {$this->__set($name, null);}
+        // Lookups are find() calls with a '#id' or tag selector; the singular forms ask for index 0.
+        function getElementById($id) {return $this->find("#$id", 0);}
+        function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
+        function getElementByTagName($name) {return $this->find($name, 0);}
+        function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
+        // Tree navigation aliases.
+        function parentNode() {return $this->parent();}
+        function childNodes($idx=-1) {return $this->children($idx);}
+        function firstChild() {return $this->first_child();}
+        function lastChild() {return $this->last_child();}
+        function nextSibling() {return $this->next_sibling();}
+        function previousSibling() {return $this->prev_sibling();}
+        function hasChildNodes() {return $this->has_child();}
+        function nodeName() {return $this->tag;}
+        // Passes this node to $node->parent() and returns $node.
+        function appendChild($node) {$node->parent($this); return $node;}
+}
+/**
+ * simple html dom parser
+ * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
+ * Paperg - change $size from protected to public so we can easily access it
+ * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not.  Default is to NOT trust it.
+ *
+ * @package PlaceLocalInclude
+ */
+class simple_html_dom
+{
+        // Root node of the parsed document; created in prepare().
+        public $root = null;
+        // Node list; clear() calls clear() on every entry.
+        public $nodes = array();
+        // Callback set through set_callback() / remove_callback().
+        public $callback = null;
+        // Lowercase flag passed to load(); parse_selector() reads it to lowercase tag and attribute names.
+        public $lowercase = false;
+        // Used to keep track of how large the text was when we started.
+        public $original_size;
+        // Length of the document text that is actually parsed.
+        public $size;
+        // Parser state: byte offset into $doc, the document text, the character at $pos,
+        // a running counter incremented as nodes are created, and the node currently being filled.
+        protected $pos;
+        protected $doc;
+        protected $char;
+        protected $cursor;
+        protected $parent;
+        // Reset by prepare(); presumably filled by remove_noise() - TODO confirm.
+        protected $noise = array();
+        // Character sets handed to the skip / copy_until helpers while reading tags.
+        protected $token_blank = " \t\r\n";
+        protected $token_equal = ' =/>';
+        protected $token_slash = " />\r\n\t";
+        protected $token_attr = ' >';
+        // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
+        public $_charset = '';
+        public $_target_charset = '';
+        // Replacement texts handed to prepare(); presumably used when producing plain text - TODO confirm.
+        protected $default_br_text = "";
+        public $default_span_text = "";
+        // use isset instead of in_array, performance boost about 30%...
+        protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
+        protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
+        // Known sourceforge issue #2977341
+        // B tags that are not closed cause us to return everything to the end of the document.
+        // Keyed by tag name; read_tag() checks whether the current parent is listed here when an
+        // end tag does not match it.
+        protected $optional_closing_tags = array(
+                'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
+                'th'=>array('th'=>1),
+                'td'=>array('td'=>1),
+                'li'=>array('li'=>1),
+                'dt'=>array('dt'=>1, 'dd'=>1),
+                'dd'=>array('dd'=>1, 'dt'=>1),
+                'dl'=>array('dd'=>1, 'dt'=>1),
+                'p'=>array('p'=>1),
+                'nobr'=>array('nobr'=>1),
+                'b'=>array('b'=>1),
+                'option'=>array('option'=>1),
+        );
+        /**
+         * Create a dom object and optionally load a document right away.
+         *
+         * @param string|null $str             HTML text, or an http:// URL / existing file path to load from.
+         * @param bool        $lowercase       Passed to load() when $str is HTML text.
+         * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied.
+         * @param string      $target_charset  Charset stored as the target charset of this dom.
+         * @param bool        $stripRN         Passed to load() when $str is HTML text.
+         * @param string      $defaultBRText   Passed to load() when $str is HTML text.
+         * @param string      $defaultSpanText Passed to load() when $str is HTML text.
+         */
+        function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+        {
+                // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
+                // This has to be applied to the table the parser actually reads (optional_closing_tags)
+                // and before the document is loaded, otherwise the setting has no effect on parsing.
+                if (!$forceTagsClosed) {
+                        $this->optional_closing_tags=array();
+                }
+                if ($str)
+                {
+                        if (preg_match("/^http:\/\//i",$str) || is_file($str))
+                        {
+                                $this->load_file($str);
+                        }
+                        else
+                        {
+                                $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
+                        }
+                }
+                $this->_target_charset = $target_charset;
+        }
+        /**
+         * Release the node tree when the dom object goes away.
+         */
+        function __destruct() {
+                $this->clear();
+        }
+        /**
+         * Load html from a string and parse it into the node tree.
+         *
+         * The document is first handed to remove_noise() for comments, CDATA, script, style and code
+         * sections, server side code and smarty tags, then parsed, and finally the character set is determined.
+         *
+         * @param string $str             HTML text.
+         * @param bool   $lowercase       Passed on to prepare().
+         * @param bool   $stripRN         Passed on to prepare(), which replaces \r and \n by spaces when true.
+         * @param string $defaultBRText   Passed on to prepare().
+         * @param string $defaultSpanText Passed on to prepare().
+         * @return simple_html_dom This object, so that calls can be chained.
+         */
+        function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+        {
+                global $debug_object;
+                // prepare: reset all parser state for the new document
+                $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
+                // strip out comments
+                $this->remove_noise("'<!--(.*?)-->'is");
+                // strip out cdata
+                $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
+                // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
+                // Script tag removal now precedes style tag removal.
+                // strip out <script> tags (first with attributes, then the bare form)
+                $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
+                $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
+                // strip out <style> tags (first with attributes, then the bare form)
+                $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
+                $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
+                // strip out preformatted tags (only <code> is matched)
+                $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
+                // strip out server side scripts
+                $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
+                // strip smarty scripts
+                $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
+                // parsing: parse() returns false once the document is consumed
+                while ($this->parse());
+                // end: record the final cursor value on the root node
+                $this->root->_[HDOM_INFO_END] = $this->cursor;
+                $this->parse_charset();
+                // make load function chainable
+                return $this;
+        }
+        /**
+         * Load html from a file or URL.
+         *
+         * All arguments are forwarded to file_get_contents().
+         *
+         * @return false|null false when the content could not be read, nothing otherwise.
+         */
+        function load_file()
+        {
+                $args = func_get_args();
+                $contents = call_user_func_array('file_get_contents', $args);
+                // Check the read itself. error_get_last() would also report stale errors raised
+                // anywhere earlier in the request and make a successful load look like a failure.
+                if ($contents === false) {
+                        $this->clear();
+                        return false;
+                }
+                $this->load($contents, true);
+        }
+        /**
+         * Set the callback function.
+         *
+         * @param callable $function_name Callback to store on this dom object.
+         */
+        function set_callback($function_name) {
+                $this->callback = $function_name;
+        }
+        /**
+         * Remove the callback function by resetting it to null.
+         */
+        function remove_callback() {
+                $this->callback = null;
+        }
+        /**
+         * Save the dom as a string, optionally writing it to a file as well.
+         *
+         * @param string $filepath Target file; nothing is written when this is ''.
+         * @return string The inner text of the root node.
+         */
+        function save($filepath='')
+        {
+                $html = $this->root->innertext();
+                if ($filepath!=='') {
+                        // The file is written under an exclusive lock.
+                        file_put_contents($filepath, $html, LOCK_EX);
+                }
+                return $html;
+        }
+        /**
+         * Find dom nodes by css selector, starting at the root node.
+         * Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
+         *
+         * @param string   $selector  Selector text.
+         * @param int|null $idx       Forwarded unchanged to the root node's find().
+         * @param bool     $lowercase Forwarded unchanged to the root node's find().
+         * @return mixed Whatever the root node's find() returns.
+         */
+        function find($selector, $idx=null, $lowercase=false)
+        {
+                $root = $this->root;
+                return $root->find($selector, $idx, $lowercase);
+        }
+        /**
+         * Clean up memory due to the php5 circular references memory leak.
+         *
+         * Calls clear() on every node this object knows about and then drops its own references
+         * to the tree, the document text and the noise table.
+         */
+        function clear()
+        {
+                // $n = null only resets the loop variable; the node's own clear() does the real work.
+                foreach ($this->nodes as $n) {$n->clear(); $n = null;}
+                // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
+                if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;}
+                if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
+                if (isset($this->root)) {$this->root->clear(); unset($this->root);}
+                unset($this->doc);
+                unset($this->noise);
+        }
+        /**
+         * Dump the node tree by delegating to the root node's dump().
+         *
+         * @param bool $show_attr Forwarded unchanged to the root node.
+         */
+        function dump($show_attr=true) {
+                $root = $this->root;
+                $root->dump($show_attr);
+        }
+        /**
+         * Prepare the HTML data and initialise all parser state.
+         *
+         * @param string $str             HTML text.
+         * @param bool   $lowercase       Stored in $this->lowercase.
+         * @param bool   $stripRN         When true, every \r and \n in the text is replaced by a space.
+         * @param string $defaultBRText   Stored in $this->default_br_text.
+         * @param string $defaultSpanText Stored in $this->default_span_text.
+         */
+        protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
+        {
+                $this->clear();
+                // Save the size of the html as we got it in. It might be useful to someone.
+                $this->original_size = strlen($str);
+                // Before we keep the string as the doc, strip out the \r and \n's if we are told to.
+                if ($stripRN) {
+                        $str = str_replace(array("\r", "\n"), " ", $str);
+                }
+                // Length of the content that is actually parsed.
+                $this->size = strlen($str);
+                $this->doc = $str;
+                $this->pos = 0;
+                $this->cursor = 1;
+                $this->noise = array();
+                $this->nodes = array();
+                $this->lowercase = $lowercase;
+                $this->default_br_text = $defaultBRText;
+                $this->default_span_text = $defaultSpanText;
+                // Fresh root node; parsing starts with the root as the current parent.
+                $rootNode = new simple_html_dom_node($this);
+                $this->root = $rootNode;
+                $this->root->tag = 'root';
+                $this->root->_[HDOM_INFO_BEGIN] = -1;
+                $this->root->nodetype = HDOM_TYPE_ROOT;
+                $this->parent = $this->root;
+                if ($this->size>0) {
+                        $this->char = $this->doc[0];
+                }
+        }
+        /**
+         * Parse the next piece of html content: either a tag or a run of text.
+         *
+         * @return bool The result of read_tag() when no text precedes the next '<', true after a text node was added.
+         */
+        protected function parse()
+        {
+                $text = $this->copy_until_char('<');
+                if ($text==='') {
+                        // Nothing in front of the next '<': let read_tag() handle it.
+                        return $this->read_tag();
+                }
+                // text
+                $textNode = new simple_html_dom_node($this);
+                ++$this->cursor;
+                $textNode->_[HDOM_INFO_TEXT] = $text;
+                $this->link_nodes($textNode, false);
+                return true;
+        }
+        /**
+         * PAPERG - dkchou - Try to identify the character set of the page we have just parsed, so we know better how to spit it out later.
+         *
+         * NOTE: IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
+         * (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
+         *
+         * Order of the lookups: that content type header, the meta http-equiv=Content-Type tag,
+         * mb_detect_encoding() on the plain text, and finally a UTF-8 default.
+         *
+         * @return string The charset that was stored in $this->_charset.
+         */
+        protected function parse_charset()
+        {
+                global $debug_object;
+                $charset = null;
+                // 1) Content type header supplied by the optional user routine.
+                if (function_exists('get_last_retrieve_url_contents_content_type'))
+                {
+                        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
+                        $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
+                        if ($success)
+                        {
+                                $charset = $matches[1];
+                                if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
+                        }
+                }
+                // 2) The first meta http-equiv=Content-Type tag of the document.
+                if (empty($charset))
+                {
+                        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
+                        if (!empty($el))
+                        {
+                                $fullvalue = $el->content;
+                                if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}
+                                if (!empty($fullvalue))
+                                {
+                                        $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
+                                        if ($success)
+                                        {
+                                                $charset = $matches[1];
+                                        }
+                                        else
+                                        {
+                                                // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
+                                                if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
+                                                $charset = 'ISO-8859-1';
+                                        }
+                                }
+                        }
+                }
+                // 3) If we couldn't find a charset above, then lets try to detect one based on the text we got...
+                if (empty($charset))
+                {
+                        // Have php try to detect the encoding from the text given to us.
+                        $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );
+                        if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}
+                        // and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
+                        if ($charset === false)
+                        {
+                                if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
+                                $charset = 'UTF-8';
+                        }
+                }
+                // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
+                if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
+                {
+                        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
+                        $charset = 'CP1252';
+                }
+                if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}
+                // Store the result on the dom object and return it.
+                return $this->_charset = $charset;
+        }
+        // read tag info
+        protected function read_tag()
+        {
+                if ($this->char!=='<')
+                {
+                        $this->root->_[HDOM_INFO_END] = $this->cursor;
+                        return false;
+                }
+                $begin_tag_pos = $this->pos;
+                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                // end tag
+                if ($this->char==='/')
+                {
+                        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
+                        // $this->skip($this->token_blank_t);
+                        $this->skip($this->token_blank);
+                        $tag = $this->copy_until_char('>');
+                        // skip attributes in end tag
+                        if (($pos = strpos($tag, ' '))!==false)
+                                $tag = substr($tag, 0, $pos);
+                        $parent_lower = strtolower($this->parent->tag);
+                        $tag_lower = strtolower($tag);
+                        if ($parent_lower!==$tag_lower)
+                        {
+                                if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
+                                {
+                                        $this->parent->_[HDOM_INFO_END] = 0;
+                                        $org_parent = $this->parent;
+                                        while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
+                                                $this->parent = $this->parent->parent;
+                                        if (strtolower($this->parent->tag)!==$tag_lower) {
+                                                $this->parent = $org_parent; // restore origonal parent
+                                                if ($this->parent->parent) $this->parent = $this->parent->parent;
+                                                $this->parent->_[HDOM_INFO_END] = $this->cursor;
+                                                return $this->as_text_node($tag);
+                                        }
+                                }
+                                else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
+                                {
+                                        $this->parent->_[HDOM_INFO_END] = 0;
+                                        $org_parent = $this->parent;
+                                        while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
+                                                $this->parent = $this->parent->parent;
+                                        if (strtolower($this->parent->tag)!==$tag_lower)
+                                        {
+                                                $this->parent = $org_parent; // restore origonal parent
+                                                $this->parent->_[HDOM_INFO_END] = $this->cursor;
+                                                return $this->as_text_node($tag);
+                                        }
+                                }
+                                else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
+                                {
+                                        $this->parent->_[HDOM_INFO_END] = 0;
+                                        $this->parent = $this->parent->parent;
+                                }
+                                else
+                                        return $this->as_text_node($tag);
+                        }
+                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
+                        if ($this->parent->parent) $this->parent = $this->parent->parent;
+                        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                        return true;
+                }
+                $node = new simple_html_dom_node($this);
+                $node->_[HDOM_INFO_BEGIN] = $this->cursor;
+                ++$this->cursor;
+                $tag = $this->copy_until($this->token_slash);
+                $node->tag_start = $begin_tag_pos;
+                // doctype, cdata & comments...
+                if (isset($tag[0]) && $tag[0]==='!') {
+                        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
+                        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
+                                $node->nodetype = HDOM_TYPE_COMMENT;
+                                $node->tag = 'comment';
+                        } else {
+                                $node->nodetype = HDOM_TYPE_UNKNOWN;
+                                $node->tag = 'unknown';
+                        }
+                        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
+                        $this->link_nodes($node, true);
+                        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                        return true;
+                }
+                // text
+                if ($pos=strpos($tag, '<')!==false) {
+                        $tag = '<' . substr($tag, 0, -1);
+                        $node->_[HDOM_INFO_TEXT] = $tag;
+                        $this->link_nodes($node, false);
+                        $this->char = $this->doc[--$this->pos]; // prev
+                        return true;
+                }
+                if (!preg_match("/^[\w-:]+$/", $tag)) {
+                        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
+                        if ($this->char==='<') {
+                                $this->link_nodes($node, false);
+                                return true;
+                        }
+                        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
+                        $this->link_nodes($node, false);
+                        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                        return true;
+                }
+                // begin tag
+                $node->nodetype = HDOM_TYPE_ELEMENT;
+                $tag_lower = strtolower($tag);
+                $node->tag = ($this->lowercase) ? $tag_lower : $tag;
+                // handle optional closing tags
+                if (isset($this->optional_closing_tags[$tag_lower]) )
+                {
+                        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
+                        {
+                                $this->parent->_[HDOM_INFO_END] = 0;
+                                $this->parent = $this->parent->parent;
+                        }
+                        $node->parent = $this->parent;
+                }
+                $guard = 0; // prevent infinity loop
+                $space = array($this->copy_skip($this->token_blank), '', '');
+                // attributes
+                do
+                {
+                        if ($this->char!==null && $space[0]==='')
+                        {
+                                break;
+                        }
+                        $name = $this->copy_until($this->token_equal);
+                        if ($guard===$this->pos)
+                        {
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                continue;
+                        }
+                        $guard = $this->pos;
+                        // handle endless '<'
+                        if ($this->pos>=$this->size-1 && $this->char!=='>') {
+                                $node->nodetype = HDOM_TYPE_TEXT;
+                                $node->_[HDOM_INFO_END] = 0;
+                                $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
+                                $node->tag = 'text';
+                                $this->link_nodes($node, false);
+                                return true;
+                        }
+                        // handle mismatch '<'
+                        if ($this->doc[$this->pos-1]=='<') {
+                                $node->nodetype = HDOM_TYPE_TEXT;
+                                $node->tag = 'text';
+                                $node->attr = array();
+                                $node->_[HDOM_INFO_END] = 0;
+                                $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
+                                $this->pos -= 2;
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                $this->link_nodes($node, false);
+                                return true;
+                        }
+                        if ($name!=='/' && $name!=='') {
+                                $space[1] = $this->copy_skip($this->token_blank);
+                                $name = $this->restore_noise($name);
+                                if ($this->lowercase) $name = strtolower($name);
+                                if ($this->char==='=') {
+                                        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                        $this->parse_attr($node, $name, $space);
+                                }
+                                else {
+                                        //no value attr: nowrap, checked selected...
+                                        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
+                                        $node->attr[$name] = true;
+                                        if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
+                                }
+                                $node->_[HDOM_INFO_SPACE][] = $space;
+                                $space = array($this->copy_skip($this->token_blank), '', '');
+                        }
+                        else
+                                break;
+                } while ($this->char!=='>' && $this->char!=='/');
+                $this->link_nodes($node, true);
+                $node->_[HDOM_INFO_ENDSPACE] = $space[0];
+                // check self closing
+                if ($this->copy_until_char_escape('>')==='/')
+                {
+                        $node->_[HDOM_INFO_ENDSPACE] .= '/';
+                        $node->_[HDOM_INFO_END] = 0;
+                }
+                else
+                {
+                        // reset parent
+                        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
+                }
+                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                // If it's a BR tag, we need to set it's text to the default text.
+                // This way when we see it in plaintext, we can generate formatting that the user wants.
+                // since a br tag never has sub nodes, this works well.
+                if ($node->tag == "br")
+                {
+                        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
+                }
+                return true;
+        }
+        // parse attributes
+        /**
+         * Parse the value of an attribute whose name and "=" sign have just been read.
+         *
+         * The value may be double-quoted, single-quoted or unquoted (ended by one of
+         * the characters in $this->token_attr).
+         *
+         * Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
+         * If the attribute is already defined inside a tag, only pay attention to the
+         * first one as opposed to the last one. The duplicate's value is still read
+         * from the input (and thrown away) so that the parser is left positioned after
+         * it; otherwise parsing of the tag stalls on the unread value and every
+         * attribute that follows the duplicate is lost.
+         *
+         * @param object $node  node that receives the attribute value and its quote type
+         * @param string $name  attribute name
+         * @param array  $space whitespace triple of the attribute; index 2 receives the
+         *                      blanks found between "=" and the value
+         * @return void
+         */
+        protected function parse_attr($node, $name, &$space)
+        {
+                $is_duplicate = isset($node->attr[$name]);
+                // blanks between "=" and the value are always skipped, but only recorded for a kept attribute
+                $blank = $this->copy_skip($this->token_blank);
+                if (!$is_duplicate)
+                {
+                        $space[2] = $blank;
+                }
+                switch ($this->char) {
+                        case '"':
+                                $quote_type = HDOM_QUOTE_DOUBLE;
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                $value = $this->copy_until_char_escape('"');
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                break;
+                        case '\'':
+                                $quote_type = HDOM_QUOTE_SINGLE;
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                $value = $this->copy_until_char_escape('\'');
+                                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                                break;
+                        default:
+                                $quote_type = HDOM_QUOTE_NO;
+                                $value = $this->copy_until($this->token_attr);
+                }
+                if ($is_duplicate)
+                {
+                        // value consumed, first definition wins
+                        return;
+                }
+                $value = $this->restore_noise($value);
+                // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
+                $value = str_replace(array("\r", "\n"), '', $value);
+                // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
+                if ($name === 'class') {
+                        $value = trim($value);
+                }
+                $node->_[HDOM_INFO_QUOTE][] = $quote_type;
+                $node->attr[$name] = $value;
+        }
+        // link node's parent
+        /**
+         * Attach a node to the parent the parser is currently working in.
+         *
+         * The node is always appended to the parent's node list; it is appended to
+         * the parent's children list as well only when $is_child is truthy.
+         *
+         * @param object $node     node to attach (its parent property is set)
+         * @param bool   $is_child whether the node also counts as a child
+         * @return void
+         */
+        protected function link_nodes(&$node, $is_child)
+        {
+                $this->parent->nodes[] = $node;
+                if ($is_child) $this->parent->children[] = $node;
+                // back-link the node to the parent it was just registered with
+                $node->parent = $this->parent;
+        }
+        // as a text node
+        /**
+         * Keep an end tag as literal text: a new node holding "</tag>" is created and
+         * linked to the current parent as a non-child node.
+         *
+         * @param string $tag tag name placed between "</" and ">"
+         * @return bool always true
+         */
+        protected function as_text_node($tag)
+        {
+                $text_node = new simple_html_dom_node($this);
+                $this->cursor++;
+                $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
+                $this->link_nodes($text_node, false);
+                // move on to the character after the current one (null once the end is reached)
+                $this->pos++;
+                if ($this->pos<$this->size)
+                {
+                        $this->char = $this->doc[$this->pos];
+                }
+                else
+                {
+                        $this->char = null;
+                }
+                return true;
+        }
+        /**
+         * Advance the read position over a leading run of the given characters and
+         * refresh the current character (null when the end of the document is reached).
+         *
+         * @param string $chars set of characters to skip
+         * @return void
+         */
+        protected function skip($chars)
+        {
+                $run = strspn($this->doc, $chars, $this->pos);
+                $this->pos += $run;
+                if ($this->pos<$this->size)
+                {
+                        $this->char = $this->doc[$this->pos];
+                }
+                else
+                {
+                        $this->char = null;
+                }
+        }
+        /**
+         * Like skip(), but also returns the characters that were skipped.
+         *
+         * @param string $chars set of characters to skip
+         * @return string the skipped run, or '' when nothing was skipped
+         */
+        protected function copy_skip($chars)
+        {
+                $begin = $this->pos;
+                $run = strspn($this->doc, $chars, $begin);
+                $this->pos += $run;
+                if ($this->pos<$this->size)
+                {
+                        $this->char = $this->doc[$this->pos];
+                }
+                else
+                {
+                        $this->char = null;
+                }
+                return ($run===0) ? '' : substr($this->doc, $begin, $run);
+        }
+        /**
+         * Copy characters from the current position up to (not including) the first
+         * character that belongs to $chars, and move the read position there.
+         *
+         * @param string $chars set of characters that stop the copy
+         * @return string the copied text, or '' when nothing was copied
+         */
+        protected function copy_until($chars)
+        {
+                $pos = $this->pos;
+                $len = strcspn($this->doc, $chars, $pos);
+                $this->pos += $len;
+                $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
+                // Same guard as copy_skip(): on PHP 5 substr() returns false (not '') when the
+                // start offset equals the string length, which would leak a boolean to callers.
+                if ($len===0) return '';
+                return substr($this->doc, $pos, $len);
+        }
+        /**
+         * Copy characters from the current position up to (not including) the next
+         * occurrence of $char and stop on that occurrence.
+         *
+         * When $char does not occur any more, the remainder of the document is
+         * returned and the parser is moved to the end (current character null).
+         *
+         * @param string $char delimiter to look for
+         * @return string the copied text; '' when the parser is already at the end or
+         *                already on the delimiter
+         */
+        protected function copy_until_char($char)
+        {
+                if ($this->char===null)
+                {
+                        return '';
+                }
+                $from = $this->pos;
+                $hit = strpos($this->doc, $char, $from);
+                if ($hit===false)
+                {
+                        // delimiter absent: hand back everything that is left
+                        $rest = substr($this->doc, $from, $this->size-$from);
+                        $this->char = null;
+                        $this->pos = $this->size;
+                        return $rest;
+                }
+                if ($hit===$from)
+                {
+                        return '';
+                }
+                $this->char = $this->doc[$hit];
+                $this->pos = $hit;
+                return substr($this->doc, $from, $hit-$from);
+        }
+        /**
+         * Variant of copy_until_char() that ignores occurrences of $char which are
+         * directly preceded by a backslash.
+         *
+         * When no acceptable occurrence is left, the remainder of the document is
+         * returned and the parser is moved to the end (current character null).
+         *
+         * @param string $char delimiter to look for
+         * @return string the copied text; '' when the parser is already at the end or
+         *                already on the delimiter
+         */
+        protected function copy_until_char_escape($char)
+        {
+                if ($this->char===null)
+                {
+                        return '';
+                }
+                $search_from = $this->pos;
+                for (;;)
+                {
+                        $hit = strpos($this->doc, $char, $search_from);
+                        if ($hit===false)
+                        {
+                                // nothing usable found: consume the rest of the document
+                                $rest = substr($this->doc, $this->pos, $this->size-$this->pos);
+                                $this->char = null;
+                                $this->pos = $this->size;
+                                return $rest;
+                        }
+                        if ($hit===$this->pos)
+                        {
+                                return '';
+                        }
+                        if ($this->doc[$hit-1]!=='\\')
+                        {
+                                break;
+                        }
+                        // preceded by a backslash: keep searching right after this occurrence
+                        $search_from = $hit+1;
+                }
+                $begin = $this->pos;
+                $this->char = $this->doc[$hit];
+                $this->pos = $hit;
+                return substr($this->doc, $begin, $hit-$begin);
+        }
+        // remove noise from html content
+        // save the noise in the $this->noise array.
+        /**
+         * Replace every match of $pattern in the document with a placeholder key and
+         * remember the replaced text in $this->noise under that key.
+         *
+         * Matches are processed from the last one to the first one so that the byte
+         * offsets reported by preg_match_all() stay valid while the document is edited.
+         *
+         * @param string $pattern    PCRE pattern to search for
+         * @param bool   $remove_tag true: replace the whole match (group 0);
+         *                           false: replace only capture group 1
+         * @return void
+         */
+        protected function remove_noise($pattern, $remove_tag=false)
+        {
+                global $debug_object;
+                if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+                $found = preg_match_all($pattern, $this->doc, $hits, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
+                // which capture group gets swapped for the placeholder
+                $group = ($remove_tag) ? 0 : 1;
+                for ($i=$found-1; $i>=0; $i--)
+                {
+                        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
+                        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }
+                        $fragment = $hits[$i][$group][0];
+                        $offset = $hits[$i][$group][1];
+                        $this->noise[$key] = $fragment;
+                        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
+                }
+                // the document length may have changed
+                $this->size = strlen($this->doc);
+                if ($this->size>0)
+                {
+                        $this->char = $this->doc[0];
+                }
+        }
+        // restore noise to html content
+        /**
+         * Replace every noise placeholder found in $text with the text stored for it
+         * in $this->noise.
+         *
+         * A placeholder is the marker "___noise___" followed by five key characters.
+         * A placeholder whose key is unknown is replaced by an "UNDEFINED NOISE FOR
+         * KEY: ..." message; a marker that is not followed by five more characters is
+         * replaced by "NO NUMERIC NOISE KEY".
+         *
+         * @param string $text text that may contain noise placeholders
+         * @return string the text with the placeholders substituted
+         */
+        function restore_noise($text)
+        {
+                global $debug_object;
+                if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+                // Where the next search starts. It is moved past inserted text that must not be scanned again.
+                $offset = 0;
+                while (($pos=strpos($text, '___noise___', $offset))!==false)
+                {
+                        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
+                        if (strlen($text) > $pos+15)
+                        {
+                                // marker (11 characters) plus the five key characters
+                                $key = substr($text, $pos, 16);
+                                if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }
+                                if (isset($this->noise[$key]))
+                                {
+                                        $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
+                                        // The restored text is scanned again on purpose: it may contain placeholders of its own.
+                                        $offset = $pos;
+                                }
+                                else
+                                {
+                                        // The message repeats the key, which itself contains the marker. Searching must
+                                        // resume after the message, otherwise the same key is found again and the loop never ends.
+                                        $message = 'UNDEFINED NOISE FOR KEY: '.$key;
+                                        $text = substr($text, 0, $pos).$message.substr($text, $pos+16);
+                                        $offset = $pos+strlen($message);
+                                }
+                        }
+                        else
+                        {
+                                // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
+                                $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
+                                $offset = $pos;
+                        }
+                }
+                return $text;
+        }
+        // Sometimes we NEED one of the noise elements.
+        /**
+         * Look through the stored noise for the first element that contains $text.
+         *
+         * @param string $text substring to look for
+         * @return string|null the first matching noise element, or null when none matches
+         */
+        function search_noise($text)
+        {
+                global $debug_object;
+                if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }
+                foreach ($this->noise as $fragment)
+                {
+                        if (strpos($fragment, $text)===false)
+                        {
+                                continue;
+                        }
+                        return $fragment;
+                }
+                return null;
+        }
+        /**
+         * String conversion: yields the inner text of the root node.
+         *
+         * @return string
+         */
+        function __toString()
+        {
+                $markup = $this->root->innertext();
+                return $markup;
+        }
+        /**
+         * Magic read access to a few virtual properties.
+         *
+         * 'outertext' and 'innertext' both yield the root node's innertext(),
+         * 'plaintext' yields the root node's text(), 'charset' and 'target_charset'
+         * yield the corresponding private fields. Any other name yields null.
+         *
+         * @param string $name property being read
+         * @return mixed
+         */
+        function __get($name)
+        {
+                switch ($name)
+                {
+                        case 'outertext':
+                        case 'innertext':
+                                return $this->root->innertext();
+                        case 'plaintext':
+                                return $this->root->text();
+                        case 'charset':
+                                return $this->_charset;
+                        case 'target_charset':
+                                return $this->_target_charset;
+                }
+                return null;
+        }
+        // camel naming conventions
+        /**
+         * Camel-case alias: delegates to the root node's childNodes().
+         *
+         * @param int $idx forwarded unchanged (defaults to -1)
+         * @return mixed whatever the root node's childNodes() returns
+         */
+        function childNodes($idx=-1)
+        {
+                return $this->root->childNodes($idx);
+        }
+        /**
+         * Camel-case alias: delegates to the root node's first_child().
+         *
+         * @return mixed whatever the root node's first_child() returns
+         */
+        function firstChild()
+        {
+                return $this->root->first_child();
+        }
+        /**
+         * Camel-case alias: delegates to the root node's last_child().
+         *
+         * @return mixed whatever the root node's last_child() returns
+         */
+        function lastChild()
+        {
+                return $this->root->last_child();
+        }
+        /**
+         * DOM-style helper: parses the markup "<$name>$value</$name>" with str_get_html()
+         * and returns the first child of the parsed document.
+         *
+         * $name and $value are interpolated into the markup as they are. The @ operator
+         * applies to the whole chained expression.
+         * NOTE(review): str_get_html() is defined outside this chunk; if it can return a
+         * non-object, the chained first_child() call would still fail - confirm.
+         */
+        function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}
+        /**
+         * DOM-style helper: parses $value with str_get_html() and returns the last entry
+         * of the parsed document's nodes array (end() yields false for an empty array).
+         *
+         * The @ operator applies to the whole expression.
+         * NOTE(review): str_get_html() is defined outside this chunk; if it can return a
+         * non-object, reading ->nodes would not give a usable array - confirm.
+         */
+        function createTextNode($value) {return @end(str_get_html($value)->nodes);}
+        /**
+         * Camel-case helper: runs find() with the selector "#$id" and index 0.
+         *
+         * @param string $id element id, placed after "#" without escaping
+         * @return mixed whatever find() returns for that selector and index
+         */
+        function getElementById($id)
+        {
+                return $this->find("#$id", 0);
+        }
+        /**
+         * Camel-case helper: runs find() with the selector "#$id".
+         *
+         * @param string   $id  element id, placed after "#" without escaping
+         * @param int|null $idx forwarded unchanged to find() (defaults to null)
+         * @return mixed whatever find() returns
+         */
+        function getElementsById($id, $idx=null)
+        {
+                return $this->find("#$id", $idx);
+        }
+        /**
+         * Camel-case helper: runs find() with $name as selector and index 0.
+         *
+         * @param string $name selector passed to find() as is
+         * @return mixed whatever find() returns for that selector and index
+         */
+        function getElementByTagName($name)
+        {
+                return $this->find($name, 0);
+        }
+        /**
+         * Camel-case helper: runs find() with $name as selector.
+         *
+         * @param string $name selector passed to find() as is
+         * @param int    $idx  forwarded unchanged to find() (defaults to -1)
+         * @return mixed whatever find() returns
+         */
+        function getElementsByTagName($name, $idx=-1)
+        {
+                return $this->find($name, $idx);
+        }
+        /**
+         * Camel-case alias of load_file().
+         *
+         * Every argument is forwarded positionally. Previously the collected argument
+         * array was handed over as ONE argument, so load_file() received an array where
+         * the first argument was expected, and the result of load_file() was dropped.
+         * NOTE(review): load_file() is defined outside this chunk; it is assumed to take
+         * its arguments positionally - confirm.
+         *
+         * @return mixed whatever load_file() returns
+         */
+        function loadFile()
+        {
+                $args = func_get_args();
+                return call_user_func_array(array($this, 'load_file'), $args);
+        }
+}
+?>
+\ No newline at end of file
author	Nicolas Lœuillet <nicolas.loeuillet@gmail.com>	2013-08-02 22:40:51 +0200
committer	Nicolas Lœuillet <nicolas.loeuillet@gmail.com>	2013-08-02 22:40:51 +0200
commit	a4565e88edbc8e3bd092a475469769c86a4c350c (patch)
tree	a6a3c935b03a23ff87575c8c315cf8ba78fe68c2 /inc/3rdparty
parent	f6c9baab3efeec1d0efa151e276fc08d5b58f9e9 (diff)
download	wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.gz wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.tar.zst wallabag-a4565e88edbc8e3bd092a475469769c86a4c350c.zip