* @author S.C. Chen <me578022@gmail.com>\r
* @author John Schlick\r
* @author Rus Carroll\r
- * @version 1.5 ($Rev: 202 $)\r
+ * @version 1.5 ($Rev: 210 $)\r
* @package PlaceLocalInclude\r
* @subpackage simple_html_dom\r
*/\r
{\r
return $this->children;\r
}\r
- if (isset($this->children[$idx])) return $this->children[$idx];\r
+ if (isset($this->children[$idx]))\r
+ {\r
+ return $this->children[$idx];\r
+ }\r
return null;\r
}\r
\r
function find_ancestor_tag($tag)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }\r
\r
// Start by including ourselves in the comparison.\r
$returnDom = $this;\r
\r
while (!is_null($returnDom))\r
{\r
- if (is_object($debug_object)) { $debug_object->debugLog(2, "Current tag is: " . $returnDom->tag); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log(2, "Current tag is: " . $returnDom->tag); }\r
\r
if ($returnDom->tag == $tag)\r
{\r
$text = " with text: " . $this->text;\r
}\r
}\r
- $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);\r
+ $debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);\r
}\r
\r
if ($this->tag==='root') return $this->innertext();\r
foreach ($head as $k=>$v)\r
{\r
if (!isset($found_keys[$k]))\r
+ {\r
$found_keys[$k] = 1;\r
+ }\r
}\r
}\r
\r
protected function seek($selector, &$ret, $lowercase=false)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }\r
\r
list($tag, $key, $val, $exp, $no_key) = $selector;\r
\r
// this is a normal search, we want the value of that attribute of the tag.\r
$nodeKeyValue = $node->attr[$key];\r
}\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}\r
\r
//PaperG - If lowercase is set, do a case insensitive test of the value of the selector.\r
if ($lowercase) {\r
} else {\r
$check = $this->match($exp, $val, $nodeKeyValue);\r
}\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, "after match: " . ($check ? "true" : "false"));}\r
\r
// handle multiple class\r
if (!$check && strcasecmp($key, 'class')===0) {\r
unset($node);\r
}\r
// It's passed by reference so this is actually what this function returns.\r
- if (is_object($debug_object)) {$debug_object->debugLog(1, "EXIT - ret: ", $ret);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(1, "EXIT - ret: ", $ret);}\r
}\r
\r
protected function match($exp, $pattern, $value) {\r
global $debug_object;\r
- if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}\r
\r
switch ($exp) {\r
case '=':\r
\r
protected function parse_selector($selector_string) {\r
global $debug_object;\r
- if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}\r
\r
// pattern of CSS selectors, modified from mootools\r
// Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.\r
// $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";\r
$pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";\r
preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, "Matches Array: ", $matches);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, "Matches Array: ", $matches);}\r
\r
$selectors = array();\r
$result = array();\r
return $selectors;\r
}\r
\r
- function __get($name) {\r
+ function __get($name)\r
+ {\r
if (isset($this->attr[$name]))\r
{\r
return $this->convert_text($this->attr[$name]);\r
}\r
- switch ($name) {\r
+ switch ($name)\r
+ {\r
case 'outertext': return $this->outertext();\r
case 'innertext': return $this->innertext();\r
case 'plaintext': return $this->text();\r
}\r
}\r
\r
- function __set($name, $value) {\r
- switch ($name) {\r
+ function __set($name, $value)\r
+ {\r
+ global $debug_object;\r
+ if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}\r
+\r
+ switch ($name)\r
+ {\r
case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;\r
case 'innertext':\r
if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;\r
return $this->_[HDOM_INFO_INNER] = $value;\r
}\r
- if (!isset($this->attr[$name])) {\r
+ if (!isset($this->attr[$name]))\r
+ {\r
$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');\r
$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;\r
}\r
$this->attr[$name] = $value;\r
}\r
\r
- function __isset($name) {\r
- switch ($name) {\r
+ function __isset($name)\r
+ {\r
+ switch ($name)\r
+ {\r
case 'outertext': return true;\r
case 'innertext': return true;\r
case 'plaintext': return true;\r
function convert_text($text)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) {$debug_object->debugLogEntry(1);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}\r
\r
$converted_text = $text;\r
\r
$sourceCharset = strtoupper($this->dom->_charset);\r
$targetCharset = strtoupper($this->dom->_target_charset);\r
}\r
- if (is_object($debug_object)) {$debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);}\r
\r
if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0))\r
{\r
\r
// prepare\r
$this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);\r
- // strip out comments\r
- $this->remove_noise("'<!--(.*?)-->'is");\r
// strip out cdata\r
$this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);\r
+ // strip out comments\r
+ $this->remove_noise("'<!--(.*?)-->'is");\r
// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037\r
// Script tags removal now preceeds style tag removal.\r
// strip out <script> tags\r
// load html from file\r
function load_file()\r
{\r
+ // Snapshot any pre-existing error: it is NOT related to dom loading and must not be mistaken for a load failure below.\r
+ $extError=error_get_last();\r
+\r
$args = func_get_args();\r
$this->load(call_user_func_array('file_get_contents', $args), true);\r
+\r
// Throw an error if we can't properly load the dom.\r
- if (($error=error_get_last())!==null) {\r
+ $error=error_get_last();\r
+ if ($error!==$extError) {\r
$this->clear();\r
return false;\r
}\r
if ($success)\r
{\r
$charset = $matches[1];\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}\r
}\r
\r
}\r
\r
if (empty($charset))\r
{\r
- $el = $this->root->find('meta[http-equiv=Content-Type]',0);\r
+ $el = $this->root->find('meta[http-equiv=Content-Type]',0, true);\r
if (!empty($el))\r
{\r
$fullvalue = $el->content;\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}\r
\r
if (!empty($fullvalue))\r
{\r
- $success = preg_match('/charset=(.+)/', $fullvalue, $matches);\r
+ $success = preg_match('/charset=(.+)/i', $fullvalue, $matches);\r
if ($success)\r
{\r
$charset = $matches[1];\r
else\r
{\r
// If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}\r
$charset = 'ISO-8859-1';\r
}\r
}\r
// If we couldn't find a charset above, then lets try to detect one based on the text we got...\r
if (empty($charset))\r
{\r
- // Have php try to detect the encoding from the text given to us.\r
- $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}\r
+ // Use this in case mb_detect_encoding isn't installed/loaded on this machine.\r
+ $charset = false;\r
+ if (function_exists('mb_detect_encoding'))\r
+ {\r
+ // Have php try to detect the encoding from the text given to us.\r
+ $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) );\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}\r
+ }\r
\r
// and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...\r
if ($charset === false)\r
{\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}\r
$charset = 'UTF-8';\r
}\r
}\r
// Since CP1252 is a superset, if we get one of it's subsets, we want it instead.\r
if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))\r
{\r
- if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}\r
$charset = 'CP1252';\r
}\r
\r
- if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}\r
+ if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}\r
\r
return $this->_charset = $charset;\r
}\r
protected function remove_noise($pattern, $remove_tag=false)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }\r
\r
$count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);\r
\r
for ($i=$count-1; $i>-1; --$i)\r
{\r
$key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);\r
- if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }\r
$idx = ($remove_tag) ? 0 : 1;\r
$this->noise[$key] = $matches[$i][$idx][0];\r
$this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));\r
function restore_noise($text)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }\r
\r
while (($pos=strpos($text, '___noise___'))!==false)\r
{\r
if (strlen($text) > $pos+15)\r
{\r
$key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];\r
- if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }\r
\r
if (isset($this->noise[$key]))\r
{\r
function search_noise($text)\r
{\r
global $debug_object;\r
- if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }\r
+ if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }\r
\r
foreach($this->noise as $noiseElement)\r
{\r