]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/simple_html_dom.php
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
7 * Yousuke Kumakura (Attribute filters)
8 * Vadim Voituk (Negative indexes supports of "find" method)
9 * Antcs (Constructor with automatically load contents either text or file/url)
11 * all affected sections have comments starting with "PaperG"
13 * Paperg - Added case insensitive testing of the value of the selector.
14 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15 * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls so it will not reflect the REAL position of the tag in the source,
16 * it will almost always be smaller by some amount.
17 * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
18 * but for most purposes, it's a really good estimation.
19 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 * Allow the user to tell us how much they trust the html.
21 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 * This allows for us to find tags based on the text they contain.
23 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 * Paperg: added parse_charset so that we know about the character set of the source document.
25 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
28 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
31 * Licensed under The MIT License
32 * Redistributions of files must retain the above copyright notice.
34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick
37 * @version 1.5 ($Rev: 202 $)
38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom
43 * All of the Defines for the classes below.
44 * @author S.C. Chen <me578022@gmail.com>
// Node type identifiers. simple_html_dom_node::$nodetype defaults to HDOM_TYPE_TEXT,
// and the plain-text extraction switches on TEXT / COMMENT / UNKNOWN.
46 define ( 'HDOM_TYPE_ELEMENT' , 1 );
47 define ( 'HDOM_TYPE_COMMENT' , 2 );
48 define ( 'HDOM_TYPE_TEXT' , 3 );
49 define ( 'HDOM_TYPE_ENDTAG' , 4 );
50 define ( 'HDOM_TYPE_ROOT' , 5 );
51 define ( 'HDOM_TYPE_UNKNOWN' , 6 );
// Attribute quoting styles, read per attribute from the node's _[HDOM_INFO_QUOTE] list
// when a tag is rebuilt: DOUBLE emits ", SINGLE emits ', anything else emits no quote.
// Note that HDOM_QUOTE_NO is 3; the value 2 is not assigned to any name here.
52 define ( 'HDOM_QUOTE_DOUBLE' , 0 );
53 define ( 'HDOM_QUOTE_SINGLE' , 1 );
54 define ( 'HDOM_QUOTE_NO' , 3 );
// Keys into a node's "info" array (the $_ member), e.g. $node->_[HDOM_INFO_BEGIN].
55 define ( 'HDOM_INFO_BEGIN' , 0 );
56 define ( 'HDOM_INFO_END' , 1 );
57 define ( 'HDOM_INFO_QUOTE' , 2 );
58 define ( 'HDOM_INFO_SPACE' , 3 );
59 define ( 'HDOM_INFO_TEXT' , 4 );
60 define ( 'HDOM_INFO_INNER' , 5 );
61 define ( 'HDOM_INFO_OUTER' , 6 );
62 define ( 'HDOM_INFO_ENDSPACE' , 7 );
// Defaults for the $target_charset, $defaultBRText and $defaultSpanText parameters of
// file_get_html(), str_get_html() and the simple_html_dom constructor/load().
63 define ( 'DEFAULT_TARGET_CHARSET' , 'UTF-8' );
64 define ( 'DEFAULT_BR_TEXT' , " \r\n " );
65 define ( 'DEFAULT_SPAN_TEXT' , " " );
// Size limit in bytes: file_get_html() and str_get_html() compare strlen() of their input
// against this value. NOTE(review): the branch taken when the limit is exceeded is not
// visible in this view - presumably the input is rejected; confirm.
66 define ( 'MAX_FILE_SIZE' , 600000 );
68 // -----------------------------------------------------------------------------
69 // get html dom from file
70 // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
71 function file_get_html ( $url , $use_include_path = false , $context = null , $offset = - 1 , $maxLen =- 1 , $lowercase = true , $forceTagsClosed = true , $target_charset = DEFAULT_TARGET_CHARSET
, $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT
, $defaultSpanText = DEFAULT_SPAN_TEXT
)
73 // We DO force the tags to be terminated.
74 $dom = new simple_html_dom ( null , $lowercase , $forceTagsClosed , $target_charset , $stripRN , $defaultBRText , $defaultSpanText );
75 // For sourceforge users: uncomment the next line and comment the retrieve_url_contents line 2 lines down if it is not already done.
76 $contents = file_get_contents ( $url , $use_include_path , $context , $offset );
77 // Paperg - use our own mechanism for getting the contents as we want to control the timeout.
78 //$contents = retrieve_url_contents($url);
79 if ( empty ( $contents ) || strlen ( $contents ) > MAX_FILE_SIZE
)
83 // The second parameter can force the selectors to all be lowercase.
84 $dom- > load ( $contents , $lowercase , $stripRN );
88 // get html dom from string
89 function str_get_html ( $str , $lowercase = true , $forceTagsClosed = true , $target_charset = DEFAULT_TARGET_CHARSET
, $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT
, $defaultSpanText = DEFAULT_SPAN_TEXT
)
91 $dom = new simple_html_dom ( null , $lowercase , $forceTagsClosed , $target_charset , $stripRN , $defaultBRText , $defaultSpanText );
92 if ( empty ( $str ) || strlen ( $str ) > MAX_FILE_SIZE
)
97 $dom- > load ( $str , $lowercase , $stripRN );
101 // dump html dom tree
102 function dump_html_tree ( $node , $show_attr = true , $deep = 0 )
109 * simple html dom node
110 * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 * PaperG - added $tag_start to track the start position of the tag in the total byte index
113 * @package PlaceLocalInclude
115 class simple_html_dom_node
117 public $nodetype = HDOM_TYPE_TEXT
;
118 public $tag = 'text' ;
119 public $attr = array ();
120 public $children = array ();
121 public $nodes = array ();
122 public $parent = null ;
123 // The "info" array - see HDOM_INFO_... for what each element contains.
125 public $tag_start = 0 ;
128 function __construct ( $dom )
131 $dom- > nodes
[] = $this ;
134 function __destruct ()
/**
 * String conversion hook: casting a node to string yields its outer text.
 *
 * @return mixed whatever outertext() returns for this node
 */
function __toString()
{
    $rendered = $this->outertext();

    return $rendered;
}
144 // clean up memory due to php5 circular references memory leak...
149 $this- > parent
= null ;
150 $this- > children
= null ;
154 function dump ( $show_attr = true , $deep = 0 )
156 $lead = str_repeat ( ' ' , $deep );
158 echo $lead . $this- > tag
;
159 if ( $show_attr && count ( $this- > attr
)> 0 )
162 foreach ( $this- > attr
as $k => $v )
163 echo "[ $k ]=> \" " . $this- > $k . '", ' ;
170 foreach ( $this- > nodes
as $c )
172 $c- > dump ( $show_attr , $deep +
1 );
178 // Debugging function to dump a single dom node with a bunch of information about it.
179 function dump_node ( $echo = true )
182 $string = $this- > tag
;
183 if ( count ( $this- > attr
)> 0 )
186 foreach ( $this- > attr
as $k => $v )
188 $string .= "[ $k ]=> \" " . $this- > $k . '", ' ;
192 if ( count ( $this- > _
)> 0 )
195 foreach ( $this- > _
as $k => $v )
199 $string .= "[ $k ]=>(" ;
200 foreach ( $v as $k2 => $v2 )
202 $string .= "[ $k2 ]=> \" " . $v2 . '", ' ;
206 $string .= "[ $k ]=> \" " . $v . '", ' ;
212 if ( isset ( $this- > text
))
214 $string .= " text: (" . $this- > text
. ")" ;
217 $string .= " HDOM_INNER_INFO: '" ;
218 if ( isset ( $node- > _
[ HDOM_INFO_INNER
]))
220 $string .= $node- > _
[ HDOM_INFO_INNER
] . "'" ;
227 $string .= " children: " . count ( $this- > children
);
228 $string .= " nodes: " . count ( $this- > nodes
);
229 $string .= " tag_start: " . $this- > tag_start
;
/**
 * Return this node's parent, optionally re-parenting the node first.
 *
 * When $parent is given, the node is removed from the `nodes` and `children`
 * lists of its previous parent (if it had a different one) and is then
 * registered in both lists of the new parent. A node that is already listed
 * under the new parent is not added a second time.
 *
 * @param simple_html_dom_node|null $parent new parent node, or null to only read the current one
 * @return simple_html_dom_node|null the (possibly updated) parent of this node
 */
function parent($parent = null)
{
    if ($parent !== null) {
        $previous = $this->parent;

        // Detach from the old parent first, otherwise the node stays listed under both parents.
        if ($previous !== null && $previous !== $parent) {
            foreach (array('nodes', 'children') as $list) {
                if (!is_array($previous->$list)) {
                    continue;
                }
                $position = array_search($this, $previous->$list, true);
                if ($position !== false) {
                    unset($previous->{$list}[$position]);
                    // Reindex: siblings and children are addressed by numeric position elsewhere in this class.
                    $previous->$list = array_values($previous->$list);
                }
            }
        }

        $this->parent = $parent;

        // Register with the new parent, skipping lists that already contain this node.
        foreach (array('nodes', 'children') as $list) {
            if (!is_array($parent->$list) || !in_array($this, $parent->$list, true)) {
                $parent->{$list}[] = $this;
            }
        }
    }

    return $this->parent;
}
259 // verify that node has children
262 return ! empty ( $this- > children
);
265 // returns children of node
266 function children ( $idx =- 1 )
270 return $this- > children
;
272 if ( isset ( $this- > children
[ $idx ])) return $this- > children
[ $idx ];
276 // returns the first child of node
277 function first_child ()
279 if ( count ( $this- > children
)> 0 )
281 return $this- > children
[ 0 ];
286 // returns the last child of node
287 function last_child ()
289 if (( $count = count ( $this- > children
))> 0 )
291 return $this- > children
[ $count-1 ];
296 // returns the next sibling of node
297 function next_sibling ()
299 if ( $this- > parent
=== null )
305 $count = count ( $this- > parent
-> children
);
306 while ( $idx < $count && $this !== $this- > parent
-> children
[ $idx ])
314 return $this- > parent
-> children
[ $idx ];
317 // returns the previous sibling of node
318 function prev_sibling ()
320 if ( $this- > parent
=== null ) return null ;
322 $count = count ( $this- > parent
-> children
);
323 while ( $idx < $count && $this !== $this- > parent
-> children
[ $idx ])
325 if (-- $idx < 0 ) return null ;
326 return $this- > parent
-> children
[ $idx ];
329 // function to locate a specific ancestor tag in the path to the root.
330 function find_ancestor_tag ( $tag )
332 global $debug_object ;
333 if ( is_object ( $debug_object )) { $debug_object
-> debugLogEntry ( 1 ); }
335 // Start by including ourselves in the comparison.
338 while (! is_null ( $returnDom ))
340 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 2 , "Current tag is: " . $returnDom
-> tag
); }
342 if ( $returnDom- > tag
== $tag )
346 $returnDom = $returnDom- > parent
;
351 // get dom node's inner html
354 if ( isset ( $this- > _
[ HDOM_INFO_INNER
])) return $this- > _
[ HDOM_INFO_INNER
];
355 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
358 foreach ( $this- > nodes
as $n )
359 $ret .= $n- > outertext ();
363 // get dom node's outer text (with tag)
366 global $debug_object ;
367 if ( is_object ( $debug_object ))
370 if ( $this- > tag
== 'text' )
372 if (! empty ( $this- > text
))
374 $text = " with text: " . $this- > text
;
377 $debug_object- > debugLog ( 1 , 'Innertext of tag: ' . $this- > tag
. $text );
380 if ( $this- > tag
=== 'root' ) return $this- > innertext ();
383 if ( $this- > dom
&& $this- > dom
-> callback
!== null )
385 call_user_func_array ( $this- > dom
-> callback
, array ( $this ));
388 if ( isset ( $this- > _
[ HDOM_INFO_OUTER
])) return $this- > _
[ HDOM_INFO_OUTER
];
389 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
392 if ( $this- > dom
&& $this- > dom
-> nodes
[ $this- > _
[ HDOM_INFO_BEGIN
]])
394 $ret = $this- > dom
-> nodes
[ $this- > _
[ HDOM_INFO_BEGIN
]]-> makeup ();
400 if ( isset ( $this- > _
[ HDOM_INFO_INNER
]))
402 // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
403 if ( $this- > tag
!= "br" )
405 $ret .= $this- > _
[ HDOM_INFO_INNER
];
410 foreach ( $this- > nodes
as $n )
412 $ret .= $this- > convert_text ( $n- > outertext ());
418 if ( isset ( $this- > _
[ HDOM_INFO_END
]) && $this- > _
[ HDOM_INFO_END
]!= 0 )
419 $ret .= '</' . $this- > tag
. '>' ;
423 // get dom node's plain text
426 if ( isset ( $this- > _
[ HDOM_INFO_INNER
])) return $this- > _
[ HDOM_INFO_INNER
];
427 switch ( $this- > nodetype
)
429 case HDOM_TYPE_TEXT
: return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
430 case HDOM_TYPE_COMMENT
: return '' ;
431 case HDOM_TYPE_UNKNOWN
: return '' ;
433 if ( strcasecmp ( $this- > tag
, 'script' )=== 0 ) return '' ;
434 if ( strcasecmp ( $this- > tag
, 'style' )=== 0 ) return '' ;
437 // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
438 // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
439 // WHY is this happening?
440 if (! is_null ( $this- > nodes
))
442 foreach ( $this- > nodes
as $n )
444 $ret .= $this- > convert_text ( $n- > text ());
447 // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
448 if ( $this- > tag
== "span" )
450 $ret .= $this- > dom
-> default_span_text
;
460 $ret = $this- > innertext ();
461 $ret = str_ireplace ( '<![CDATA[' , '' , $ret );
462 $ret = str_replace ( ']]>' , '' , $ret );
466 // build node's text with tag
469 // text, comment, unknown
470 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
472 $ret = '<' . $this- > tag
;
475 foreach ( $this- > attr
as $key => $val )
479 // skip removed attribute
480 if ( $val === null || $val === false )
483 $ret .= $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 0 ];
484 //no value attr: nowrap, checked selected...
488 switch ( $this- > _
[ HDOM_INFO_QUOTE
][ $i ])
490 case HDOM_QUOTE_DOUBLE
: $quote = '"' ; break ;
491 case HDOM_QUOTE_SINGLE
: $quote = ' \' ' ; break ;
492 default : $quote = '' ;
494 $ret .= $key . $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 1 ]. '=' . $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 2 ]. $quote . $val . $quote ;
497 $ret = $this- > dom
-> restore_noise ( $ret );
498 return $ret . $this- > _
[ HDOM_INFO_ENDSPACE
] . '>' ;
501 // find elements by css selector
502 //PaperG - added ability for find to lowercase the value of the selector.
503 function find ( $selector , $idx = null , $lowercase = false )
505 $selectors = $this- > parse_selector ( $selector );
506 if (( $count = count ( $selectors ))=== 0 ) return array ();
507 $found_keys = array ();
509 // find each selector
510 for ( $c = 0 ; $c < $count ; ++
$c )
512 // The change on the below line was documented on the sourceforge code tracker id 2788009
513 // used to be: if (($levle=count($selectors[0]))===0) return array();
514 if (( $levle = count ( $selectors [ $c ]))=== 0 ) return array ();
515 if (! isset ( $this- > _
[ HDOM_INFO_BEGIN
])) return array ();
517 $head = array ( $this- > _
[ HDOM_INFO_BEGIN
]=> 1 );
519 // handle descendant selectors, no recursive!
520 for ( $l = 0 ; $l < $levle ; ++
$l )
523 foreach ( $head as $k => $v )
525 $n = ( $k ===- 1 ) ? $this- > dom
-> root
: $this- > dom
-> nodes
[ $k ];
526 //PaperG - Pass this optional parameter on to the seek function.
527 $n- > seek ( $selectors [ $c ][ $l ], $ret , $lowercase );
532 foreach ( $head as $k => $v )
534 if (! isset ( $found_keys [ $k ]))
543 foreach ( $found_keys as $k => $v )
544 $found [] = $this- > dom
-> nodes
[ $k ];
546 // return nth-element or array
547 if ( is_null ( $idx )) return $found ;
548 else if ( $idx < 0 ) $idx = count ( $found ) +
$idx ;
549 return ( isset ( $found [ $idx ])) ? $found [ $idx ] : null ;
552 // seek for given conditions
553 // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
554 protected function seek ( $selector , & $ret , $lowercase = false )
556 global $debug_object ;
557 if ( is_object ( $debug_object )) { $debug_object
-> debugLogEntry ( 1 ); }
559 list ( $tag , $key , $val , $exp , $no_key ) = $selector ;
562 if ( $tag && $key && is_numeric ( $key ))
565 foreach ( $this- > children
as $c )
567 if ( $tag === '*' || $tag === $c- > tag
) {
568 if ( ++
$count == $key ) {
569 $ret [ $c- > _
[ HDOM_INFO_BEGIN
]] = 1 ;
577 $end = (! empty ( $this- > _
[ HDOM_INFO_END
])) ? $this- > _
[ HDOM_INFO_END
] : 0 ;
579 $parent = $this- > parent
;
580 while (! isset ( $parent- > _
[ HDOM_INFO_END
]) && $parent !== null ) {
582 $parent = $parent- > parent
;
584 $end +
= $parent- > _
[ HDOM_INFO_END
];
587 for ( $i = $this- > _
[ HDOM_INFO_BEGIN
] +
1 ; $i < $end ; ++
$i ) {
588 $node = $this- > dom
-> nodes
[ $i ];
592 if ( $tag === '*' && ! $key ) {
593 if ( in_array ( $node , $this- > children
, true ))
599 if ( $tag && $tag != $node- > tag
&& $tag !== '*' ) { $pass
= false ;}
603 if ( isset ( $node- > attr
[ $key ])) $pass = false ;
605 if (( $key != "plaintext" ) && ! isset ( $node- > attr
[ $key ])) $pass = false ;
609 if ( $pass && $key && $val && $val !== '*' ) {
610 // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
611 if ( $key == "plaintext" ) {
612 // $node->plaintext actually returns $node->text();
613 $nodeKeyValue = $node- > text ();
615 // this is a normal search, we want the value of that attribute of the tag.
616 $nodeKeyValue = $node- > attr
[ $key ];
618 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 2 , "testing node: " . $node
-> tag
. " for attribute: " . $key
. $exp
. $val
. " where nodes value is: " . $nodeKeyValue
);}
620 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
622 $check = $this- > match ( $exp , strtolower ( $val ), strtolower ( $nodeKeyValue ));
624 $check = $this- > match ( $exp , $val , $nodeKeyValue );
626 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 2 , "after match: " . ( $check
? "true" : "false" ));}
628 // handle multiple class
629 if (! $check && strcasecmp ( $key , 'class' )=== 0 ) {
630 foreach ( explode ( ' ' , $node- > attr
[ $key ]) as $k ) {
631 // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
634 $check = $this- > match ( $exp , strtolower ( $val ), strtolower ( $k ));
636 $check = $this- > match ( $exp , $val , $k );
642 if (! $check ) $pass = false ;
644 if ( $pass ) $ret [ $i ] = 1 ;
647 // It's passed by reference so this is actually what this function returns.
648 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 1 , "EXIT - ret: " , $ret
);}
651 protected function match ( $exp , $pattern , $value ) {
652 global $debug_object ;
653 if ( is_object ( $debug_object )) { $debug_object
-> debugLogEntry ( 1 );}
657 return ( $value === $pattern );
659 return ( $value !== $pattern );
661 return preg_match ( "/^" . preg_quote ( $pattern , '/' ). "/" , $value );
663 return preg_match ( "/" . preg_quote ( $pattern , '/' ). "$/" , $value );
665 if ( $pattern [ 0 ]== '/' ) {
666 return preg_match ( $pattern , $value );
668 return preg_match ( "/" . $pattern . "/i" , $value );
673 protected function parse_selector ( $selector_string ) {
674 global $debug_object ;
675 if ( is_object ( $debug_object )) { $debug_object
-> debugLogEntry ( 1 );}
677 // pattern of CSS selectors, modified from mootools
678 // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
679 // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
680 // Notice the \[ starting the attribute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
681 // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
682 // Further study is required to determine if this should be documented or removed.
683 // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
684 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[ \" ']?(.*?)[ \" ']?)?\])?([\/, ]+)/is" ;
685 preg_match_all ( $pattern , trim ( $selector_string ). ' ' , $matches , PREG_SET_ORDER
);
686 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 2 , "Matches Array: " , $matches
);}
688 $selectors = array ();
692 foreach ( $matches as $m ) {
694 if ( $m [ 0 ]=== '' || $m [ 0 ]=== '/' || $m [ 0 ]=== '//' ) continue ;
695 // for browser generated xpath
696 if ( $m [ 1 ]=== 'tbody' ) continue ;
698 list ( $tag , $key , $val , $exp , $no_key ) = array ( $m [ 1 ], null , null , '=' , false );
699 if (! empty ( $m [ 2 ])) { $key
= 'id' ; $val
= $m
[ 2 ];}
700 if (! empty ( $m [ 3 ])) { $key
= 'class' ; $val
= $m
[ 3 ];}
701 if (! empty ( $m [ 4 ])) { $key
= $m
[ 4 ];}
702 if (! empty ( $m [ 5 ])) { $exp
= $m
[ 5 ];}
703 if (! empty ( $m [ 6 ])) { $val
= $m
[ 6 ];}
705 // convert to lowercase
706 if ( $this- > dom
-> lowercase
) { $tag
= strtolower ( $tag
); $key
= strtolower ( $key
);}
707 //elements that do NOT have the specified attribute
708 if ( isset ( $key [ 0 ]) && $key [ 0 ]=== '!' ) { $key
= substr ( $key
, 1 ); $no_key
= true ;}
710 $result [] = array ( $tag , $key , $val , $exp , $no_key );
711 if ( trim ( $m [ 7 ])=== ',' ) {
712 $selectors [] = $result ;
716 if ( count ( $result )> 0 )
717 $selectors [] = $result ;
721 function __get ( $name ) {
722 if ( isset ( $this- > attr
[ $name ]))
724 return $this- > convert_text ( $this- > attr
[ $name ]);
727 case 'outertext' : return $this- > outertext ();
728 case 'innertext' : return $this- > innertext ();
729 case 'plaintext' : return $this- > text ();
730 case 'xmltext' : return $this- > xmltext ();
731 default : return array_key_exists ( $name , $this- > attr
);
735 function __set ( $name , $value ) {
737 case 'outertext' : return $this- > _
[ HDOM_INFO_OUTER
] = $value ;
739 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > _
[ HDOM_INFO_TEXT
] = $value ;
740 return $this- > _
[ HDOM_INFO_INNER
] = $value ;
742 if (! isset ( $this- > attr
[ $name ])) {
743 $this- > _
[ HDOM_INFO_SPACE
][] = array ( ' ' , '' , '' );
744 $this- > _
[ HDOM_INFO_QUOTE
][] = HDOM_QUOTE_DOUBLE
;
746 $this- > attr
[ $name ] = $value ;
749 function __isset ( $name ) {
751 case 'outertext' : return true ;
752 case 'innertext' : return true ;
753 case 'plaintext' : return true ;
755 //no value attr: nowrap, checked selected...
756 return ( array_key_exists ( $name , $this- > attr
)) ? true : isset ( $this- > attr
[ $name ]);
/**
 * Magic unset: unset($node->name) removes the attribute called $name.
 *
 * Nothing happens when isset() reports the attribute as not set, which
 * covers both a missing key and a key whose value is null.
 *
 * @param string $name attribute name
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
764 // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
765 function convert_text ( $text )
767 global $debug_object ;
768 if ( is_object ( $debug_object )) { $debug_object
-> debugLogEntry ( 1 );}
770 $converted_text = $text ;
777 $sourceCharset = strtoupper ( $this- > dom
-> _charset
);
778 $targetCharset = strtoupper ( $this- > dom
-> _target_charset
);
780 if ( is_object ( $debug_object )) { $debug_object
-> debugLog ( 3 , "source charset: " . $sourceCharset
. " target charaset: " . $targetCharset
);}
782 if (! empty ( $sourceCharset ) && ! empty ( $targetCharset ) && ( strcasecmp ( $sourceCharset , $targetCharset ) != 0 ))
784 // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
785 if (( strcasecmp ( $targetCharset , 'UTF-8' ) == 0 ) && ( $this- > is_utf8 ( $text )))
787 $converted_text = $text ;
791 $converted_text = iconv ( $sourceCharset , $targetCharset , $text );
795 // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
796 if ( $targetCharset == 'UTF-8' )
798 if ( substr ( $converted_text , 0 , 3 ) == " \xef\xbb\xbf " )
800 $converted_text = substr ( $converted_text , 3 );
802 if ( substr ( $converted_text , - 3 ) == " \xef\xbb\xbf " )
804 $converted_text = substr ( $converted_text , 0 , - 3 );
808 return $converted_text ;
812 * Returns true if $string is valid UTF-8 and false otherwise.
814 * @param mixed $str String to be tested
817 static function is_utf8 ( $str )
822 for ( $i = 0 ; $i < $len ; $i ++
)
827 if (( $c >= 254 )) return false ;
828 elseif ( $c >= 252 ) $bits = 6 ;
829 elseif ( $c >= 248 ) $bits = 5 ;
830 elseif ( $c >= 240 ) $bits = 4 ;
831 elseif ( $c >= 224 ) $bits = 3 ;
832 elseif ( $c >= 192 ) $bits = 2 ;
834 if (( $i +
$bits ) > $len ) return false ;
839 if ( $b < 128 || $b > 191 ) return false ;
847 function is_utf8($string)
850 return (utf8_encode(utf8_decode($string)) == $string);
855 * Function to try a few tricks to determine the displayed size of an img on the page.
856 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
858 * @author John Schlick
859 * @version April 19 2012
860 * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
862 function get_display_size ()
864 global $debug_object ;
869 if ( $this- > tag
!== 'img' )
874 // See if there is a height or width attribute in the tag itself.
875 if ( isset ( $this- > attr
[ 'width' ]))
877 $width = $this- > attr
[ 'width' ];
880 if ( isset ( $this- > attr
[ 'height' ]))
882 $height = $this- > attr
[ 'height' ];
885 // Now look for an inline style.
886 if ( isset ( $this- > attr
[ 'style' ]))
888 // Thanks to user gnarf from stackoverflow for this regular expression.
889 $attributes = array ();
890 preg_match_all ( "/([\w-]+)\s*:\s*([^;]+)\s*;?/" , $this- > attr
[ 'style' ], $matches , PREG_SET_ORDER
);
891 foreach ( $matches as $match ) {
892 $attributes [ $match [ 1 ]] = $match [ 2 ];
895 // If there is a width in the style attributes:
896 if ( isset ( $attributes [ 'width' ]) && $width == - 1 )
898 // check that the last two characters are px (pixels)
899 if ( strtolower ( substr ( $attributes [ 'width' ], - 2 )) == 'px' )
901 $proposed_width = substr ( $attributes [ 'width' ], 0 , - 2 );
902 // Now make sure that it's an integer and not something stupid.
903 if ( filter_var ( $proposed_width , FILTER_VALIDATE_INT
))
905 $width = $proposed_width ;
910 // If there is a height in the style attributes:
911 if ( isset ( $attributes [ 'height' ]) && $height == - 1 )
913 // check that the last two characters are px (pixels)
914 if ( strtolower ( substr ( $attributes [ 'height' ], - 2 )) == 'px' )
916 $proposed_height = substr ( $attributes [ 'height' ], 0 , - 2 );
917 // Now make sure that it's an integer and not something stupid.
918 if ( filter_var ( $proposed_height , FILTER_VALIDATE_INT
))
920 $height = $proposed_height ;
927 // Future enhancement:
928 // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
930 // Far future enhancement
931 // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
932 // Note that in this case, the class or id will have the img subselector for it to apply to the image.
934 // ridiculously far future development
935 // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
937 $result = array ( 'height' => $height ,
// camelCase aliases: each method below only forwards to the snake_case or
// magic method named in its comment.

/** @return array the node's attribute array ($this->attr) */
function getAllAttributes()
{
    return $this->attr;
}

/** Forwards to __get($name). */
function getAttribute($name)
{
    return $this->__get($name);
}

/** Forwards to __set($name, $value); returns nothing. */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Forwards to __isset($name). */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Marks the attribute as removed by forwarding to __set($name, null); returns nothing. */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/** First match (index 0) of the "#<id>" selector via find(). */
function getElementById($id)
{
    return $this->find("#$id", 0);
}

/** Matches of the "#<id>" selector via find(); $idx is passed through unchanged. */
function getElementsById($id, $idx = null)
{
    return $this->find("#$id", $idx);
}

/** First match (index 0) of the tag selector $name via find(). */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Matches of the tag selector $name via find(); $idx is passed through unchanged. */
function getElementsByTagName($name, $idx = null)
{
    return $this->find($name, $idx);
}

/** Forwards to parent() without an argument, i.e. read-only. */
function parentNode()
{
    return $this->parent();
}

/** Forwards to children($idx). */
function childNodes($idx = -1)
{
    return $this->children($idx);
}

/** Forwards to first_child(). */
function firstChild()
{
    return $this->first_child();
}

/** Forwards to last_child(). */
function lastChild()
{
    return $this->last_child();
}

/** Forwards to next_sibling(). */
function nextSibling()
{
    return $this->next_sibling();
}

/** Forwards to prev_sibling(). */
function previousSibling()
{
    return $this->prev_sibling();
}

/** Forwards to has_child(). */
function hasChildNodes()
{
    return $this->has_child();
}

/** @return string the node's tag name ($this->tag) */
function nodeName()
{
    return $this->tag;
}

/** Re-parents $node under this node through $node->parent($this) and returns $node. */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
965 * simple html dom parser
966 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
967 * Paperg - change $size from protected to public so we can easily access it
968 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
970 * @package PlaceLocalInclude
972 class simple_html_dom
975 public $nodes = array ();
976 public $callback = null ;
977 public $lowercase = false ;
978 // Used to keep track of how large the text was when we started.
979 public $original_size ;
986 protected $noise = array ();
987 protected $token_blank = " \t\r\n " ;
988 protected $token_equal = ' =/>' ;
989 protected $token_slash = " /> \r\n\t " ;
990 protected $token_attr = ' >' ;
991 // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
992 public $_charset = '' ;
993 public $_target_charset = '' ;
994 protected $default_br_text = "" ;
995 public $default_span_text = "" ;
997 // use isset instead of in_array, performance boost about 30%...
998 protected $self_closing_tags = array ( 'img' => 1 , 'br' => 1 , 'input' => 1 , 'meta' => 1 , 'link' => 1 , 'hr' => 1 , 'base' => 1 , 'embed' => 1 , 'spacer' => 1 );
999 protected $block_tags = array ( 'root' => 1 , 'body' => 1 , 'form' => 1 , 'div' => 1 , 'span' => 1 , 'table' => 1 );
1000 // Known sourceforge issue #2977341
1001 // B tags that are not closed cause us to return everything to the end of the document.
1002 protected $optional_closing_tags = array (
1003 'tr' => array ( 'tr' => 1 , 'td' => 1 , 'th' => 1 ),
1004 'th' => array ( 'th' => 1 ),
1005 'td' => array ( 'td' => 1 ),
1006 'li' => array ( 'li' => 1 ),
1007 'dt' => array ( 'dt' => 1 , 'dd' => 1 ),
1008 'dd' => array ( 'dd' => 1 , 'dt' => 1 ),
1009 'dl' => array ( 'dd' => 1 , 'dt' => 1 ),
1011 'nobr' => array ( 'nobr' => 1 ),
1013 'option' => array ( 'option' => 1 ),
/**
 * Build a DOM container and optionally load markup into it right away.
 *
 * A non-empty $str that starts with "http://" or names an existing file is
 * handed to load_file(); any other non-empty $str is parsed as markup via
 * load(). Nothing is loaded when $str is empty.
 *
 * @param string|null $str             markup, file path or http:// URL; empty to load later
 * @param bool        $lowercase       passed through to load()
 * @param bool        $forceTagsClosed false means the caller trusts the markup: the
 *                                     optional-closing-tag table is emptied
 * @param string      $target_charset  stored in $_target_charset, which convert_text() reads
 * @param bool        $stripRN         passed through to load()
 * @param string      $defaultBRText   passed through to load()
 * @param string      $defaultSpanText passed through to load()
 */
function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        // Fixed: this used to assign to an undeclared "optional_closing_array" property,
        // so the declared $optional_closing_tags table was never cleared.
        // NOTE(review): this runs after any markup passed in $str has already been loaded;
        // whether that is too late for constructor-supplied input depends on parse(),
        // which is not visible here - confirm.
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;
}
1036 function __destruct ()
1041 // load html from string
1042 function load ( $str , $lowercase = true , $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT
, $defaultSpanText = DEFAULT_SPAN_TEXT
)
1044 global $debug_object ;
1047 $this- > prepare ( $str , $lowercase , $stripRN , $defaultBRText , $defaultSpanText );
1048 // strip out comments
1049 $this- > remove_noise ( "'<!--(.*?)-->'is" );
1051 $this- > remove_noise ( "'<!\[CDATA\[(.*?)\]\]>'is" , true );
1052 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1053 // Script tags removal now precedes style tag removal.
1054 // strip out <script> tags
1055 $this- > remove_noise ( "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is" );
1056 $this- > remove_noise ( "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is" );
1057 // strip out <style> tags
1058 $this- > remove_noise ( "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is" );
1059 $this- > remove_noise ( "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is" );
1060 // strip out preformatted tags
1061 $this- > remove_noise ( "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is" );
1062 // strip out server side scripts
1063 $this- > remove_noise ( "'(< \? )(.*?)( \? >)'s" , true );
1064 // strip smarty scripts
1065 $this- > remove_noise ( "'(\ {\w)(.*?)(\} )'s" , true );
1068 while ( $this- > parse ());
1070 $this- > root
-> _
[ HDOM_INFO_END
] = $this- > cursor
;
1071 $this- > parse_charset ();
1073 // make load function chainable
// load html from file
//
// Accepts the same arguments as file_get_contents() and parses the result.
// Returns false (after clearing the DOM) when the contents cannot be read;
// returns nothing on success.
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Check the read result directly. error_get_last() would also report
    // unrelated errors raised earlier in the request and wipe a valid DOM.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
// set callback function
// Stores the callable name in $this->callback.
function set_callback($function_name)
{
    $this->callback = $function_name;
}
// remove callback function
function remove_callback()
{
    $this->callback = null;
}
// save dom as string
// Serializes the root node; when $filepath is non-empty the markup is also
// written there under an exclusive lock. Returns the serialized markup.
function save($filepath='')
{
    $ret = $this->root->innertext();
    if ($filepath!=='') file_put_contents($filepath, $ret, LOCK_EX);
    return $ret;
}
// find dom node by css selector
// Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
// Delegates to the root node's find() with the same arguments.
function find($selector, $idx=null, $lowercase=false)
{
    return $this->root->find($selector, $idx, $lowercase);
}
// clean up memory due to php5 circular references memory leak...
// Clears every node, then drops the references this object holds to its
// parent, root, document text and noise table.
function clear()
{
    foreach ($this->nodes as $n) {$n->clear(); $n = null;}
    // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) foreach ($this->children as $n) {$n->clear(); $n = null;}
    if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
    if (isset($this->root)) {$this->root->clear(); unset($this->root);}
    unset($this->doc);
    unset($this->noise);
}
// Delegates to the root node's dump(), passing $show_attr through.
function dump($show_attr=true)
{
    $this->root->dump($show_attr);
}
// prepare HTML data and init everything
//
// Resets all parser state for a new document: clears the previous DOM,
// records sizes, optionally flattens CR/LF to spaces, and creates the root node.
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // set the length of content before we do anything to it.
    $this->size = strlen($str);
    // Save the original size of the html that we got in.  It might be useful to someone.
    $this->original_size = $this->size;

    //before we save the string as the doc...  strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace("\r", " ", $str);
        $str = str_replace("\n", " ", $str);

        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;
    $this->root = new simple_html_dom_node($this);
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;
    // Reset the current character explicitly so an empty document does not
    // inherit a stale value from a previous load.
    $this->char = ($this->size>0) ? $this->doc[0] : null;
}
// parse html content
// Consumes either one run of text up to the next '<' (stored as a text node)
// or, when no text precedes it, one tag via read_tag().
// Returns true while there is more to parse.
protected function parse()
{
    if (($s = $this->copy_until_char('<'))==='')
    {
        return $this->read_tag();
    }

    // text
    $node = new simple_html_dom_node($this);
    ++$this->cursor;
    $node->_[HDOM_INFO_TEXT] = $s;
    $this->link_nodes($node, false);
    return true;
}
// PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.
// NOTE:  IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec
// (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.
//
// Lookup order: content-type header hook, <meta http-equiv=Content-Type>,
// mb_detect_encoding(), then a UTF-8 fallback. ISO-8859-1/Latin1 results are
// widened to CP1252. Stores the result in $this->_charset and returns it.
protected function parse_charset()
{
    global $debug_object;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type'))
    {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // The hook may return null/false when no transfer happened.
        $success = is_string($contentTypeHeader) && preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
        if ($success)
        {
            $charset = $matches[1];
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset))
    {
        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($el))
        {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue))
            {
                $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
                if ($success)
                {
                    $charset = $matches[1];
                }
                else
                {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset))
    {
        // Have php try to detect the encoding from the text given to us.
        // Without the mbstring extension, treat detection as failed.
        $charset = function_exists('mb_detect_encoding')
            ? mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) )
            : false;
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'mb_detect found: ' . $charset);}

        // and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false)
        {
            if (is_object($debug_object)) {$debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1')))
    {
        if (is_object($debug_object)) {$debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debugLog(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
// read tag info
//
// Parses one tag starting at the current '<'. Handles, in order: end tags
// (closing the matching ancestor or degrading to a text node), doctype /
// comment nodes, stray '<' text, malformed tag names, and regular begin tags
// with their attributes. Returns false when the current character is not '<'
// (end of input), true otherwise.
protected function read_tag()
{
    if ($this->char!=='<')
    {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/')
    {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower)
        {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // walk up looking for the element this end tag closes
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower]))
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower)
                {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower)
            {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: a second '<' inside the tag name means the first one was literal text
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // The hyphen is placed last in the class: "[\w-:]" is rejected as an
    // invalid range by PCRE2 (PHP >= 7.3), which made every tag fall into
    // this text branch.
    if (!preg_match("/^[\w:-]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower]) )
    {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)]))
        {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do
    {
        if ($this->char!==null && $space[0]==='')
        {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard===$this->pos)
        {
            // no progress since the last iteration: step one character forward
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while ($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/')
    {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else
    {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br")
    {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
// parse attributes
//
// Reads the value of attribute $name at the current position and stores it on
// $node together with its quote style. $space[2] receives the whitespace found
// between '=' and the value.
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    if (isset($node->attr[$name]))
    {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);
    switch ($this->char) {
        case '"':
            $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'':
            $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default:
            $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
            $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }
    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $node->attr[$name] = str_replace("\r", "", $node->attr[$name]);
    $node->attr[$name] = str_replace("\n", "", $node->attr[$name]);
    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
// link node's parent
// Attaches $node to the current parent's node list and, when $is_child is
// true, to its children list as well.
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;
    if ($is_child)
    {
        $this->parent->children[] = $node;
    }
}
// as a text node
// Records an unmatched end tag as literal text "</tag>", advances past the
// closing '>' and returns true.
protected function as_text_node($tag)
{
    $node = new simple_html_dom_node($this);
    ++$this->cursor;
    $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($node, false);
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    return true;
}
// Advances the read position past any run of characters from $chars and
// refreshes the current character (null at end of input).
protected function skip($chars)
{
    $this->pos += strspn($this->doc, $chars, $this->pos);
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
}
// Like skip(), but returns the run of skipped characters ('' when none).
protected function copy_skip($chars)
{
    $pos = $this->pos;
    $len = strspn($this->doc, $chars, $pos);
    $this->pos += $len;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    if ($len===0) return '';
    return substr($this->doc, $pos, $len);
}
// Returns the text from the current position up to (not including) the first
// character contained in $chars, and advances the position to that character.
protected function copy_until($chars)
{
    $pos = $this->pos;
    $len = strcspn($this->doc, $chars, $pos);
    $this->pos += $len;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    return substr($this->doc, $pos, $len);
}
// Returns the text from the current position up to (not including) the next
// occurrence of $char and leaves the position on that character. When $char
// does not occur again, returns the remainder and moves to end of input.
protected function copy_until_char($char)
{
    if ($this->char===null) return '';

    if (($pos = strpos($this->doc, $char, $this->pos))===false) {
        $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
        $this->char = null;
        $this->pos = $this->size;
        return $ret;
    }

    if ($pos===$this->pos) return '';
    $pos_old = $this->pos;
    $this->char = $this->doc[$pos];
    $this->pos = $pos;
    return substr($this->doc, $pos_old, $pos-$pos_old);
}
// Same contract as copy_until_char(), except that an occurrence of $char
// immediately preceded by a backslash is skipped.
protected function copy_until_char_escape($char)
{
    if ($this->char===null) return '';

    $start = $this->pos;
    while (1)
    {
        if (($pos = strpos($this->doc, $char, $start))===false)
        {
            $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos===$this->pos) return '';

        if ($this->doc[$pos-1]==='\\') {
            // escaped delimiter: keep searching after it
            $start = $pos+1;
            continue;
        }

        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos-$pos_old);
    }
}
// remove noise from html content
// save the noise in the $this->noise array.
//
// Every match of $pattern is replaced in the document by a 16-character key
// ("___noise___" plus a 5-character counter). With $remove_tag the whole match
// is stored and replaced; otherwise only capture group 1 is.
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // Replace from the last match backwards so earlier offsets stay valid.
    for ($i=$count-1; $i>-1; --$i)
    {
        $key = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debug_object)) { $debug_object->debugLog(2, 'key is: ' . $key); }
        $idx = ($remove_tag) ? 0 : 1;
        $this->noise[$key] = $matches[$i][$idx][0];
        $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size>0)
    {
        $this->char = $this->doc[0];
    }
}
// restore noise to html content
//
// Replaces every "___noise___NNNNN" key in $text with the content stored for
// it in $this->noise and returns the result. Unknown or truncated keys are
// replaced by a diagnostic string.
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    // Search offset. Diagnostic replacements are skipped over, because the
    // "UNDEFINED NOISE" text embeds the key (and thus the marker) again and
    // would otherwise be re-matched forever.
    $offset = 0;
    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
            if (is_object($debug_object)) { $debug_object->debugLog(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
                // Rescan from here: restored content may contain further keys.
                $offset = $pos;
            }
            else
            {
                $replacement = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$replacement.substr($text, $pos+16);
                $offset = $pos + strlen($replacement);
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $replacement = 'NO NUMERIC NOISE KEY';
            $text = substr($text, 0, $pos).$replacement.substr($text, $pos+11);
            $offset = $pos + strlen($replacement);
        }
    }
    return $text;
}
// Sometimes we NEED one of the noise elements.
// Returns the first stored noise string that contains $text; returns nothing
// (null) when no entry matches.
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debugLogEntry(1); }

    foreach($this->noise as $noiseElement)
    {
        if (strpos($noiseElement, $text)!==false)
        {
            return $noiseElement;
        }
    }
}
// String conversion yields the root node's inner markup.
function __toString()
{
    return $this->root->innertext();
}
// Magic read access for outertext, innertext, plaintext, charset and
// target_charset. Any other name yields null.
function __get($name)
{
    switch ($name)
    {
        case 'outertext':
            return $this->root->innertext();
        case 'innertext':
            return $this->root->innertext();
        case 'plaintext':
            return $this->root->text();
        case 'charset':
            return $this->_charset;
        case 'target_charset':
            return $this->_target_charset;
    }
}
// camel naming conventions
// Delegates to the root node's childNodes().
function childNodes($idx=-1) {return $this->root->childNodes($idx);}
// Delegates to the root node's first_child().
function firstChild() {return $this->root->first_child();}
// Delegates to the root node's last_child().
function lastChild() {return $this->root->last_child();}
// Builds "<name>value</name>", parses it as a separate document and returns
// that document's first child element.
function createElement($name, $value=null) {return @str_get_html("<$name>$value</$name>")->first_child();}
// Parses $value as a separate document and returns its last node.
// Returns null when the value cannot be parsed into a document.
function createTextNode($value)
{
    $dom = str_get_html($value);
    if (!is_object($dom)) return null;
    // end() takes its argument by reference, so it needs a real variable.
    $nodes = $dom->nodes;
    return end($nodes);
}
// Returns the first element matching the selector "#$id".
function getElementById($id) {return $this->find("#$id", 0);}
// Returns the find() result for the selector "#$id"; $idx is passed through.
function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
// Returns the first element matching the selector $name.
function getElementByTagName($name) {return $this->find($name, 0);}
// Returns the find() result for the selector $name; $idx is passed through.
// NOTE(review): the default of -1 is forwarded to find() as-is, unlike
// getElementsById() which defaults to null - confirm which result callers expect.
function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
// camelCase alias of load_file(); accepts the same arguments.
function loadFile()
{
    $args = func_get_args();
    // Spread the arguments. Passing $args as one argument would hand
    // file_get_contents() an array instead of a path.
    return call_user_func_array(array($this, 'load_file'), $args);
}