]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/simple_html_dom.php
3 * Website: http://sourceforge.net/projects/simplehtmldom/
4 * Additional projects that may be used: http://sourceforge.net/projects/debugobject/
5 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
7 * Yousuke Kumakura (Attribute filters)
8 * Vadim Voituk (Negative indexes supports of "find" method)
9 * Antcs (Constructor with automatically load contents either text or file/url)
11 * all affected sections have comments starting with "PaperG"
13 * Paperg - Added case insensitive testing of the value of the selector.
14 * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.
15 * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noise calls so it will not reflect the REAL position of the tag in the source,
16 * it will almost always be smaller by some amount.
17 * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate as the $dom->size is the "real" number of bytes the dom was created from.
18 * but for most purposes, it's a really good estimation.
19 * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.
20 * Allow the user to tell us how much they trust the html.
21 * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node.
22 * This allows for us to find tags based on the text they contain.
23 * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.
24 * Paperg: added parse_charset so that we know about the character set of the source document.
25 * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the
26 * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.
28 * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that.
29 * PaperG (John Schlick) Added get_display_size for "IMG" tags.
31 * Licensed under The MIT License
32 * Redistributions of files must retain the above copyright notice.
34 * @author S.C. Chen <me578022@gmail.com>
35 * @author John Schlick
37 * @version 1.5 ($Rev: 210 $)
38 * @package PlaceLocalInclude
39 * @subpackage simple_html_dom
43 * All of the Defines for the classes below.
44 * @author S.C. Chen <me578022@gmail.com>
// Node kinds, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quoting style recorded per attribute under the HDOM_INFO_QUOTE slot.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults used by the parameter lists of file_get_html(), str_get_html() and the dom class.
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', " \r\n ");
define('DEFAULT_SPAN_TEXT', " ");

// Inputs longer than this many bytes are rejected by file_get_html() and str_get_html().
define('MAX_FILE_SIZE', 600000);
68 // -----------------------------------------------------------------------------
69 // get html dom from file
70 // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.
/**
 * Build a simple_html_dom from the contents of a file or URL.
 *
 * @param string        $url              Path or URL handed to file_get_contents().
 * @param bool          $use_include_path Passed through to file_get_contents().
 * @param resource|null $context          Stream context passed through to file_get_contents().
 * @param int           $offset           Read offset. The legacy default -1 means "start of stream".
 * @param int           $maxLen           Maximum number of bytes to read; a negative value means no limit.
 * @param bool          $lowercase        Forwarded to the dom constructor and to load().
 * @param bool          $forceTagsClosed  Forwarded to the dom constructor.
 * @param string        $target_charset   Forwarded to the dom constructor.
 * @param bool          $stripRN          Forwarded to the dom constructor and to load().
 * @param string        $defaultBRText    Forwarded to the dom constructor and to load().
 * @param string        $defaultSpanText  Forwarded to the dom constructor and to load().
 *
 * @return simple_html_dom|false The loaded dom, or false when nothing could be read or the
 *                               contents are longer than MAX_FILE_SIZE bytes.
 */
function file_get_html($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Since PHP 7.1 a negative offset makes file_get_contents() seek from the END of the
    // stream (and fail on non-seekable streams such as http://), so the historical -1
    // "no offset" sentinel has to be translated to 0 before it is passed on.
    if ((int) $offset === -1) {
        $offset = 0;
    }

    // Only forward a length when the caller asked for a real limit; a negative length is
    // rejected by file_get_contents() on PHP 8.
    if ($maxLen !== null && $maxLen >= 0) {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    } else {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    // A failed read (false), an empty document and an oversized document are all refused.
    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) {
        return false;
    }

    // We DO force the tags to be terminated unless the caller opted out via $forceTagsClosed.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    // The second parameter can force the selectors to all be lowercase. The BR/span
    // defaults are passed too, mirroring the load() call made by the dom constructor.
    $dom->load($contents, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    return $dom;
}
88 // get html dom from string
89 function str_get_html ( $str , $lowercase = true , $forceTagsClosed = true , $target_charset = DEFAULT_TARGET_CHARSET
, $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT
, $defaultSpanText = DEFAULT_SPAN_TEXT
)
91 $dom = new simple_html_dom ( null , $lowercase , $forceTagsClosed , $target_charset , $stripRN , $defaultBRText , $defaultSpanText );
92 if ( empty ( $str ) || strlen ( $str ) > MAX_FILE_SIZE
)
97 $dom- > load ( $str , $lowercase , $stripRN );
101 // dump html dom tree
102 function dump_html_tree ( $node , $show_attr = true , $deep = 0 )
109 * simple html dom node
110 * PaperG - added ability for "find" routine to lowercase the value of the selector.
111 * PaperG - added $tag_start to track the start position of the tag in the total byte index
113 * @package PlaceLocalInclude
115 class simple_html_dom_node
117 public $nodetype = HDOM_TYPE_TEXT
;
118 public $tag = 'text' ;
119 public $attr = array ();
120 public $children = array ();
121 public $nodes = array ();
122 public $parent = null ;
123 // The "info" array - see HDOM_INFO_... for what each element contains.
125 public $tag_start = 0 ;
128 function __construct ( $dom )
131 $dom- > nodes
[] = $this ;
134 function __destruct ()
/**
 * String conversion of a node: yields the same text as outertext().
 *
 * @return string
 */
function __toString()
{
    $markup = $this->outertext();

    return $markup;
}
144 // clean up memory due to php5 circular references memory leak...
149 $this- > parent
= null ;
150 $this- > children
= null ;
154 function dump ( $show_attr = true , $deep = 0 )
156 $lead = str_repeat ( ' ' , $deep );
158 echo $lead . $this- > tag
;
159 if ( $show_attr && count ( $this- > attr
)> 0 )
162 foreach ( $this- > attr
as $k => $v )
163 echo "[ $k ]=> \" " . $this- > $k . '", ' ;
170 foreach ( $this- > nodes
as $c )
172 $c- > dump ( $show_attr , $deep +
1 );
178 // Debugging function to dump a single dom node with a bunch of information about it.
179 function dump_node ( $echo = true )
182 $string = $this- > tag
;
183 if ( count ( $this- > attr
)> 0 )
186 foreach ( $this- > attr
as $k => $v )
188 $string .= "[ $k ]=> \" " . $this- > $k . '", ' ;
192 if ( count ( $this- > _
)> 0 )
195 foreach ( $this- > _
as $k => $v )
199 $string .= "[ $k ]=>(" ;
200 foreach ( $v as $k2 => $v2 )
202 $string .= "[ $k2 ]=> \" " . $v2 . '", ' ;
206 $string .= "[ $k ]=> \" " . $v . '", ' ;
212 if ( isset ( $this- > text
))
214 $string .= " text: (" . $this- > text
. ")" ;
217 $string .= " HDOM_INNER_INFO: '" ;
218 if ( isset ( $node- > _
[ HDOM_INFO_INNER
]))
220 $string .= $node- > _
[ HDOM_INFO_INNER
] . "'" ;
227 $string .= " children: " . count ( $this- > children
);
228 $string .= " nodes: " . count ( $this- > nodes
);
229 $string .= " tag_start: " . $this- > tag_start
;
/**
 * Return the parent of this node, optionally re-parenting the node first.
 *
 * When $parent is given, this node is removed from the nodes/children lists of its
 * current parent (if any), then appended to both lists of the new parent.
 *
 * @param simple_html_dom_node|null $parent New parent, or null to only read the current one.
 *
 * @return simple_html_dom_node|null The (possibly new) parent of this node.
 */
function parent($parent = null)
{
    if ($parent !== null) {
        // Detach from the previous parent first; otherwise the node stays listed under
        // both the old and the new parent.
        $previous = $this->parent;
        if ($previous !== null) {
            foreach (array('nodes', 'children') as $list) {
                if (!is_array($previous->$list)) {
                    continue;
                }
                $position = array_search($this, $previous->$list, true);
                if ($position !== false) {
                    unset($previous->{$list}[$position]);
                    // Re-index: the sibling helpers walk children by consecutive integer position.
                    $previous->$list = array_values($previous->$list);
                }
            }
        }

        $this->parent = $parent;
        $this->parent->nodes[] = $this;
        $this->parent->children[] = $this;
    }

    return $this->parent;
}
259 // verify that node has children
262 return ! empty ( $this- > children
);
265 // returns children of node
266 function children ( $idx =- 1 )
270 return $this- > children
;
272 if ( isset ( $this- > children
[ $idx ]))
274 return $this- > children
[ $idx ];
279 // returns the first child of node
280 function first_child ()
282 if ( count ( $this- > children
)> 0 )
284 return $this- > children
[ 0 ];
289 // returns the last child of node
290 function last_child ()
292 if (( $count = count ( $this- > children
))> 0 )
294 return $this- > children
[ $count-1 ];
299 // returns the next sibling of node
300 function next_sibling ()
302 if ( $this- > parent
=== null )
308 $count = count ( $this- > parent
-> children
);
309 while ( $idx < $count && $this !== $this- > parent
-> children
[ $idx ])
317 return $this- > parent
-> children
[ $idx ];
320 // returns the previous sibling of node
321 function prev_sibling ()
323 if ( $this- > parent
=== null ) return null ;
325 $count = count ( $this- > parent
-> children
);
326 while ( $idx < $count && $this !== $this- > parent
-> children
[ $idx ])
328 if (-- $idx < 0 ) return null ;
329 return $this- > parent
-> children
[ $idx ];
332 // function to locate a specific ancestor tag in the path to the root.
333 function find_ancestor_tag ( $tag )
335 global $debug_object ;
336 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 ); }
338 // Start by including ourselves in the comparison.
341 while (! is_null ( $returnDom ))
343 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 2 , "Current tag is: " . $returnDom
-> tag
); }
345 if ( $returnDom- > tag
== $tag )
349 $returnDom = $returnDom- > parent
;
354 // get dom node's inner html
357 if ( isset ( $this- > _
[ HDOM_INFO_INNER
])) return $this- > _
[ HDOM_INFO_INNER
];
358 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
361 foreach ( $this- > nodes
as $n )
362 $ret .= $n- > outertext ();
366 // get dom node's outer text (with tag)
369 global $debug_object ;
370 if ( is_object ( $debug_object ))
373 if ( $this- > tag
== 'text' )
375 if (! empty ( $this- > text
))
377 $text = " with text: " . $this- > text
;
380 $debug_object- > debug_log ( 1 , 'Innertext of tag: ' . $this- > tag
. $text );
383 if ( $this- > tag
=== 'root' ) return $this- > innertext ();
386 if ( $this- > dom
&& $this- > dom
-> callback
!== null )
388 call_user_func_array ( $this- > dom
-> callback
, array ( $this ));
391 if ( isset ( $this- > _
[ HDOM_INFO_OUTER
])) return $this- > _
[ HDOM_INFO_OUTER
];
392 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
395 if ( $this- > dom
&& $this- > dom
-> nodes
[ $this- > _
[ HDOM_INFO_BEGIN
]])
397 $ret = $this- > dom
-> nodes
[ $this- > _
[ HDOM_INFO_BEGIN
]]-> makeup ();
403 if ( isset ( $this- > _
[ HDOM_INFO_INNER
]))
405 // If it's a br tag... don't return the HDOM_INNER_INFO that we may or may not have added.
406 if ( $this- > tag
!= "br" )
408 $ret .= $this- > _
[ HDOM_INFO_INNER
];
413 foreach ( $this- > nodes
as $n )
415 $ret .= $this- > convert_text ( $n- > outertext ());
421 if ( isset ( $this- > _
[ HDOM_INFO_END
]) && $this- > _
[ HDOM_INFO_END
]!= 0 )
422 $ret .= '</' . $this- > tag
. '>' ;
426 // get dom node's plain text
429 if ( isset ( $this- > _
[ HDOM_INFO_INNER
])) return $this- > _
[ HDOM_INFO_INNER
];
430 switch ( $this- > nodetype
)
432 case HDOM_TYPE_TEXT
: return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
433 case HDOM_TYPE_COMMENT
: return '' ;
434 case HDOM_TYPE_UNKNOWN
: return '' ;
436 if ( strcasecmp ( $this- > tag
, 'script' )=== 0 ) return '' ;
437 if ( strcasecmp ( $this- > tag
, 'style' )=== 0 ) return '' ;
440 // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.
441 // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.
442 // WHY is this happening?
443 if (! is_null ( $this- > nodes
))
445 foreach ( $this- > nodes
as $n )
447 $ret .= $this- > convert_text ( $n- > text ());
450 // If this node is a span... add a space at the end of it so multiple spans don't run into each other. This is plaintext after all.
451 if ( $this- > tag
== "span" )
453 $ret .= $this- > dom
-> default_span_text
;
463 $ret = $this- > innertext ();
464 $ret = str_ireplace ( '<![CDATA[' , '' , $ret );
465 $ret = str_replace ( ']]>' , '' , $ret );
469 // build node's text with tag
472 // text, comment, unknown
473 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > dom
-> restore_noise ( $this- > _
[ HDOM_INFO_TEXT
]);
475 $ret = '<' . $this- > tag
;
478 foreach ( $this- > attr
as $key => $val )
482 // skip removed attribute
483 if ( $val === null || $val === false )
486 $ret .= $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 0 ];
487 //no value attr: nowrap, checked selected...
491 switch ( $this- > _
[ HDOM_INFO_QUOTE
][ $i ])
493 case HDOM_QUOTE_DOUBLE
: $quote = '"' ; break ;
494 case HDOM_QUOTE_SINGLE
: $quote = ' \' ' ; break ;
495 default : $quote = '' ;
497 $ret .= $key . $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 1 ]. '=' . $this- > _
[ HDOM_INFO_SPACE
][ $i ][ 2 ]. $quote . $val . $quote ;
500 $ret = $this- > dom
-> restore_noise ( $ret );
501 return $ret . $this- > _
[ HDOM_INFO_ENDSPACE
] . '>' ;
504 // find elements by css selector
505 //PaperG - added ability for find to lowercase the value of the selector.
506 function find ( $selector , $idx = null , $lowercase = false )
508 $selectors = $this- > parse_selector ( $selector );
509 if (( $count = count ( $selectors ))=== 0 ) return array ();
510 $found_keys = array ();
512 // find each selector
513 for ( $c = 0 ; $c < $count ; ++
$c )
515 // The change on the below line was documented on the sourceforge code tracker id 2788009
516 // used to be: if (($levle=count($selectors[0]))===0) return array();
517 if (( $levle = count ( $selectors [ $c ]))=== 0 ) return array ();
518 if (! isset ( $this- > _
[ HDOM_INFO_BEGIN
])) return array ();
520 $head = array ( $this- > _
[ HDOM_INFO_BEGIN
]=> 1 );
522 // handle descendant selectors, no recursive!
523 for ( $l = 0 ; $l < $levle ; ++
$l )
526 foreach ( $head as $k => $v )
528 $n = ( $k ===- 1 ) ? $this- > dom
-> root
: $this- > dom
-> nodes
[ $k ];
529 //PaperG - Pass this optional parameter on to the seek function.
530 $n- > seek ( $selectors [ $c ][ $l ], $ret , $lowercase );
535 foreach ( $head as $k => $v )
537 if (! isset ( $found_keys [ $k ]))
548 foreach ( $found_keys as $k => $v )
549 $found [] = $this- > dom
-> nodes
[ $k ];
551 // return nth-element or array
552 if ( is_null ( $idx )) return $found ;
553 else if ( $idx < 0 ) $idx = count ( $found ) +
$idx ;
554 return ( isset ( $found [ $idx ])) ? $found [ $idx ] : null ;
557 // seek for given conditions
558 // PaperG - added parameter to allow for case insensitive testing of the value of a selector.
559 protected function seek ( $selector , & $ret , $lowercase = false )
561 global $debug_object ;
562 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 ); }
564 list ( $tag , $key , $val , $exp , $no_key ) = $selector ;
567 if ( $tag && $key && is_numeric ( $key ))
570 foreach ( $this- > children
as $c )
572 if ( $tag === '*' || $tag === $c- > tag
) {
573 if ( ++
$count == $key ) {
574 $ret [ $c- > _
[ HDOM_INFO_BEGIN
]] = 1 ;
582 $end = (! empty ( $this- > _
[ HDOM_INFO_END
])) ? $this- > _
[ HDOM_INFO_END
] : 0 ;
584 $parent = $this- > parent
;
585 while (! isset ( $parent- > _
[ HDOM_INFO_END
]) && $parent !== null ) {
587 $parent = $parent- > parent
;
589 $end +
= $parent- > _
[ HDOM_INFO_END
];
592 for ( $i = $this- > _
[ HDOM_INFO_BEGIN
] +
1 ; $i < $end ; ++
$i ) {
593 $node = $this- > dom
-> nodes
[ $i ];
597 if ( $tag === '*' && ! $key ) {
598 if ( in_array ( $node , $this- > children
, true ))
604 if ( $tag && $tag != $node- > tag
&& $tag !== '*' ) { $pass
= false ;}
608 if ( isset ( $node- > attr
[ $key ])) $pass = false ;
610 if (( $key != "plaintext" ) && ! isset ( $node- > attr
[ $key ])) $pass = false ;
614 if ( $pass && $key && $val && $val !== '*' ) {
615 // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?
616 if ( $key == "plaintext" ) {
617 // $node->plaintext actually returns $node->text();
618 $nodeKeyValue = $node- > text ();
620 // this is a normal search, we want the value of that attribute of the tag.
621 $nodeKeyValue = $node- > attr
[ $key ];
623 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 2 , "testing node: " . $node
-> tag
. " for attribute: " . $key
. $exp
. $val
. " where nodes value is: " . $nodeKeyValue
);}
625 //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
627 $check = $this- > match ( $exp , strtolower ( $val ), strtolower ( $nodeKeyValue ));
629 $check = $this- > match ( $exp , $val , $nodeKeyValue );
631 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 2 , "after match: " . ( $check
? "true" : "false" ));}
633 // handle multiple class
634 if (! $check && strcasecmp ( $key , 'class' )=== 0 ) {
635 foreach ( explode ( ' ' , $node- > attr
[ $key ]) as $k ) {
636 // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.
639 $check = $this- > match ( $exp , strtolower ( $val ), strtolower ( $k ));
641 $check = $this- > match ( $exp , $val , $k );
647 if (! $check ) $pass = false ;
649 if ( $pass ) $ret [ $i ] = 1 ;
652 // It's passed by reference so this is actually what this function returns.
653 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 1 , "EXIT - ret: " , $ret
);}
656 protected function match ( $exp , $pattern , $value ) {
657 global $debug_object ;
658 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 );}
662 return ( $value === $pattern );
664 return ( $value !== $pattern );
666 return preg_match ( "/^" . preg_quote ( $pattern , '/' ). "/" , $value );
668 return preg_match ( "/" . preg_quote ( $pattern , '/' ). "$/" , $value );
670 if ( $pattern [ 0 ]== '/' ) {
671 return preg_match ( $pattern , $value );
673 return preg_match ( "/" . $pattern . "/i" , $value );
678 protected function parse_selector ( $selector_string ) {
679 global $debug_object ;
680 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 );}
682 // pattern of CSS selectors, modified from mootools
683 // Paperg: Add the colon to the attribute, so that it properly finds <tag attr:ibute="something" > like google does.
684 // Note: if you try to look at this attribute, you MUST use getAttribute since $dom->x:y will fail the php syntax check.
685 // Notice the \[ starting the attribute? and the @? following? This implies that an attribute can begin with an @ sign that is not captured.
686 // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
687 // further study is required to determine if this should be documented or removed.
688 // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
689 $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[ \" ']?(.*?)[ \" ']?)?\])?([\/, ]+)/is" ;
690 preg_match_all ( $pattern , trim ( $selector_string ). ' ' , $matches , PREG_SET_ORDER
);
691 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 2 , "Matches Array: " , $matches
);}
693 $selectors = array ();
697 foreach ( $matches as $m ) {
699 if ( $m [ 0 ]=== '' || $m [ 0 ]=== '/' || $m [ 0 ]=== '//' ) continue ;
700 // for browser generated xpath
701 if ( $m [ 1 ]=== 'tbody' ) continue ;
703 list ( $tag , $key , $val , $exp , $no_key ) = array ( $m [ 1 ], null , null , '=' , false );
704 if (! empty ( $m [ 2 ])) { $key
= 'id' ; $val
= $m
[ 2 ];}
705 if (! empty ( $m [ 3 ])) { $key
= 'class' ; $val
= $m
[ 3 ];}
706 if (! empty ( $m [ 4 ])) { $key
= $m
[ 4 ];}
707 if (! empty ( $m [ 5 ])) { $exp
= $m
[ 5 ];}
708 if (! empty ( $m [ 6 ])) { $val
= $m
[ 6 ];}
710 // convert to lowercase
711 if ( $this- > dom
-> lowercase
) { $tag
= strtolower ( $tag
); $key
= strtolower ( $key
);}
712 //elements that do NOT have the specified attribute
713 if ( isset ( $key [ 0 ]) && $key [ 0 ]=== '!' ) { $key
= substr ( $key
, 1 ); $no_key
= true ;}
715 $result [] = array ( $tag , $key , $val , $exp , $no_key );
716 if ( trim ( $m [ 7 ])=== ',' ) {
717 $selectors [] = $result ;
721 if ( count ( $result )> 0 )
722 $selectors [] = $result ;
726 function __get ( $name )
728 if ( isset ( $this- > attr
[ $name ]))
730 return $this- > convert_text ( $this- > attr
[ $name ]);
734 case 'outertext' : return $this- > outertext ();
735 case 'innertext' : return $this- > innertext ();
736 case 'plaintext' : return $this- > text ();
737 case 'xmltext' : return $this- > xmltext ();
738 default : return array_key_exists ( $name , $this- > attr
);
742 function __set ( $name , $value )
744 global $debug_object ;
745 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 );}
749 case 'outertext' : return $this- > _
[ HDOM_INFO_OUTER
] = $value ;
751 if ( isset ( $this- > _
[ HDOM_INFO_TEXT
])) return $this- > _
[ HDOM_INFO_TEXT
] = $value ;
752 return $this- > _
[ HDOM_INFO_INNER
] = $value ;
754 if (! isset ( $this- > attr
[ $name ]))
756 $this- > _
[ HDOM_INFO_SPACE
][] = array ( ' ' , '' , '' );
757 $this- > _
[ HDOM_INFO_QUOTE
][] = HDOM_QUOTE_DOUBLE
;
759 $this- > attr
[ $name ] = $value ;
762 function __isset ( $name )
766 case 'outertext' : return true ;
767 case 'innertext' : return true ;
768 case 'plaintext' : return true ;
770 //no value attr: nowrap, checked selected...
771 return ( array_key_exists ( $name , $this- > attr
)) ? true : isset ( $this- > attr
[ $name ]);
/**
 * Magic unset: drop attribute $name, but only when isset() reports it as set
 * (an attribute holding null is therefore left in place).
 *
 * @param string $name Attribute name.
 */
function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
779 // PaperG - Function to convert the text from one character set to another if the two sets are not the same.
780 function convert_text ( $text )
782 global $debug_object ;
783 if ( is_object ( $debug_object )) { $debug_object
-> debug_log_entry ( 1 );}
785 $converted_text = $text ;
792 $sourceCharset = strtoupper ( $this- > dom
-> _charset
);
793 $targetCharset = strtoupper ( $this- > dom
-> _target_charset
);
795 if ( is_object ( $debug_object )) { $debug_object
-> debug_log ( 3 , "source charset: " . $sourceCharset
. " target charaset: " . $targetCharset
);}
797 if (! empty ( $sourceCharset ) && ! empty ( $targetCharset ) && ( strcasecmp ( $sourceCharset , $targetCharset ) != 0 ))
799 // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
800 if (( strcasecmp ( $targetCharset , 'UTF-8' ) == 0 ) && ( $this- > is_utf8 ( $text )))
802 $converted_text = $text ;
806 $converted_text = iconv ( $sourceCharset , $targetCharset , $text );
810 // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
811 if ( $targetCharset == 'UTF-8' )
813 if ( substr ( $converted_text , 0 , 3 ) == " \xef\xbb\xbf " )
815 $converted_text = substr ( $converted_text , 3 );
817 if ( substr ( $converted_text , - 3 ) == " \xef\xbb\xbf " )
819 $converted_text = substr ( $converted_text , 0 , - 3 );
823 return $converted_text ;
827 * Returns true if $string is valid UTF-8 and false otherwise.
829 * @param mixed $str String to be tested
832 static function is_utf8 ( $str )
837 for ( $i = 0 ; $i < $len ; $i ++
)
842 if (( $c >= 254 )) return false ;
843 elseif ( $c >= 252 ) $bits = 6 ;
844 elseif ( $c >= 248 ) $bits = 5 ;
845 elseif ( $c >= 240 ) $bits = 4 ;
846 elseif ( $c >= 224 ) $bits = 3 ;
847 elseif ( $c >= 192 ) $bits = 2 ;
849 if (( $i +
$bits ) > $len ) return false ;
854 if ( $b < 128 || $b > 191 ) return false ;
862 function is_utf8($string)
865 return (utf8_encode(utf8_decode($string)) == $string);
870 * Function to try a few tricks to determine the displayed size of an img on the page.
871 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
873 * @author John Schlick
874 * @version April 19 2012
875 * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.
877 function get_display_size ()
879 global $debug_object ;
884 if ( $this- > tag
!== 'img' )
889 // See if there is a height or width attribute in the tag itself.
890 if ( isset ( $this- > attr
[ 'width' ]))
892 $width = $this- > attr
[ 'width' ];
895 if ( isset ( $this- > attr
[ 'height' ]))
897 $height = $this- > attr
[ 'height' ];
900 // Now look for an inline style.
901 if ( isset ( $this- > attr
[ 'style' ]))
903 // Thanks to user gnarf from stackoverflow for this regular expression.
904 $attributes = array ();
905 preg_match_all ( "/([\w-]+)\s*:\s*([^;]+)\s*;?/" , $this- > attr
[ 'style' ], $matches , PREG_SET_ORDER
);
906 foreach ( $matches as $match ) {
907 $attributes [ $match [ 1 ]] = $match [ 2 ];
910 // If there is a width in the style attributes:
911 if ( isset ( $attributes [ 'width' ]) && $width == - 1 )
913 // check that the last two characters are px (pixels)
914 if ( strtolower ( substr ( $attributes [ 'width' ], - 2 )) == 'px' )
916 $proposed_width = substr ( $attributes [ 'width' ], 0 , - 2 );
917 // Now make sure that it's an integer and not something stupid.
918 if ( filter_var ( $proposed_width , FILTER_VALIDATE_INT
))
920 $width = $proposed_width ;
925 // If there is a height in the style attributes:
926 if ( isset ( $attributes [ 'height' ]) && $height == - 1 )
928 // check that the last two characters are px (pixels)
929 if ( strtolower ( substr ( $attributes [ 'height' ], - 2 )) == 'px' )
931 $proposed_height = substr ( $attributes [ 'height' ], 0 , - 2 );
932 // Now make sure that it's an integer and not something stupid.
933 if ( filter_var ( $proposed_height , FILTER_VALIDATE_INT
))
935 $height = $proposed_height ;
942 // Future enhancement:
943 // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.
945 // Far future enhancement
946 // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
947 // Note that in this case, the class or id will have the img subselector for it to apply to the image.
949 // ridiculously far future development
950 // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.
952 $result = array ( 'height' => $height ,
// camel naming conventions: each method below only forwards to the snake_case / magic API.

/** Return the node's attribute array. */
function getAllAttributes()
{
    return $this->attr;
}

/** Read attribute $name through __get(). */
function getAttribute($name)
{
    return $this->__get($name);
}

/** Write attribute $name through __set(). */
function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Report whether $name is set, as decided by __isset(). */
function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Remove attribute $name by assigning null through __set() (not via __unset()). */
function removeAttribute($name)
{
    $this->__set($name, null);
}

/** First match of the "#<id>" selector. */
function getElementById($id)
{
    return $this->find('#' . $id, 0);
}

/** Matches of the "#<id>" selector; $idx is passed on to find(). */
function getElementsById($id, $idx = null)
{
    return $this->find('#' . $id, $idx);
}

/** First match of the selector $name. */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Matches of the selector $name; $idx is passed on to find(). */
function getElementsByTagName($name, $idx = null)
{
    return $this->find($name, $idx);
}

/** Alias of parent() called without an argument. */
function parentNode()
{
    return $this->parent();
}

/** Alias of children($idx). */
function childNodes($idx = -1)
{
    return $this->children($idx);
}

/** Alias of first_child(). */
function firstChild()
{
    return $this->first_child();
}

/** Alias of last_child(). */
function lastChild()
{
    return $this->last_child();
}

/** Alias of next_sibling(). */
function nextSibling()
{
    return $this->next_sibling();
}

/** Alias of prev_sibling(). */
function previousSibling()
{
    return $this->prev_sibling();
}

/** Alias of has_child(). */
function hasChildNodes()
{
    return $this->has_child();
}

/** Return the node's tag name. */
function nodeName()
{
    return $this->tag;
}

/** Make this node the parent of $node (via $node->parent()) and hand $node back. */
function appendChild($node)
{
    $node->parent($this);

    return $node;
}
980 * simple html dom parser
981 * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.
982 * Paperg - change $size from protected to public so we can easily access it
983 * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it.
985 * @package PlaceLocalInclude
987 class simple_html_dom
990 public $nodes = array ();
991 public $callback = null ;
992 public $lowercase = false ;
993 // Used to keep track of how large the text was when we started.
994 public $original_size ;
1001 protected $noise = array ();
1002 protected $token_blank = " \t\r\n " ;
1003 protected $token_equal = ' =/>' ;
1004 protected $token_slash = " /> \r\n\t " ;
1005 protected $token_attr = ' >' ;
1006 // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.
1007 public $_charset = '' ;
1008 public $_target_charset = '' ;
1009 protected $default_br_text = "" ;
1010 public $default_span_text = "" ;
1012 // use isset instead of in_array, performance boost about 30%...
1013 protected $self_closing_tags = array ( 'img' => 1 , 'br' => 1 , 'input' => 1 , 'meta' => 1 , 'link' => 1 , 'hr' => 1 , 'base' => 1 , 'embed' => 1 , 'spacer' => 1 );
1014 protected $block_tags = array ( 'root' => 1 , 'body' => 1 , 'form' => 1 , 'div' => 1 , 'span' => 1 , 'table' => 1 );
1015 // Known sourceforge issue #2977341
1016 // B tags that are not closed cause us to return everything to the end of the document.
1017 protected $optional_closing_tags = array (
1018 'tr' => array ( 'tr' => 1 , 'td' => 1 , 'th' => 1 ),
1019 'th' => array ( 'th' => 1 ),
1020 'td' => array ( 'td' => 1 ),
1021 'li' => array ( 'li' => 1 ),
1022 'dt' => array ( 'dt' => 1 , 'dd' => 1 ),
1023 'dd' => array ( 'dd' => 1 , 'dt' => 1 ),
1024 'dl' => array ( 'dd' => 1 , 'dt' => 1 ),
1026 'nobr' => array ( 'nobr' => 1 ),
1028 'option' => array ( 'option' => 1 ),
/**
 * Create a dom, optionally loading it straight away.
 *
 * @param string|null $str             HTML text, or a file path / http(s) URL to load from;
 *                                     nothing is loaded when this is empty.
 * @param bool        $lowercase       Forwarded to load() when $str is HTML text.
 * @param bool        $forceTagsClosed When false, the optional-closing-tag table is emptied
 *                                     (i.e. the html is trusted).
 * @param string      $target_charset  Stored in $_target_charset.
 * @param bool        $stripRN         Forwarded to load() when $str is HTML text.
 * @param string      $defaultBRText   Forwarded to load() when $str is HTML text.
 * @param string      $defaultSpanText Forwarded to load() when $str is HTML text.
 */
function __construct($str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to
    // parsing errors if we SHOULD trust the html.
    // The property is $optional_closing_tags; assigning to "optional_closing_array" only
    // created an unused dynamic property, so the flag never had any effect.
    // NOTE(review): applied before loading on the assumption that the parser consults this
    // table while parsing - confirm against parse()/read_tag().
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }

    if ($str) {
        // Treat http:// and https:// URLs, as well as existing files, as something to fetch.
        if (preg_match("/^https?:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }

    $this->_target_charset = $target_charset;
}
1051 function __destruct ()
1056 // load html from string
1057 function load ( $str , $lowercase = true , $stripRN = true , $defaultBRText = DEFAULT_BR_TEXT
, $defaultSpanText = DEFAULT_SPAN_TEXT
)
1059 global $debug_object ;
1062 $this- > prepare ( $str , $lowercase , $stripRN , $defaultBRText , $defaultSpanText );
1064 $this- > remove_noise ( "'<!\[CDATA\[(.*?)\]\]>'is" , true );
1065 // strip out comments
1066 $this- > remove_noise ( "'<!--(.*?)-->'is" );
1067 // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
1068 // Script tags removal now precedes style tag removal.
1069 // strip out <script> tags
1070 $this- > remove_noise ( "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is" );
1071 $this- > remove_noise ( "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is" );
1072 // strip out <style> tags
1073 $this- > remove_noise ( "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is" );
1074 $this- > remove_noise ( "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is" );
1075 // strip out preformatted tags
1076 $this- > remove_noise ( "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is" );
1077 // strip out server side scripts
1078 $this- > remove_noise ( "'(< \? )(.*?)( \? >)'s" , true );
1079 // strip smarty scripts
1080 $this- > remove_noise ( "'(\ {\w)(.*?)(\} )'s" , true );
1083 while ( $this- > parse ());
1085 $this- > root
-> _
[ HDOM_INFO_END
] = $this- > cursor
;
1086 $this- > parse_charset ();
1088 // make load function chainable
/**
 * Load html from a file or URL.
 *
 * All arguments are forwarded unchanged to file_get_contents(); the fetched
 * contents are then parsed with load() using lowercase tag names.
 *
 * Failure is detected from file_get_contents() returning false. The earlier
 * approach compared error_get_last() before and after, which also fired on
 * unrelated notices raised while parsing (discarding a good DOM) and missed
 * failures swallowed by a custom error handler.
 *
 * @return false|null false when the contents could not be read (the object is cleared), otherwise nothing
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // We can't properly load the dom without contents.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
/**
 * Set the callback function.
 *
 * The value is only stored in $this->callback here; whatever invokes it
 * lives outside this method.
 *
 * @param callable|string $function_name callback to remember
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
/**
 * Remove the callback function by resetting $this->callback to null.
 */
function remove_callback()
{
    $this->callback = null;
}
/**
 * Save the dom as a string.
 *
 * @param string $filepath when not the empty string, the markup is also written there under an exclusive lock
 * @return string the root node's innertext()
 */
function save($filepath = '')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
/**
 * Find dom node(s) by css selector, searching from the root node.
 *
 * Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
 *
 * @param string   $selector  css selector
 * @param int|null $idx       forwarded to the root node's find()
 * @param bool     $lowercase forwarded to the root node's find()
 * @return mixed whatever the root node's find() returns
 */
function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
/**
 * Clean up memory due to php5 circular references memory leak...
 *
 * Calls clear() on every node in $this->nodes, on $this->children when that
 * property is set, and on the parent and root nodes, then unsets the parent,
 * root, doc and noise properties.
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // This add next line is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}
/**
 * Dump the tree by delegating to the root node's dump().
 *
 * @param bool $show_attr forwarded to the root node's dump()
 */
function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
/**
 * Prepare HTML data and init everything.
 *
 * Clears any previous state, records the input size, optionally replaces
 * every "\r" and "\n" with a space, resets the parser position, cursor,
 * noise and node lists, and creates a fresh root node that also becomes the
 * current parent.
 *
 * @param string $str             markup to parse; non-string values are cast to string
 * @param bool   $lowercase       stored in $this->lowercase
 * @param bool   $stripRN         when true, "\r" and "\n" are each replaced by a space
 * @param string $defaultBRText   stored in $this->default_br_text
 * @param string $defaultSpanText stored in $this->default_span_text
 */
protected function prepare($str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Callers may hand us null or false; normalise once so that strlen() and
    // str_replace() below always operate on a string.
    $str = (string) $str;

    // set the length of content before we do anything to it.
    $this->size = strlen($str);
    // Save the original size of the html that we got in.  It might be useful to someone.
    $this->original_size = $this->size;

    //before we save the string as the doc...  strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace("\r", " ", $str);
        $str = str_replace("\n", " ", $str);

        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    // The node list must be reset before the root node is created below.
    $this->nodes = array();
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;
    $this->root = new simple_html_dom_node($this);
    $this->root->tag = 'root';
    $this->root->_[HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $this->root;
    // Always reset the current character: with empty input the previous
    // document's last character used to linger here.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
}
/**
 * Parse html content: consume one unit of the document.
 *
 * Text up to the next '<' becomes a text node linked under the current
 * parent; when there is no such text, the work is handed to read_tag().
 *
 * @return bool true after adding a text node, otherwise read_tag()'s result
 */
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);
    return true;
}
/**
 * PAPERG - dkchou - try to identify the character set of the page we have just
 * parsed so we know better how to spit it out later.
 *
 * Sources are tried in this order:
 *  1. the content-type reported by get_last_retrieve_url_contents_content_type(),
 *     when a function with that name exists (e.g. returning the
 *     CURLINFO_CONTENT_TYPE of the last transfer);
 *  2. the content attribute of a meta[http-equiv=Content-Type] element, falling
 *     back to ISO-8859-1 when that attribute carries no charset;
 *  3. mb_detect_encoding() over the plain text, when mbstring is available;
 *  4. UTF-8.
 * ISO-8859-1 / Latin1 / Latin-1 are finally replaced by their superset CP1252.
 *
 * Both charset patterns are matched case-insensitively and the captured value
 * is stripped of surrounding whitespace and quotes.
 *
 * @return string the detected charset, which is also stored in $this->_charset
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;
    // characters that may wrap a charset value but are never part of it
    $wrapping = " \t\r\n\"'";

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        // Parameter names in a content-type header are case-insensitive, hence the i flag.
        if (preg_match('/charset=(.+)/i', $contentTypeHeader, $matches)) {
            $charset = trim($matches[1], $wrapping);
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue)) {
                if (preg_match('/charset=(.+)/i', $fullvalue, $matches)) {
                    $charset = trim($matches[1], $wrapping);
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {$debug_object->debug_log(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset)) {
        // Use this in case mb_detect_charset isn't installed/loaded on this machine.
        $charset = false;
        if (function_exists('mb_detect_encoding')) {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array("UTF-8", "CP1252"));
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'mb_detect found: ' . $charset);}
        }

        // and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debug_object)) {$debug_object->debug_log(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true)) {
        if (is_object($debug_object)) {$debug_object->debug_log(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {$debug_object->debug_log(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
/**
 * Read one tag starting at the current '<' and attach the resulting node.
 *
 * Handles, in this order: end tags (walking the parent chain to find the
 * element being closed, or emitting the end tag as text when nothing
 * matches), "<!" constructs (comments and unknown declarations), fragments
 * that turn out to be plain text, and finally begin tags with their
 * attributes, self-closing detection and the default text of <br>.
 *
 * @return bool false when the current character is not '<' (the cursor is
 *              then recorded as the root node's end), true otherwise
 */
protected function read_tag()
{
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower !== $tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // climb until an ancestor with the closing tag's name (or the top) is reached
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } else if (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text
    // (previously written as "$pos = strpos(...) !== false", which stored an
    // unused boolean in $pos because of operator precedence)
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // The hyphen sits last in the class so that it is a literal: "[\w-:]" is
    // rejected as an invalid range by PCRE2 (PHP 7.3 and later).
    if (!preg_match('/^[\w:-]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char !== null && $space[0] === '') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard === $this->pos) {
            // no progress since the previous round: step one character forward
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);
            // step back so that the increment below lands on the stray '<'
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char === '=') {
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } else {
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
/**
 * Parse the value of attribute $name at the current position and store it on $node.
 *
 * The quote style is appended to $node->_[HDOM_INFO_QUOTE]; the blanks found
 * in front of the value are written to $space[2]. "\r" and "\n" are removed
 * from the value, and a "class" value is additionally trimmed.
 *
 * @param simple_html_dom_node $node   node that receives the attribute
 * @param string               $name   attribute name
 * @param array                $space  by-reference whitespace record of the attribute; index 2 is filled in here
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $quote = $this->char;
    if ($quote === '"' || $quote === '\'') {
        $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step past the opening quote
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($quote));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step past the closing quote
    } else {
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $node->attr[$name] = str_replace(array("\r", "\n"), "", $node->attr[$name]);

    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
/**
 * Link a node to the current parent.
 *
 * The node is always appended to the parent's nodes list; it is appended to
 * the parent's children list as well when $is_child is truthy.
 *
 * @param simple_html_dom_node $node     node to attach
 * @param bool                 $is_child whether the node also counts as a child
 */
protected function link_nodes(&$node, $is_child)
{
    $owner = $this->parent;

    $node->parent = $owner;
    $owner->nodes[] = $node;

    if (!$is_child) {
        return;
    }
    $owner->children[] = $node;
}
/**
 * Attach the end tag "</$tag>" to the current parent as a text node and
 * advance by one character.
 *
 * @param string $tag tag name taken from the end tag
 * @return bool always true
 */
protected function as_text_node($tag)
{
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($textNode, false);

    // next
    ++$this->pos;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null;

    return true;
}
/**
 * Advance past the run of characters from $chars that starts at the current
 * position and refresh the current character (null at end of document).
 *
 * @param string $chars set of characters to skip
 */
protected function skip($chars)
{
    $runLength = strspn($this->doc, $chars, $this->pos);
    $this->pos += $runLength;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
/**
 * Like skip(), but also return the characters that were skipped.
 *
 * @param string $chars set of characters to skip
 * @return string the skipped run, '' when nothing matched
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $runLength = strspn($this->doc, $chars, $start);

    $this->pos = $start + $runLength;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return ($runLength === 0) ? '' : substr($this->doc, $start, $runLength);
}
/**
 * Consume and return everything up to, but not including, the first
 * character that belongs to $chars; the current character is refreshed
 * (null at end of document).
 *
 * @param string $chars set of stop characters
 * @return string the consumed text
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $runLength = strcspn($this->doc, $chars, $start);

    $this->pos = $start + $runLength;
    $this->char = ($this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    return substr($this->doc, $start, $runLength);
}
/**
 * Consume and return everything up to, but not including, the next
 * occurrence of $char.
 *
 * When $char does not occur again, the rest of the document is returned and
 * the parser is moved to the end (current character null).
 *
 * @param string $char stop character
 * @return string the consumed text; '' at end of document or when $char is at the current position
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
/**
 * Same as copy_until_char(), except that an occurrence of $char directly
 * preceded by a backslash does not stop the scan.
 *
 * @param string $char stop character
 * @return string the consumed text; '' at end of document or when $char is at the current position
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $searchFrom = $this->pos;
    while (true) {
        $hit = strpos($this->doc, $char, $searchFrom);

        if ($hit === false) {
            $rest = substr($this->doc, $this->pos, $this->size - $this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $this->pos) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            break;
        }

        // escaped occurrence: keep looking behind it
        $searchFrom = $hit + 1;
    }

    $chunk = substr($this->doc, $this->pos, $hit - $this->pos);
    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return $chunk;
}
/**
 * Remove noise from html content and save it in the $this->noise array.
 *
 * Every match of $pattern is replaced in the document by a 16-character key
 * ("___noise___" followed by a 5-wide number) under which the removed text
 * is stored. Matches are processed from last to first so that the offsets of
 * earlier matches stay valid.
 *
 * @param string $pattern    regular expression locating the noise
 * @param bool   $remove_tag true to swap out the whole match, false to swap out only the first capture group
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);

    // 0 selects the whole match, 1 the first capture group
    $group = ($remove_tag) ? 0 : 1;

    $i = $count - 1;
    while ($i > -1) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
        if (is_object($debug_object)) { $debug_object->debug_log(2, 'key is: ' . $key); }

        $captured = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $this->noise[$key] = $captured;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($captured));
        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
/**
 * Restore noise to html content.
 *
 * Every "___noise___NNNNN" key in $text is replaced by the text stored for
 * it in $this->noise. A key with no stored text is replaced by
 * "UNDEFINED NOISE FOR KEY: <key>", and a marker too close to the end of
 * $text to carry a full key is replaced by "NO NUMERIC NOISE KEY".
 *
 * @param string $text text that may contain noise keys
 * @return string the text with the keys substituted
 */
function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    $marker = '___noise___';
    // Where the next search starts. It is moved past anything we insert that
    // must not be examined again.
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            // the marker (11 characters) plus its 5-character number
            $key = substr($text, $pos, 16);
            if (is_object($debug_object)) { $debug_object->debug_log(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + 16);
                // Search again from the same spot: the restored text may itself contain keys.
                $offset = $pos;
            } else {
                // The message repeats the key, which starts with the marker. Continue
                // searching behind the message; finding that key again and again is
                // what used to make this loop run (and the string grow) forever.
                $replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos) . $replacement . substr($text, $pos + 16);
                $offset = $pos + strlen($replacement);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $replacement = 'NO NUMERIC NOISE KEY';
            $text = substr($text, 0, $pos) . $replacement . substr($text, $pos + 11);
            $offset = $pos + strlen($replacement);
        }
    }

    return $text;
}
/**
 * Sometimes we NEED one of the noise elements.
 *
 * @param string $text substring to look for
 * @return string|null the first stored noise element containing $text; nothing is returned when none does
 */
function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }
        return $candidate;
    }
}
/**
 * String conversion yields the root node's innertext().
 *
 * @return string
 */
function __toString()
{
    $root = $this->root;
    return $root->innertext();
}
/**
 * Magic read access to a handful of virtual properties.
 *
 * - outertext, innertext: the root node's innertext()
 * - plaintext:            the root node's text()
 * - charset:              $this->_charset
 * - target_charset:       $this->_target_charset
 *
 * @param string $name property being read
 * @return mixed the value described above; nothing is returned for any other name
 */
function __get($name)
{
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    if ($name === 'charset') {
        return $this->_charset;
    }
    if ($name === 'target_charset') {
        return $this->_target_charset;
    }
}
// camel naming conventions

/**
 * Camel-case alias: delegates to the root node's childNodes().
 *
 * @param int $idx forwarded unchanged
 * @return mixed the root node's result
 */
function childNodes($idx = -1)
{
    $root = $this->root;
    return $root->childNodes($idx);
}
/**
 * Camel-case alias: delegates to the root node's first_child().
 *
 * @return mixed the root node's result
 */
function firstChild()
{
    $root = $this->root;
    return $root->first_child();
}
/**
 * Camel-case alias: delegates to the root node's last_child().
 *
 * @return mixed the root node's result
 */
function lastChild()
{
    $root = $this->root;
    return $root->last_child();
}
/**
 * Create an element by parsing "<$name>$value</$name>" with str_get_html()
 * and returning the first child of the resulting document.
 *
 * Errors raised by the expression are suppressed with @.
 *
 * @param string      $name  tag name
 * @param string|null $value inner markup
 * @return mixed the first child of the parsed fragment
 */
function createElement($name, $value = null)
{
    return @str_get_html("<$name>$value</$name>")->first_child();
}
/**
 * Create a text node by parsing $value with str_get_html() and returning
 * the last entry of the resulting document's nodes list.
 *
 * Errors raised by the expression are suppressed with @.
 *
 * @param string $value text to parse
 * @return mixed the last node of the parsed fragment
 */
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}
/**
 * Camel-case alias: find("#$id", 0).
 *
 * @param string $id element id
 * @return mixed find()'s result for index 0
 */
function getElementById($id)
{
    $selector = "#$id";
    return $this->find($selector, 0);
}
/**
 * Camel-case alias: find("#$id", $idx).
 *
 * @param string   $id  element id
 * @param int|null $idx forwarded unchanged
 * @return mixed find()'s result
 */
function getElementsById($id, $idx = null)
{
    $selector = "#$id";
    return $this->find($selector, $idx);
}
/**
 * Camel-case alias: find($name, 0).
 *
 * @param string $name tag name used as the selector
 * @return mixed find()'s result for index 0
 */
function getElementByTagName($name)
{
    $firstIndex = 0;
    return $this->find($name, $firstIndex);
}
/**
 * Camel-case alias: find($name, $idx).
 *
 * NOTE(review): the default index here is -1 whereas getElementsById()
 * defaults to null - confirm that this difference is intended.
 *
 * @param string $name tag name used as the selector
 * @param int    $idx  forwarded unchanged
 * @return mixed find()'s result
 */
function getElementsByTagName($name, $idx = -1)
{
    $selector = $name;
    return $this->find($selector, $idx);
}
/**
 * Camel-case alias of load_file().
 *
 * The received arguments are spread onto load_file(). Passing the collected
 * array as one argument, as was done before, made file_get_contents()
 * receive an array in place of the file name.
 *
 * @return mixed load_file()'s result
 */
function loadFile()
{
    $args = func_get_args();
    return call_user_func_array(array($this, 'load_file'), $args);
}