]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/content-extractor/SiteConfig.php
5 * Each instance of this class should hold extraction patterns and other directives
6 * for a website. See ContentExtractor class to see how it's used.
10 * @author Keyvan Minoukadeh
11 * @copyright 2011 Keyvan Minoukadeh
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
17 // Use first matching element as title (0 or more xpath expressions)
18 public $title = array ();
20 // Use first matching element as body (0 or more xpath expressions)
21 public $body = array ();
23 // Use first matching element as author (0 or more xpath expressions)
24 public $author = array ();
26 // Use first matching element as date (0 or more xpath expressions)
27 public $date = array ();
29 // Strip elements matching these xpath expressions (0 or more)
30 public $strip = array ();
32 // Strip elements which contain these strings (0 or more) in the id or class attribute
33 public $strip_id_or_class = array ();
35 // Strip images which contain these strings (0 or more) in the src attribute
36 public $strip_image_src = array ();
38 // Additional HTTP headers to send
40 public $http_header = array ();
42 // Process HTML with tidy before creating DOM
45 // Autodetect title/body if xpath expressions fail to produce results.
46 // Note that this applies to title and body separately, ie.
47 // * if we get a body match but no title match, this option will determine whether we autodetect title
48 // * if neither match, this determines whether we autodetect title and body.
49 // Also note that this only applies when there is at least one xpath expression in title or body, ie.
50 // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
51 // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
52 // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
53 public $autodetect_on_failure = true ;
55 // Clean up content block - attempt to remove elements that appear to be superfluous
58 // Test URL - if present, can be used to test the config above
59 public $test_url = null ;
61 // Single-page link - should identify a link element or URL pointing to the page holding the entire article
62 // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
63 // display the first page with links to the other pages at the bottom. Often there is also a link to a page
64 // which displays the entire article on one page (e.g. 'print view').
65 // This should be an XPath expression identifying the link to that page. If present and we find a match,
66 // we will retrieve that page and the rest of the options in this config will be applied to the new page.
67 public $single_page_link = array ();
69 // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
70 public $single_page_link_in_feed = array ();
72 // TODO: which parser to use for turning raw HTML into a DOMDocument
73 public $parser = 'libxml' ;
75 // String replacement to be made on HTML before processing begins
76 public $replace_string = array ();
78 // the options below cannot be set in the config files which this class represents
80 public static $debug = false ;
81 protected static $config_path ;
82 protected static $config_path_fallback ;
83 protected static $config_cache = array ();
84 const HOSTNAME_REGEX
= '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/' ;
/**
 * Debug helper: reports current and peak memory usage.
 *
 * NOTE(review): this view of the method is incomplete (lines are elided
 * between the visible statements), so how $msg is used and whether the
 * output is gated on self::$debug cannot be confirmed from here - verify
 * against the full file.
 *
 * @param string $msg debug message (presumably a string; its use is not visible here)
 */
86 protected static function debug ( $msg ) {
// memory_get_usage() reports bytes; dividing by 1024 gives KiB, rounded to 2 decimals
88 $mem = round ( memory_get_usage ()/ 1024 , 2 );
// same conversion for the peak usage figure
89 $memPeak = round ( memory_get_peak_usage ()/ 1024 , 2 );
// write both figures straight to the output stream
91 echo ' - mem used: ' , $mem , " (peak: $memPeak ) \n " ;
/**
 * Set the directories that build() searches for site config files.
 *
 * Both values are stored in static properties, so they apply to every
 * subsequent build() call.
 *
 * @param string      $path     primary directory; build() checks it for "<host>.txt" files
 * @param string|null $fallback optional secondary directory; build() only consults it
 *                              when it is set (non-null)
 */
97 public static function set_config_path ( $path , $fallback = null ) {
98 self
:: $config_path = $path ;
99 self
:: $config_path_fallback = $fallback ;
/**
 * Store a SiteConfig instance in the static config cache.
 *
 * build() checks this cache before looking for a config file on disk.
 *
 * @param string     $host   hostname used as the cache key (lower-cased before storing)
 * @param SiteConfig $config config instance to cache; replaces any existing entry for the same key
 */
102 public static function add_to_cache ( $host , SiteConfig
$config ) {
// normalise case so the key matches the lower-cased host that build() looks up
103 $host = strtolower ( $host );
104 self
:: $config_cache [ $host ] = $config ;
107 // returns SiteConfig instance if an appropriate one is found, false otherwise
108 public static function build ( $host ) {
109 $host = strtolower ( $host );
110 if ( substr ( $host , 0 , 4 ) == 'www.' ) $host = substr ( $host , 4 );
111 if (! $host || ( strlen ( $host ) > 200 ) || ! preg_match ( self
:: HOSTNAME_REGEX
, $host )) return false ;
112 // check for site configuration
114 $split = explode ( '.' , $host );
115 if ( count ( $split ) > 1 ) {
117 $try [] = '.' . implode ( '.' , $split );
119 foreach ( $try as $h ) {
120 if ( array_key_exists ( $h , self
:: $config_cache )) {
121 self
:: debug ( "... cached ( $h )" );
122 return self
:: $config_cache [ $h ];
123 } elseif ( file_exists ( self
:: $config_path . "/ $h .txt" )) {
124 self
:: debug ( "... from file ( $h )" );
125 $file = self
:: $config_path . "/ $h .txt" ;
130 if ( isset ( self
:: $config_path_fallback )) {
131 self
:: debug ( "... trying fallback ( $host )" );
132 foreach ( $try as $h ) {
133 if ( file_exists ( self
:: $config_path_fallback . "/ $h .txt" )) {
134 self
:: debug ( "... from fallback file ( $h )" );
135 $file = self
:: $config_path_fallback . "/ $h .txt" ;
140 self
:: debug ( "... no match in fallback directory" );
144 self
:: debug ( "... no match ( $host )" );
148 $config_file = file ( $file , FILE_IGNORE_NEW_LINES
| FILE_SKIP_EMPTY_LINES
);
149 if (! $config_file || ! is_array ( $config_file )) return false ;
150 $config = new SiteConfig ();
151 foreach ( $config_file as $line ) {
154 // skip comments, empty lines
155 if ( $line == '' || $line [ 0 ] == '#' ) continue ;
158 $command = explode ( ':' , $line , 2 );
159 // if there's no colon ':', skip this line
160 if ( count ( $command ) != 2 ) continue ;
161 $val = trim ( $command [ 1 ]);
162 $command = trim ( $command [ 0 ]);
163 if ( $command == '' || $val == '' ) continue ;
165 // check for commands where we accept multiple statements
166 if ( in_array ( $command , array ( 'title' , 'body' , 'author' , 'date' , 'strip' , 'strip_id_or_class' , 'strip_image_src' , 'single_page_link' , 'single_page_link_in_feed' , 'http_header' ))) {
167 array_push ( $config- > $command , $val );
168 // check for single statement commands that evaluate to true or false
169 } elseif ( in_array ( $command , array ( 'tidy' , 'prune' , 'autodetect_on_failure' ))) {
170 $config- > $command = ( $val == 'yes' );
171 // check for single statement commands stored as strings
172 } elseif ( in_array ( $command , array ( 'test_url' , 'parser' ))) {
173 $config- > $command = $val ;
174 } elseif (( substr ( $command , - 1 ) == ')' ) && preg_match ( '!^([a-z0-9_]+)\((.*?)\)$!i' , $command , $match )) {
175 if ( in_array ( $match [ 1 ], array ( 'replace_string' ))) {
176 $command = $match [ 1 ];
177 array_push ( $config- > $command , array ( $match [ 2 ], $val ));