diff options
Diffstat (limited to 'inc/3rdparty/libraries/content-extractor/SiteConfig.php')
-rw-r--r-- | inc/3rdparty/libraries/content-extractor/SiteConfig.php | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php new file mode 100644 index 00000000..c5e300d7 --- /dev/null +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php | |||
@@ -0,0 +1,338 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Site Config | ||
4 | * | ||
5 | * Each instance of this class should hold extraction patterns and other directives | ||
6 | * for a website. See ContentExtractor class to see how it's used. | ||
7 | * | ||
8 | * @version 0.7 | ||
9 | * @date 2012-08-27 | ||
10 | * @author Keyvan Minoukadeh | ||
11 | * @copyright 2012 Keyvan Minoukadeh | ||
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | ||
13 | */ | ||
14 | |||
/**
 * Holds the extraction patterns and directives for a single website.
 *
 * Instances are normally produced by SiteConfig::build() (looks up "<host>.txt"
 * files in the configured folders) or SiteConfig::build_from_array() (parses
 * the lines of such a file). Two configs can be combined with append().
 */
class SiteConfig
{
	// Use first matching element as title (0 or more xpath expressions)
	public $title = array();

	// Use first matching element as body (0 or more xpath expressions)
	public $body = array();

	// Use first matching element as author (0 or more xpath expressions)
	public $author = array();

	// Use first matching element as date (0 or more xpath expressions)
	public $date = array();

	// Strip elements matching these xpath expressions (0 or more)
	public $strip = array();

	// Strip elements which contain these strings (0 or more) in the id or class attribute
	public $strip_id_or_class = array();

	// Strip images which contain these strings (0 or more) in the src attribute
	public $strip_image_src = array();

	// Additional HTTP headers to send
	// NOT YET USED
	public $http_header = array();

	// Process HTML with tidy before creating DOM (bool or null if undeclared)
	public $tidy = null;

	protected $default_tidy = true; // used if undeclared

	// Autodetect title/body if xpath expressions fail to produce results.
	// Note that this applies to title and body separately, ie.
	//   * if we get a body match but no title match, this option will determine whether we autodetect title
	//   * if neither match, this determines whether we autodetect title and body.
	// Also note that this only applies when there is at least one xpath expression in title or body, ie.
	//   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
	//   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
	// Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
	// bool or null if undeclared
	public $autodetect_on_failure = null;
	protected $default_autodetect_on_failure = true; // used if undeclared

	// Clean up content block - attempt to remove elements that appear to be superfluous
	// bool or null if undeclared
	public $prune = null;
	protected $default_prune = true; // used if undeclared

	// Test URL - if present, can be used to test the config above
	public $test_url = array();

	// Single-page link - should identify a link element or URL pointing to the page holding the entire article
	// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
	// display the first page with links to the other pages at the bottom. Often there is also a link to a page
	// which displays the entire article on one page (e.g. 'print view').
	// This should be an XPath expression identifying the link to that page. If present and we find a match,
	// we will retrieve that page and the rest of the options in this config will be applied to the new page.
	public $single_page_link = array();

	public $next_page_link = array();

	// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
	public $single_page_link_in_feed = array();

	// Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
	// string or null if undeclared
	public $parser = null;
	protected $default_parser = 'libxml'; // used if undeclared

	// Strings to search for in HTML before processing begins (used with $replace_string).
	// Entry N of $find_string is paired with entry N of $replace_string.
	public $find_string = array();
	// Strings to replace those found in $find_string before HTML processing begins
	public $replace_string = array();

	// the options below cannot be set in the config files which this class represents

	//public $cache_in_apc = false; // used to decide if we should cache in apc or not
	public $cache_key = null;
	public static $debug = false;
	protected static $apc = false;
	protected static $config_path;
	protected static $config_path_fallback;
	protected static $config_cache = array();
	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';

	/**
	 * Echo a debug message (prefixed with "* ") when self::$debug is enabled.
	 *
	 * @param string $msg message to print
	 */
	protected static function debug($msg) {
		if (self::$debug) {
			//$mem = round(memory_get_usage()/1024, 2);
			//$memPeak = round(memory_get_peak_usage()/1024, 2);
			echo '* ',$msg;
			//echo ' - mem used: ',$mem," (peak: $memPeak)\n";
			echo "\n";
			// ob_flush() raises a notice when no output buffer is active,
			// so only call it when there is a buffer to flush.
			if (ob_get_level() > 0) ob_flush();
			flush();
		}
	}

	/**
	 * Enable or disable APC caching of certain site config files.
	 *
	 * If enabled the following site config files will be
	 * cached in APC cache (when requested for first time):
	 *   * anything in site_config/custom/ and its corresponding file in site_config/standard/
	 *   * the site config files associated with HTML fingerprints
	 *   * the global site config file
	 *
	 * @param bool $apc requested state
	 * @return bool the state that was applied; always false when apc_add() is unavailable
	 */
	public static function use_apc($apc=true) {
		if (!function_exists('apc_add')) {
			if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
			return false;
		}
		self::$apc = $apc;
		return $apc;
	}

	/**
	 * Value of the tidy directive.
	 *
	 * @param bool $use_default fall back to the class default when undeclared
	 * @return bool|null null only when undeclared and $use_default is false
	 */
	public function tidy($use_default=true) {
		if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
		return $this->tidy;
	}

	/**
	 * Value of the prune directive.
	 *
	 * @param bool $use_default fall back to the class default when undeclared
	 * @return bool|null null only when undeclared and $use_default is false
	 */
	public function prune($use_default=true) {
		if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune;
		return $this->prune;
	}

	/**
	 * Value of the parser directive.
	 *
	 * @param bool $use_default fall back to the class default ('libxml') when undeclared
	 * @return string|null null only when undeclared and $use_default is false
	 */
	public function parser($use_default=true) {
		if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser;
		return $this->parser;
	}

	/**
	 * Value of the autodetect_on_failure directive.
	 *
	 * @param bool $use_default fall back to the class default when undeclared
	 * @return bool|null null only when undeclared and $use_default is false
	 */
	public function autodetect_on_failure($use_default=true) {
		if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure;
		return $this->autodetect_on_failure;
	}

	/**
	 * Set the folders searched by build().
	 *
	 * @param string      $path     primary folder containing "<host>.txt" files
	 * @param string|null $fallback secondary folder, or null for none
	 */
	public static function set_config_path($path, $fallback=null) {
		self::$config_path = $path;
		self::$config_path_fallback = $fallback;
	}

	/**
	 * Store a config in the per-request cache and, when enabled, in APC.
	 *
	 * The key is lower-cased and a leading "www." is removed; if the config
	 * carries its own cache_key, that key is used instead.
	 *
	 * @param string     $key     cache key (typically a host name)
	 * @param SiteConfig $config  config to cache
	 * @param bool       $use_apc also add to APC (only if APC was enabled via use_apc())
	 */
	public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
		$key = strtolower($key);
		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
		if ($config->cache_key) $key = $config->cache_key;
		self::$config_cache[$key] = $config;
		if (self::$apc && $use_apc) {
			self::debug("Adding site config to APC cache with key sc.$key");
			apc_add("sc.$key", $config);
		}
		self::debug("Cached site config with key $key");
	}

	/**
	 * Check whether a config is cached for the given key.
	 *
	 * @param string $key cache key; lower-cased and stripped of a leading "www."
	 * @return bool true if found in the per-request cache or (when enabled) in APC
	 */
	public static function is_cached($key) {
		$key = strtolower($key);
		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
		if (array_key_exists($key, self::$config_cache)) {
			return true;
		} elseif (self::$apc && (bool)apc_fetch("sc.$key")) {
			return true;
		}
		return false;
	}

	/**
	 * Merge the directives of another config into this one.
	 *
	 * List directives are concatenated (this config's entries first).
	 * Single-value directives are copied only where this config has none.
	 * test_url is intentionally not merged.
	 *
	 * @param SiteConfig $newconfig config whose directives are appended
	 */
	public function append(SiteConfig $newconfig) {
		// commands where we accept multiple statements (no test_url):
		// drop duplicate entries, then reindex because array_unique() keeps
		// the original keys and would otherwise leave gaps in the list
		foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
			$this->$var = array_values(array_unique(array_merge($this->$var, $newconfig->$var)));
		}
		// find_string and replace_string are paired by position, so they must
		// not be de-duplicated independently: two different find strings that
		// share a replacement would leave the lists with different lengths
		foreach (array('find_string', 'replace_string') as $var) {
			$this->$var = array_merge($this->$var, $newconfig->$var);
		}
		// check for single statement commands
		// we do not overwrite existing non null values
		foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
			if ($this->$var === null) $this->$var = $newconfig->$var;
		}
	}

	/**
	 * Find and load the site config for a host.
	 *
	 * By default, if host is 'test.example.org' we will also look for and load
	 * '.example.org.txt' if it exists (only one label is stripped).
	 *
	 * @param string $host             host name; lower-cased and stripped of a leading "www."
	 * @param bool   $exact_host_match if true, do not look for wildcard config matches
	 * @return SiteConfig|false config instance if an appropriate one is found, false otherwise
	 */
	public static function build($host, $exact_host_match=false) {
		$host = strtolower($host);
		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
		// the regex admits only letters, digits, '-' and '.', so $host is safe to use in a file name
		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
		// check for site configuration
		$try = array($host);
		// should we look for wildcard matches
		if (!$exact_host_match) {
			$split = explode('.', $host);
			if (count($split) > 1) {
				array_shift($split);
				$try[] = '.'.implode('.', $split);
			}
		}

		// look for site config file in primary folder
		self::debug(". looking for site config for $host in primary folder");
		foreach ($try as $h) {
			if (array_key_exists($h, self::$config_cache)) {
				self::debug("... site config for $h already loaded in this request");
				return self::$config_cache[$h];
			} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
				self::debug("... site config for $h in APC cache");
				return $sconfig;
			} elseif (file_exists(self::$config_path."/$h.txt")) {
				self::debug("... found site config ($h.txt)");
				$file_primary = self::$config_path."/$h.txt";
				$matched_name = $h;
				break;
			}
		}

		// if we found site config, process it
		if (isset($file_primary)) {
			$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
			if (!$config_lines || !is_array($config_lines)) return false;
			$config = self::build_from_array($config_lines);
			// if APC caching is available and enabled, mark this for cache
			//$config->cache_in_apc = true;
			$config->cache_key = $matched_name;

			// if autodetect on failure is off (on by default) we do not need to look
			// in secondary folder
			if (!$config->autodetect_on_failure()) {
				self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
				return $config;
			}
		}

		// look for site config file in secondary folder
		if (isset(self::$config_path_fallback)) {
			self::debug(". looking for site config for $host in secondary folder");
			foreach ($try as $h) {
				if (file_exists(self::$config_path_fallback."/$h.txt")) {
					self::debug("... found site config in secondary folder ($h.txt)");
					$file_secondary = self::$config_path_fallback."/$h.txt";
					$matched_name = $h;
					break;
				}
			}
			if (!isset($file_secondary)) {
				self::debug("... no site config match in secondary folder");
			}
		}

		// return false if no config file found
		if (!isset($file_primary) && !isset($file_secondary)) {
			self::debug("... no site config match for $host");
			return false;
		}

		// return primary config if secondary not found
		if (!isset($file_secondary) && isset($config)) {
			return $config;
		}

		// process secondary config file
		$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
		if (!$config_lines || !is_array($config_lines)) {
			// failed to process secondary
			if (isset($config)) {
				// return primary config
				return $config;
			} else {
				return false;
			}
		}

		// merge with primary and return
		if (isset($config)) {
			self::debug('. merging config files');
			$config->append(self::build_from_array($config_lines));
			return $config;
		} else {
			// return just secondary
			$config = self::build_from_array($config_lines);
			// if APC caching is available and enabled, mark this for cache
			//$config->cache_in_apc = true;
			$config->cache_key = $matched_name;
			return $config;
		}
	}

	/**
	 * Parse the lines of a site config file into a SiteConfig instance.
	 *
	 * Each line has the form "command: value". Lines that are empty, start
	 * with '#', contain no colon, or have an empty command or value are
	 * skipped, as are unrecognised commands. Boolean commands are true only
	 * for the exact value 'yes'. "replace_string(find): replace" adds a
	 * find_string/replace_string pair.
	 *
	 * @param array $lines config file lines
	 * @return SiteConfig
	 */
	public static function build_from_array(array $lines) {
		$config = new SiteConfig();
		foreach ($lines as $line) {
			$line = trim($line);

			// skip comments, empty lines
			if ($line == '' || $line[0] == '#') continue;

			// get command
			$command = explode(':', $line, 2);
			// if there's no colon ':', skip this line
			if (count($command) != 2) continue;
			$val = trim($command[1]);
			$command = trim($command[0]);
			if ($command == '' || $val == '') continue;

			// check for commands where we accept multiple statements
			if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'), true)) {
				array_push($config->$command, $val);
			// check for single statement commands that evaluate to true or false
			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'), true)) {
				$config->$command = ($val == 'yes');
			// check for single statement commands stored as strings
			} elseif (in_array($command, array('parser'), true)) {
				$config->$command = $val;
			// check for replace_string(find): replace
			} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
				if (in_array($match[1], array('replace_string'), true)) {
					$command = $match[1];
					array_push($config->find_string, $match[2]);
					array_push($config->$command, $val);
				}
			}
		}
		return $config;
	}
}
338 | ?> \ No newline at end of file | ||