]> git.immae.eu Git - github/wallabag/wallabag.git/blame - inc/3rdparty/libraries/content-extractor/SiteConfig.php
Fix for #664 - Missing source url attribute in RSS feeds
[github/wallabag/wallabag.git] / inc / 3rdparty / libraries / content-extractor / SiteConfig.php
CommitLineData
42c80841
NL
1<?php\r
2/**\r
3 * Site Config\r
4 * \r
5 * Each instance of this class should hold extraction patterns and other directives\r
6 * for a website. See ContentExtractor class to see how it's used.\r
7 * \r
8 * @version 0.7\r
9 * @date 2012-08-27\r
10 * @author Keyvan Minoukadeh\r
11 * @copyright 2012 Keyvan Minoukadeh\r
12 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r
13 */\r
14\r
class SiteConfig
{
    // Use first matching element as title (0 or more xpath expressions)
    public $title = array();

    // Use first matching element as body (0 or more xpath expressions)
    public $body = array();

    // Use first matching element as author (0 or more xpath expressions)
    public $author = array();

    // Use first matching element as date (0 or more xpath expressions)
    public $date = array();

    // Strip elements matching these xpath expressions (0 or more)
    public $strip = array();

    // Strip elements which contain these strings (0 or more) in the id or class attribute
    public $strip_id_or_class = array();

    // Strip images which contain these strings (0 or more) in the src attribute
    public $strip_image_src = array();

    // Additional HTTP headers to send
    // NOT YET USED
    public $http_header = array();

    // Process HTML with tidy before creating DOM (bool or null if undeclared)
    public $tidy = null;

    protected $default_tidy = true; // used if undeclared

    // Autodetect title/body if xpath expressions fail to produce results.
    // Note that this applies to title and body separately, ie.
    //   * if we get a body match but no title match, this option will determine whether we autodetect title
    //   * if neither match, this determines whether we autodetect title and body.
    // Also note that this only applies when there is at least one xpath expression in title or body, ie.
    //   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
    //   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
    // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
    // bool or null if undeclared
    public $autodetect_on_failure = null;
    protected $default_autodetect_on_failure = true; // used if undeclared

    // Clean up content block - attempt to remove elements that appear to be superfluous
    // bool or null if undeclared
    public $prune = null;
    protected $default_prune = true; // used if undeclared

    // Test URL - if present, can be used to test the config above
    public $test_url = array();

    // Single-page link - should identify a link element or URL pointing to the page holding the entire article
    // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
    // display the first page with links to the other pages at the bottom. Often there is also a link to a page
    // which displays the entire article on one page (e.g. 'print view').
    // This should be an XPath expression identifying the link to that page. If present and we find a match,
    // we will retrieve that page and the rest of the options in this config will be applied to the new page.
    public $single_page_link = array();

    public $next_page_link = array();

    // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
    public $single_page_link_in_feed = array();

    // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
    // string or null if undeclared
    public $parser = null;
    protected $default_parser = 'libxml'; // used if undeclared

    // Strings to search for in HTML before processing begins (used with $replace_string)
    public $find_string = array();
    // Strings to replace those found in $find_string before HTML processing begins
    // (entry N here belongs to entry N of $find_string)
    public $replace_string = array();

    // the options below cannot be set in the config files which this class represents

    //public $cache_in_apc = false; // used to decide if we should cache in apc or not
    public $cache_key = null;
    public static $debug = false;
    protected static $apc = false;
    protected static $config_path;
    protected static $config_path_fallback;
    protected static $config_cache = array();
    const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';

    /**
     * Print a debug line (prefixed with '* ') when self::$debug is enabled.
     *
     * @param string $msg message to print
     */
    protected static function debug($msg) {
        if (!self::$debug) return;
        echo '* ', $msg, "\n";
        // ob_flush() raises a notice when no output buffer is active,
        // so only flush the buffer if there is one.
        if (ob_get_level() > 0) ob_flush();
        flush();
    }

    /**
     * Lower-case a host name / cache key and drop a leading 'www.'.
     *
     * @param string $host
     * @return string
     */
    protected static function normalise_host($host) {
        $host = strtolower($host);
        if (substr($host, 0, 4) === 'www.') {
            // cast: substr() may return false instead of '' on older PHP versions
            $host = (string) substr($host, 4);
        }
        return $host;
    }

    /**
     * Read a site config file into an array of non-empty lines.
     *
     * @param string $path
     * @return array|false lines, or false if the file could not be read or holds no lines
     */
    protected static function read_config_lines($path) {
        $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if (!is_array($lines) || !$lines) return false;
        return $lines;
    }

    /**
     * Enable or disable APC caching of site configs (see add_to_cache(), is_cached(), build()).
     *
     * @param bool $apc
     * @return bool false if apc_add() does not exist, otherwise the value passed in
     */
    public static function use_apc($apc=true) {
        if (!function_exists('apc_add')) {
            if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
            return false;
        }
        self::$apc = $apc;
        return $apc;
    }

    /**
     * @param bool $use_default fall back to the default when tidy is undeclared
     * @return bool|null
     */
    public function tidy($use_default=true) {
        if ($use_default && $this->tidy === null) return $this->default_tidy;
        return $this->tidy;
    }

    /**
     * @param bool $use_default fall back to the default when prune is undeclared
     * @return bool|null
     */
    public function prune($use_default=true) {
        if ($use_default && $this->prune === null) return $this->default_prune;
        return $this->prune;
    }

    /**
     * @param bool $use_default fall back to the default when parser is undeclared
     * @return string|null
     */
    public function parser($use_default=true) {
        if ($use_default && $this->parser === null) return $this->default_parser;
        return $this->parser;
    }

    /**
     * @param bool $use_default fall back to the default when autodetect_on_failure is undeclared
     * @return bool|null
     */
    public function autodetect_on_failure($use_default=true) {
        if ($use_default && $this->autodetect_on_failure === null) return $this->default_autodetect_on_failure;
        return $this->autodetect_on_failure;
    }

    /**
     * Set the folders build() searches for '<host>.txt' site config files.
     *
     * @param string $path primary folder
     * @param string|null $fallback secondary folder (optional)
     */
    public static function set_config_path($path, $fallback=null) {
        self::$config_path = $path;
        self::$config_path_fallback = $fallback;
    }

    /**
     * Store a config in the per-request cache and, if enabled, in APC.
     *
     * @param string $key host name; replaced by $config->cache_key when that is set
     * @param SiteConfig $config
     * @param bool $use_apc set to false to skip APC even when it is enabled
     */
    public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
        $key = self::normalise_host($key);
        if ($config->cache_key) $key = $config->cache_key;
        self::$config_cache[$key] = $config;
        if (self::$apc && $use_apc) {
            self::debug("Adding site config to APC cache with key sc.$key");
            apc_add("sc.$key", $config);
        }
        self::debug("Cached site config with key $key");
    }

    /**
     * @param string $key host name
     * @return bool true if a config is held for this key in the request cache or in APC
     */
    public static function is_cached($key) {
        $key = self::normalise_host($key);
        if (array_key_exists($key, self::$config_cache)) {
            return true;
        }
        return self::$apc && (bool)apc_fetch("sc.$key");
    }

    /**
     * Merge another config into this one.
     *
     * List directives (except test_url) are appended; single-value directives are
     * only taken from $newconfig when they are still undeclared (null) here.
     *
     * @param SiteConfig $newconfig
     */
    public function append(SiteConfig $newconfig) {
        // commands where we accept multiple statements (no test_url):
        // append, drop duplicates, and reindex so the result is a plain list again
        // (array_unique() keeps the original keys and would leave gaps)
        foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
            $this->$var = array_values(array_unique(array_merge($this->$var, $newconfig->$var)));
        }
        // find_string/replace_string are paired by position, so they must not be
        // de-duplicated independently: two different find strings may share one
        // replacement, and removing the "duplicate" would leave the lists out of step
        foreach (array('find_string', 'replace_string') as $var) {
            $this->$var = array_merge($this->$var, $newconfig->$var);
        }
        // single statement commands: we do not overwrite existing non null values
        foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
            if ($this->$var === null) $this->$var = $newconfig->$var;
        }
    }

    /**
     * Find and load the site config for a host.
     *
     * By default, if host is 'test.example.org' we also look for the wildcard
     * file '.example.org.txt'. The primary folder is checked first; unless the
     * primary config disables autodetect_on_failure, a match in the secondary
     * folder is merged into it.
     *
     * @param string $host
     * @param bool $exact_host_match if true, do not look for wildcard config matches
     * @return SiteConfig|false SiteConfig instance if an appropriate one is found, false otherwise
     */
    public static function build($host, $exact_host_match=false) {
        $host = self::normalise_host($host);
        if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;

        // names to try: the host itself, then its wildcard form
        $try = array($host);
        if (!$exact_host_match) {
            $split = explode('.', $host);
            if (count($split) > 1) {
                array_shift($split);
                $try[] = '.'.implode('.', $split);
            }
        }

        $config = null;
        $file_primary = null;
        $file_secondary = null;
        $matched_name = null;

        // look for site config file in primary folder (or an already cached config)
        self::debug(". looking for site config for $host in primary folder");
        foreach ($try as $h) {
            if (array_key_exists($h, self::$config_cache)) {
                self::debug("... site config for $h already loaded in this request");
                return self::$config_cache[$h];
            } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
                self::debug("... site config for $h in APC cache");
                return $sconfig;
            } elseif (file_exists(self::$config_path."/$h.txt")) {
                self::debug("... found site config ($h.txt)");
                $file_primary = self::$config_path."/$h.txt";
                $matched_name = $h;
                break;
            }
        }

        // if we found site config, process it
        if ($file_primary !== null) {
            $config_lines = self::read_config_lines($file_primary);
            if ($config_lines === false) return false;
            $config = self::build_from_array($config_lines);
            $config->cache_key = $matched_name;

            // if autodetect on failure is off (on by default) we do not need to look
            // in secondary folder
            if (!$config->autodetect_on_failure()) {
                self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
                return $config;
            }
        }

        // look for site config file in secondary folder
        if (isset(self::$config_path_fallback)) {
            self::debug(". looking for site config for $host in secondary folder");
            foreach ($try as $h) {
                if (file_exists(self::$config_path_fallback."/$h.txt")) {
                    self::debug("... found site config in secondary folder ($h.txt)");
                    $file_secondary = self::$config_path_fallback."/$h.txt";
                    $matched_name = $h;
                    break;
                }
            }
            if ($file_secondary === null) {
                self::debug("... no site config match in secondary folder");
            }
        }

        // return false if no config file found
        if ($file_primary === null && $file_secondary === null) {
            self::debug("... no site config match for $host");
            return false;
        }

        // return primary config if secondary not found
        if ($file_secondary === null) {
            return $config;
        }

        // process secondary config file
        $config_lines = self::read_config_lines($file_secondary);
        if ($config_lines === false) {
            // failed to process secondary: return primary config if we have one
            return ($config !== null) ? $config : false;
        }

        // merge with primary and return
        if ($config !== null) {
            self::debug('. merging config files');
            $config->append(self::build_from_array($config_lines));
            return $config;
        }

        // return just secondary
        $config = self::build_from_array($config_lines);
        $config->cache_key = $matched_name;
        return $config;
    }

    /**
     * Build a config from the lines of a site config file.
     *
     * Each line has the form 'command: value'. Lines starting with '#', lines
     * without a colon, and lines with an empty command or value are skipped,
     * as are unrecognised commands.
     *
     * @param array $lines
     * @return SiteConfig
     */
    public static function build_from_array(array $lines) {
        $multi = array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string');
        $boolean = array('tidy', 'prune', 'autodetect_on_failure');
        $string = array('parser');

        $config = new SiteConfig();
        foreach ($lines as $line) {
            $line = trim($line);

            // skip comments, empty lines
            if ($line === '' || $line[0] === '#') continue;

            // split on the first colon only, so values may contain ':' themselves
            $parts = explode(':', $line, 2);
            // if there's no colon ':', skip this line
            if (count($parts) !== 2) continue;
            $command = trim($parts[0]);
            $val = trim($parts[1]);
            if ($command === '' || $val === '') continue;

            if (in_array($command, $multi, true)) {
                // commands where we accept multiple statements
                array_push($config->$command, $val);
            } elseif (in_array($command, $boolean, true)) {
                // single statement commands that evaluate to true or false
                $config->$command = ($val === 'yes');
            } elseif (in_array($command, $string, true)) {
                // single statement commands stored as strings
                $config->$command = $val;
            } elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
                // replace_string(find): replace
                // (the line was split on its first colon, so this form is only
                // recognised when the find string itself contains no ':')
                if ($match[1] === 'replace_string') {
                    array_push($config->find_string, $match[2]);
                    array_push($config->replace_string, $val);
                }
            }
        }
        return $config;
    }
}
338?>