]>
Commit | Line | Data |
---|---|---|
1 | <?php\r | |
2 | /**\r | |
3 | * Site Config\r | |
4 | * \r | |
5 | * Each instance of this class should hold extraction patterns and other directives\r | |
6 | * for a website. See ContentExtractor class to see how it's used.\r | |
7 | * \r | |
8 | * @version 0.7\r | |
9 | * @date 2012-08-27\r | |
10 | * @author Keyvan Minoukadeh\r | |
11 | * @copyright 2012 Keyvan Minoukadeh\r | |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3\r | |
13 | */\r | |
14 | \r | |
class SiteConfig
{
	/** @var array XPath expressions (0 or more); first matching element is used as the title */
	public $title = array();

	/** @var array XPath expressions (0 or more); first matching element is used as the body */
	public $body = array();

	/** @var array XPath expressions (0 or more); first matching element is used as the author */
	public $author = array();

	/** @var array XPath expressions (0 or more); first matching element is used as the date */
	public $date = array();

	/** @var array XPath expressions (0 or more); matching elements are stripped */
	public $strip = array();

	/** @var array Strings (0 or more); elements containing one of them in the id or class attribute are stripped */
	public $strip_id_or_class = array();

	/** @var array Strings (0 or more); images containing one of them in the src attribute are stripped */
	public $strip_image_src = array();

	/** @var array Additional HTTP headers to send. NOT YET USED */
	public $http_header = array();

	/** @var bool|null Process HTML with tidy before creating DOM (null if undeclared) */
	public $tidy = null;

	/** @var bool Value reported by tidy() when $tidy is undeclared */
	protected $default_tidy = true;

	/**
	 * Autodetect title/body if xpath expressions fail to produce results.
	 *
	 * Note that this applies to title and body separately, ie.
	 *   * if we get a body match but no title match, this option will determine whether we autodetect title
	 *   * if neither match, this determines whether we autodetect title and body.
	 * Also note that this only applies when there is at least one xpath expression in title or body, ie.
	 *   * if title and body are both empty (no xpath expressions), this option has no effect
	 *     (both title and body will be auto-detected)
	 *   * if there's an xpath expression for title and none for body, body will be auto-detected and this
	 *     option will determine whether we auto-detect title if the xpath expression for it fails to
	 *     produce results.
	 * Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the
	 * table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed
	 * here, so you construct your patterns and set this option to false. Another scenario may be a site
	 * where auto-detection has proven to fail (or worse, picked up the wrong content).
	 *
	 * @var bool|null null if undeclared
	 */
	public $autodetect_on_failure = null;

	/** @var bool Value reported by autodetect_on_failure() when undeclared */
	protected $default_autodetect_on_failure = true;

	/** @var bool|null Clean up content block - attempt to remove elements that appear to be superfluous (null if undeclared) */
	public $prune = null;

	/** @var bool Value reported by prune() when $prune is undeclared */
	protected $default_prune = true;

	/** @var array Test URL - if present, can be used to test the config above */
	public $test_url = array();

	/**
	 * Single-page link - should identify a link element or URL pointing to the page holding the entire article.
	 *
	 * This is useful for sites which split their articles across multiple pages. Links to such pages tend to
	 * display the first page with links to the other pages at the bottom. Often there is also a link to a page
	 * which displays the entire article on one page (e.g. 'print view').
	 * This should be an XPath expression identifying the link to that page. If present and we find a match,
	 * we will retrieve that page and the rest of the options in this config will be applied to the new page.
	 *
	 * @var array
	 */
	public $single_page_link = array();

	/** @var array Values collected from 'next_page_link' directives */
	public $next_page_link = array();

	/** @var array Same as $single_page_link, but patterns applied to item description HTML taken from feed */
	public $single_page_link_in_feed = array();

	/** @var string|null Parser used to turn raw HTML into a DOMDocument (either 'libxml' or 'html5lib'); null if undeclared */
	public $parser = null;

	/** @var string Value reported by parser() when $parser is undeclared */
	protected $default_parser = 'libxml';

	/** @var array Strings to search for in HTML before processing begins (paired by position with $replace_string) */
	public $find_string = array();

	/** @var array Strings to replace those found in $find_string before HTML processing begins */
	public $replace_string = array();

	// the options below cannot be set in the config files which this class represents

	/** @var string|null When set, add_to_cache() stores the config under this key instead of the supplied one */
	public $cache_key = null;

	/** @var bool When true, debug() echoes its messages */
	public static $debug = false;

	/** @var bool Whether APC caching has been enabled through use_apc() */
	protected static $apc = false;

	/** @var string|null Primary folder holding site config files */
	protected static $config_path;

	/** @var string|null Secondary (fallback) folder holding site config files */
	protected static $config_path_fallback;

	/** @var array Configs already loaded in this request, indexed by cache key */
	protected static $config_cache = array();

	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';

	/** @var array Multi-statement directives whose values are de-duplicated when two configs are merged */
	protected static $list_commands = array(
		'title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src',
		'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header',
	);

	/** @var array Multi-statement directives that form position-paired lists (must never be de-duplicated) */
	protected static $paired_commands = array('find_string', 'replace_string');

	/** @var array Single-statement directives holding yes/no values */
	protected static $bool_commands = array('tidy', 'prune', 'autodetect_on_failure');

	/** @var array Single-statement directives stored as plain strings */
	protected static $string_commands = array('parser');

	/**
	 * Echo a debug message (prefixed with '* ') when self::$debug is enabled.
	 *
	 * @param string $msg
	 */
	protected static function debug($msg) {
		if (!self::$debug) return;
		echo '* ',$msg;
		echo "\n";
		// ob_flush() raises a notice when no output buffer is active, so only call it when there is one
		if (ob_get_level() > 0) ob_flush();
		flush();
	}

	/**
	 * Lower-case a host/cache key and drop a leading 'www.'.
	 *
	 * @param string $key
	 * @return string
	 */
	protected static function normalise_key($key) {
		$key = strtolower($key);
		if (strncmp($key, 'www.', 4) === 0) $key = substr($key, 4);
		return $key;
	}

	/**
	 * Read a site config file into an array of non-empty lines.
	 *
	 * @param string $file path to the config file
	 * @return array|false the lines, or false if the file could not be read or yielded no lines
	 */
	protected static function read_config_lines($file) {
		$lines = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
		if (!$lines || !is_array($lines)) return false;
		return $lines;
	}

	/**
	 * Enable APC caching of certain site config files?
	 *
	 * If enabled the following site config files will be
	 * cached in APC cache (when requested for first time):
	 *   * anything in site_config/custom/ and its corresponding file in site_config/standard/
	 *   * the site config files associated with HTML fingerprints
	 *   * the global site config file
	 *
	 * @param bool $apc
	 * @return bool the value of $apc if the apc_add function exists, false otherwise
	 */
	public static function use_apc($apc=true) {
		if (!function_exists('apc_add')) {
			if ($apc) self::debug('APC will not be used (function apc_add does not exist)');
			return false;
		}
		self::$apc = $apc;
		return $apc;
	}

	/**
	 * @param bool $use_default fall back to the default when undeclared
	 * @return bool|null
	 */
	public function tidy($use_default=true) {
		if ($use_default && !isset($this->tidy)) return $this->default_tidy;
		return $this->tidy;
	}

	/**
	 * @param bool $use_default fall back to the default when undeclared
	 * @return bool|null
	 */
	public function prune($use_default=true) {
		if ($use_default && !isset($this->prune)) return $this->default_prune;
		return $this->prune;
	}

	/**
	 * @param bool $use_default fall back to the default when undeclared
	 * @return string|null
	 */
	public function parser($use_default=true) {
		if ($use_default && !isset($this->parser)) return $this->default_parser;
		return $this->parser;
	}

	/**
	 * @param bool $use_default fall back to the default when undeclared
	 * @return bool|null
	 */
	public function autodetect_on_failure($use_default=true) {
		if ($use_default && !isset($this->autodetect_on_failure)) return $this->default_autodetect_on_failure;
		return $this->autodetect_on_failure;
	}

	/**
	 * Set the folders in which build() looks for '<host>.txt' site config files.
	 *
	 * @param string $path primary folder
	 * @param string|null $fallback secondary folder (optional)
	 */
	public static function set_config_path($path, $fallback=null) {
		self::$config_path = $path;
		self::$config_path_fallback = $fallback;
	}

	/**
	 * Store a config in the per-request cache and, if enabled, in APC.
	 *
	 * @param string $key host name; ignored when $config->cache_key is set
	 * @param SiteConfig $config
	 * @param bool $use_apc also add to APC when APC caching has been enabled
	 */
	public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
		$key = self::normalise_key($key);
		// the key recorded on the config itself takes precedence over the one supplied
		if ($config->cache_key) $key = $config->cache_key;
		self::$config_cache[$key] = $config;
		if (self::$apc && $use_apc) {
			self::debug("Adding site config to APC cache with key sc.$key");
			apc_add("sc.$key", $config);
		}
		self::debug("Cached site config with key $key");
	}

	/**
	 * Is a config for this key held in the per-request cache or (if enabled) in APC?
	 *
	 * @param string $key
	 * @return bool
	 */
	public static function is_cached($key) {
		$key = self::normalise_key($key);
		if (array_key_exists($key, self::$config_cache)) return true;
		return self::$apc && (bool)apc_fetch("sc.$key");
	}

	/**
	 * Merge the directives of another config into this one.
	 *
	 * Values already present in this config come first. test_url is not merged.
	 *
	 * @param SiteConfig $newconfig
	 */
	public function append(SiteConfig $newconfig) {
		// commands where we accept multiple statements (no test_url): merge and drop duplicates
		foreach (self::$list_commands as $var) {
			$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
		}
		// find_string[i] belongs to replace_string[i]: de-duplicating either list on its own
		// would shift the remaining entries and pair finds with the wrong replacements
		foreach (self::$paired_commands as $var) {
			$this->$var = array_merge($this->$var, $newconfig->$var);
		}
		// single statement commands: we do not overwrite existing non null values
		foreach (array_merge(self::$bool_commands, self::$string_commands) as $var) {
			if ($this->$var === null) $this->$var = $newconfig->$var;
		}
	}

	/**
	 * Find and load the site config for a host.
	 *
	 * Looks in the per-request cache, then APC (if enabled), then '<name>.txt' in the primary folder.
	 * Unless the primary config disables autodetect_on_failure, a matching file in the secondary
	 * folder is loaded too and appended to the primary config.
	 *
	 * If $exact_host_match is true, we will not look for wildcard config matches.
	 * By default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists.
	 *
	 * @param string $host
	 * @param bool $exact_host_match
	 * @return SiteConfig|false SiteConfig instance if an appropriate one is found, false otherwise
	 */
	public static function build($host, $exact_host_match=false) {
		$host = self::normalise_key($host);
		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;

		// names to try, most specific first
		$try = array($host);
		// should we look for wildcard matches
		if (!$exact_host_match) {
			$labels = explode('.', $host);
			if (count($labels) > 1) {
				array_shift($labels);
				$try[] = '.'.implode('.', $labels);
			}
		}

		// look for site config file in primary folder
		$file_primary = null;
		$matched_name = null;
		self::debug(". looking for site config for $host in primary folder");
		foreach ($try as $h) {
			if (array_key_exists($h, self::$config_cache)) {
				self::debug("... site config for $h already loaded in this request");
				return self::$config_cache[$h];
			}
			if (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
				self::debug("... site config for $h in APC cache");
				return $sconfig;
			}
			if (file_exists(self::$config_path."/$h.txt")) {
				self::debug("... found site config ($h.txt)");
				$file_primary = self::$config_path."/$h.txt";
				$matched_name = $h;
				break;
			}
		}

		// if we found site config, process it
		$config = null;
		if ($file_primary !== null) {
			$config_lines = self::read_config_lines($file_primary);
			if ($config_lines === false) return false;
			$config = self::build_from_array($config_lines);
			$config->cache_key = $matched_name;

			// if autodetect on failure is off (on by default) we do not need to look
			// in secondary folder
			if (!$config->autodetect_on_failure()) {
				self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
				return $config;
			}
		}

		// look for site config file in secondary folder
		$file_secondary = null;
		if (isset(self::$config_path_fallback)) {
			self::debug(". looking for site config for $host in secondary folder");
			foreach ($try as $h) {
				if (file_exists(self::$config_path_fallback."/$h.txt")) {
					self::debug("... found site config in secondary folder ($h.txt)");
					$file_secondary = self::$config_path_fallback."/$h.txt";
					$matched_name = $h;
					break;
				}
			}
			if ($file_secondary === null) {
				self::debug("... no site config match in secondary folder");
			}
		}

		// return false if no config file found
		if ($file_primary === null && $file_secondary === null) {
			self::debug("... no site config match for $host");
			return false;
		}

		// return primary config if secondary not found
		if ($file_secondary === null) return $config;

		// process secondary config file
		$config_lines = self::read_config_lines($file_secondary);
		if ($config_lines === false) {
			// failed to process secondary: return primary config if we have one
			return ($config !== null) ? $config : false;
		}

		// merge with primary and return
		if ($config !== null) {
			self::debug('. merging config files');
			$config->append(self::build_from_array($config_lines));
			return $config;
		}

		// return just secondary
		$config = self::build_from_array($config_lines);
		$config->cache_key = $matched_name;
		return $config;
	}

	/**
	 * Build a config from the lines of a site config file.
	 *
	 * Each line has the form 'command: value' and is split on its first colon. Blank lines,
	 * lines starting with '#', lines without a colon, lines with an empty command or value,
	 * and unrecognised commands are skipped. 'replace_string(find): replace' adds a find/replace
	 * pair in one statement.
	 *
	 * @param array $lines
	 * @return SiteConfig
	 */
	public static function build_from_array(array $lines) {
		$multi_commands = array_merge(self::$list_commands, array('test_url'), self::$paired_commands);
		$config = new SiteConfig();
		foreach ($lines as $line) {
			$line = trim($line);

			// skip comments, empty lines
			if ($line === '' || $line[0] === '#') continue;

			// split into command and value; if there's no colon ':', skip this line
			$parts = explode(':', $line, 2);
			if (count($parts) !== 2) continue;
			$command = trim($parts[0]);
			$val = trim($parts[1]);
			if ($command === '' || $val === '') continue;

			if (in_array($command, $multi_commands, true)) {
				// commands where we accept multiple statements
				array_push($config->$command, $val);
			} elseif (in_array($command, self::$bool_commands, true)) {
				// single statement commands that evaluate to true or false
				$config->$command = ($val === 'yes');
			} elseif (in_array($command, self::$string_commands, true)) {
				// single statement commands stored as strings
				$config->$command = $val;
			} elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
				// replace_string(find): replace
				if ($match[1] === 'replace_string') {
					array_push($config->find_string, $match[2]);
					array_push($config->replace_string, $val);
				}
			}
		}
		return $config;
	}
}
338 | ?> |