diff options
Diffstat (limited to 'inc/3rdparty/libraries/content-extractor/SiteConfig.php')
-rw-r--r-- | inc/3rdparty/libraries/content-extractor/SiteConfig.php | 681 |
1 files changed, 343 insertions, 338 deletions
diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php | |||
@@ -1,338 +1,343 @@ | |||
1 | <?php | 1 | <?php |
2 | /** | 2 | /** |
3 | * Site Config | 3 | * Site Config |
4 | * | 4 | * |
5 | * Each instance of this class should hold extraction patterns and other directives | 5 | * Each instance of this class should hold extraction patterns and other directives |
6 | * for a website. See ContentExtractor class to see how it's used. | 6 | * for a website. See ContentExtractor class to see how it's used. |
7 | * | 7 | * |
8 | * @version 0.7 | 8 | * @version 0.8 |
9 | * @date 2012-08-27 | 9 | * @date 2013-04-16 |
10 | * @author Keyvan Minoukadeh | 10 | * @author Keyvan Minoukadeh |
11 | * @copyright 2012 Keyvan Minoukadeh | 11 | * @copyright 2013 Keyvan Minoukadeh |
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | 12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 |
13 | */ | 13 | */ |
14 | 14 | ||
15 | class SiteConfig | 15 | class SiteConfig |
16 | { | 16 | { |
17 | // Use first matching element as title (0 or more xpath expressions) | 17 | // Use first matching element as title (0 or more xpath expressions) |
18 | public $title = array(); | 18 | public $title = array(); |
19 | 19 | ||
20 | // Use first matching element as body (0 or more xpath expressions) | 20 | // Use first matching element as body (0 or more xpath expressions) |
21 | public $body = array(); | 21 | public $body = array(); |
22 | 22 | ||
23 | // Use first matching element as author (0 or more xpath expressions) | 23 | // Use first matching element as author (0 or more xpath expressions) |
24 | public $author = array(); | 24 | public $author = array(); |
25 | 25 | ||
26 | // Use first matching element as date (0 or more xpath expressions) | 26 | // Use first matching element as date (0 or more xpath expressions) |
27 | public $date = array(); | 27 | public $date = array(); |
28 | 28 | ||
29 | // Strip elements matching these xpath expressions (0 or more) | 29 | // Strip elements matching these xpath expressions (0 or more) |
30 | public $strip = array(); | 30 | public $strip = array(); |
31 | 31 | ||
32 | // Strip elements which contain these strings (0 or more) in the id or class attribute | 32 | // Strip elements which contain these strings (0 or more) in the id or class attribute |
33 | public $strip_id_or_class = array(); | 33 | public $strip_id_or_class = array(); |
34 | 34 | ||
35 | // Strip images which contain these strings (0 or more) in the src attribute | 35 | // Strip images which contain these strings (0 or more) in the src attribute |
36 | public $strip_image_src = array(); | 36 | public $strip_image_src = array(); |
37 | 37 | ||
38 | // Additional HTTP headers to send | 38 | // Additional HTTP headers to send |
39 | // NOT YET USED | 39 | // NOT YET USED |
40 | public $http_header = array(); | 40 | public $http_header = array(); |
41 | 41 | ||
42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) | 42 | // Process HTML with tidy before creating DOM (bool or null if undeclared) |
43 | public $tidy = null; | 43 | public $tidy = null; |
44 | 44 | ||
45 | protected $default_tidy = true; // used if undeclared | 45 | protected $default_tidy = true; // used if undeclared |
46 | 46 | ||
47 | // Autodetect title/body if xpath expressions fail to produce results. | 47 | // Autodetect title/body if xpath expressions fail to produce results. |
48 | // Note that this applies to title and body separately, ie. | 48 | // Note that this applies to title and body separately, ie. |
49 | // * if we get a body match but no title match, this option will determine whether we autodetect title | 49 | // * if we get a body match but no title match, this option will determine whether we autodetect title |
50 | // * if neither match, this determines whether we autodetect title and body. | 50 | // * if neither match, this determines whether we autodetect title and body. |
51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. | 51 | // Also note that this only applies when there is at least one xpath expression in title or body, ie. |
52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) | 52 | // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected) |
53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. | 53 | // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results. |
54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). | 54 | // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content). |
55 | // bool or null if undeclared | 55 | // bool or null if undeclared |
56 | public $autodetect_on_failure = null; | 56 | public $autodetect_on_failure = null; |
57 | protected $default_autodetect_on_failure = true; // used if undeclared | 57 | protected $default_autodetect_on_failure = true; // used if undeclared |
58 | 58 | ||
59 | // Clean up content block - attempt to remove elements that appear to be superfluous | 59 | // Clean up content block - attempt to remove elements that appear to be superfluous |
60 | // bool or null if undeclared | 60 | // bool or null if undeclared |
61 | public $prune = null; | 61 | public $prune = null; |
62 | protected $default_prune = true; // used if undeclared | 62 | protected $default_prune = true; // used if undeclared |
63 | 63 | ||
64 | // Test URL - if present, can be used to test the config above | 64 | // Test URL - if present, can be used to test the config above |
65 | public $test_url = array(); | 65 | public $test_url = array(); |
66 | 66 | ||
67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article | 67 | // Single-page link - should identify a link element or URL pointing to the page holding the entire article |
68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to | 68 | // This is useful for sites which split their articles across multiple pages. Links to such pages tend to |
69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page | 69 | // display the first page with links to the other pages at the bottom. Often there is also a link to a page |
70 | // which displays the entire article on one page (e.g. 'print view'). | 70 | // which displays the entire article on one page (e.g. 'print view'). |
71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, | 71 | // This should be an XPath expression identifying the link to that page. If present and we find a match, |
72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. | 72 | // we will retrieve that page and the rest of the options in this config will be applied to the new page. |
73 | public $single_page_link = array(); | 73 | public $single_page_link = array(); |
74 | 74 | ||
75 | public $next_page_link = array(); | 75 | public $next_page_link = array(); |
76 | 76 | ||
77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed | 77 | // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed |
78 | public $single_page_link_in_feed = array(); | 78 | public $single_page_link_in_feed = array(); |
79 | 79 | ||
80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') | 80 | // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib') |
81 | // string or null if undeclared | 81 | // string or null if undeclared |
82 | public $parser = null; | 82 | public $parser = null; |
83 | protected $default_parser = 'libxml'; // used if undeclared | 83 | protected $default_parser = 'libxml'; // used if undeclared |
84 | 84 | ||
85 | // Strings to search for in HTML before processing begins (used with $replace_string) | 85 | // Strings to search for in HTML before processing begins (used with $replace_string) |
86 | public $find_string = array(); | 86 | public $find_string = array(); |
87 | // Strings to replace those found in $find_string before HTML processing begins | 87 | // Strings to replace those found in $find_string before HTML processing begins |
88 | public $replace_string = array(); | 88 | public $replace_string = array(); |
89 | 89 | ||
90 | // the options below cannot be set in the config files which this class represents | 90 | // the options below cannot be set in the config files which this class represents |
91 | 91 | ||
92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not | 92 | //public $cache_in_apc = false; // used to decide if we should cache in apc or not |
93 | public $cache_key = null; | 93 | public $cache_key = null; |
94 | public static $debug = false; | 94 | public static $debug = false; |
95 | protected static $apc = false; | 95 | protected static $apc = false; |
96 | protected static $config_path; | 96 | protected static $config_path; |
97 | protected static $config_path_fallback; | 97 | protected static $config_path_fallback; |
98 | protected static $config_cache = array(); | 98 | protected static $config_cache = array(); |
99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; | 99 | const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; |
100 | 100 | ||
101 | protected static function debug($msg) { | 101 | protected static function debug($msg) { |
102 | if (self::$debug) { | 102 | if (self::$debug) { |
103 | //$mem = round(memory_get_usage()/1024, 2); | 103 | //$mem = round(memory_get_usage()/1024, 2); |
104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); | 104 | //$memPeak = round(memory_get_peak_usage()/1024, 2); |
105 | echo '* ',$msg; | 105 | echo '* ',$msg; |
106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; | 106 | //echo ' - mem used: ',$mem," (peak: $memPeak)\n"; |
107 | echo "\n"; | 107 | echo "\n"; |
108 | ob_flush(); | 108 | ob_flush(); |
109 | flush(); | 109 | flush(); |
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
113 | // enable APC caching of certain site config files? | 113 | // enable APC caching of certain site config files? |
114 | // If enabled the following site config files will be | 114 | // If enabled the following site config files will be |
115 | // cached in APC cache (when requested for first time): | 115 | // cached in APC cache (when requested for first time): |
116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ | 116 | // * anything in site_config/custom/ and its corresponding file in site_config/standard/ |
117 | // * the site config files associated with HTML fingerprints | 117 | // * the site config files associated with HTML fingerprints |
118 | // * the global site config file | 118 | // * the global site config file |
119 | // returns true if enabled, false otherwise | 119 | // returns true if enabled, false otherwise |
120 | public static function use_apc($apc=true) { | 120 | public static function use_apc($apc=true) { |
121 | if (!function_exists('apc_add')) { | 121 | if (!function_exists('apc_add')) { |
122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); | 122 | if ($apc) self::debug('APC will not be used (function apc_add does not exist)'); |
123 | return false; | 123 | return false; |
124 | } | 124 | } |
125 | self::$apc = $apc; | 125 | self::$apc = $apc; |
126 | return $apc; | 126 | return $apc; |
127 | } | 127 | } |
128 | 128 | ||
129 | // return bool or null | 129 | // return bool or null |
130 | public function tidy($use_default=true) { | 130 | public function tidy($use_default=true) { |
131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; | 131 | if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy; |
132 | return $this->tidy; | 132 | return $this->tidy; |
133 | } | 133 | } |
134 | 134 | ||
135 | // return bool or null | 135 | // return bool or null |
136 | public function prune($use_default=true) { | 136 | public function prune($use_default=true) { |
137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; | 137 | if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; |
138 | return $this->prune; | 138 | return $this->prune; |
139 | } | 139 | } |
140 | 140 | ||
141 | // return string or null | 141 | // return string or null |
142 | public function parser($use_default=true) { | 142 | public function parser($use_default=true) { |
143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; | 143 | if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; |
144 | return $this->parser; | 144 | return $this->parser; |
145 | } | 145 | } |
146 | 146 | ||
147 | // return bool or null | 147 | // return bool or null |
148 | public function autodetect_on_failure($use_default=true) { | 148 | public function autodetect_on_failure($use_default=true) { |
149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; | 149 | if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; |
150 | return $this->autodetect_on_failure; | 150 | return $this->autodetect_on_failure; |
151 | } | 151 | } |
152 | 152 | ||
153 | public static function set_config_path($path, $fallback=null) { | 153 | public static function set_config_path($path, $fallback=null) { |
154 | self::$config_path = $path; | 154 | self::$config_path = $path; |
155 | self::$config_path_fallback = $fallback; | 155 | self::$config_path_fallback = $fallback; |
156 | } | 156 | } |
157 | 157 | ||
158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { | 158 | public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { |
159 | $key = strtolower($key); | 159 | $key = strtolower($key); |
160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 160 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
161 | if ($config->cache_key) $key = $config->cache_key; | 161 | if ($config->cache_key) $key = $config->cache_key; |
162 | self::$config_cache[$key] = $config; | 162 | self::$config_cache[$key] = $config; |
163 | if (self::$apc && $use_apc) { | 163 | if (self::$apc && $use_apc) { |
164 | self::debug("Adding site config to APC cache with key sc.$key"); | 164 | self::debug("Adding site config to APC cache with key sc.$key"); |
165 | apc_add("sc.$key", $config); | 165 | apc_add("sc.$key", $config); |
166 | } | 166 | } |
167 | self::debug("Cached site config with key $key"); | 167 | self::debug("Cached site config with key $key"); |
168 | } | 168 | } |
169 | 169 | ||
170 | public static function is_cached($key) { | 170 | public static function is_cached($key) { |
171 | $key = strtolower($key); | 171 | $key = strtolower($key); |
172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); | 172 | if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); |
173 | if (array_key_exists($key, self::$config_cache)) { | 173 | if (array_key_exists($key, self::$config_cache)) { |
174 | return true; | 174 | return true; |
175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { | 175 | } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { |
176 | return true; | 176 | return true; |
177 | } | 177 | } |
178 | return false; | 178 | return false; |
179 | } | 179 | } |
180 | 180 | ||
181 | public function append(SiteConfig $newconfig) { | 181 | public function append(SiteConfig $newconfig) { |
182 | // check for commands where we accept multiple statements (no test_url) | 182 | // check for commands where we accept multiple statements (no test_url) |
183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { | 183 | foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { |
184 | // append array elements for this config variable from $newconfig to this config | 184 | // append array elements for this config variable from $newconfig to this config |
185 | //$this->$var = $this->$var + $newconfig->$var; | 185 | //$this->$var = $this->$var + $newconfig->$var; |
186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); | 186 | $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); |
187 | } | 187 | } |
188 | // check for single statement commands | 188 | // check for single statement commands |
189 | // we do not overwrite existing non null values | 189 | // we do not overwrite existing non null values |
190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { | 190 | foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { |
191 | if ($this->$var === null) $this->$var = $newconfig->$var; | 191 | if ($this->$var === null) $this->$var = $newconfig->$var; |
192 | } | 192 | } |
193 | } | 193 | // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) |
194 | 194 | foreach (array('find_string', 'replace_string') as $var) { | |
195 | // returns SiteConfig instance if an appropriate one is found, false otherwise | 195 | // append array elements for this config variable from $newconfig to this config |
196 | // if $exact_host_match is true, we will not look for wildcard config matches | 196 | //$this->$var = $this->$var + $newconfig->$var; |
197 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists | 197 | $this->$var = array_merge($this->$var, $newconfig->$var); |
198 | public static function build($host, $exact_host_match=false) { | 198 | } |
199 | $host = strtolower($host); | 199 | } |
200 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); | 200 | |
201 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; | 201 | // returns SiteConfig instance if an appropriate one is found, false otherwise |
202 | // check for site configuration | 202 | // if $exact_host_match is true, we will not look for wildcard config matches |
203 | $try = array($host); | 203 | // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists |
204 | // should we look for wildcard matches | 204 | public static function build($host, $exact_host_match=false) { |
205 | if (!$exact_host_match) { | 205 | $host = strtolower($host); |
206 | $split = explode('.', $host); | 206 | if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); |
207 | if (count($split) > 1) { | 207 | if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; |
208 | array_shift($split); | 208 | // check for site configuration |
209 | $try[] = '.'.implode('.', $split); | 209 | $try = array($host); |
210 | } | 210 | // should we look for wildcard matches |
211 | } | 211 | if (!$exact_host_match) { |
212 | 212 | $split = explode('.', $host); | |
213 | // look for site config file in primary folder | 213 | if (count($split) > 1) { |
214 | self::debug(". looking for site config for $host in primary folder"); | 214 | array_shift($split); |
215 | foreach ($try as $h) { | 215 | $try[] = '.'.implode('.', $split); |
216 | if (array_key_exists($h, self::$config_cache)) { | 216 | } |
217 | self::debug("... site config for $h already loaded in this request"); | 217 | } |
218 | return self::$config_cache[$h]; | 218 | |
219 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { | 219 | // look for site config file in primary folder |
220 | self::debug("... site config for $h in APC cache"); | 220 | self::debug(". looking for site config for $host in primary folder"); |
221 | return $sconfig; | 221 | foreach ($try as $h) { |
222 | } elseif (file_exists(self::$config_path."/$h.txt")) { | 222 | if (array_key_exists($h, self::$config_cache)) { |
223 | self::debug("... found site config ($h.txt)"); | 223 | self::debug("... site config for $h already loaded in this request"); |
224 | $file_primary = self::$config_path."/$h.txt"; | 224 | return self::$config_cache[$h]; |
225 | $matched_name = $h; | 225 | } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { |
226 | break; | 226 | self::debug("... site config for $h in APC cache"); |
227 | } | 227 | return $sconfig; |
228 | } | 228 | } elseif (file_exists(self::$config_path."/$h.txt")) { |
229 | 229 | self::debug("... found site config ($h.txt)"); | |
230 | // if we found site config, process it | 230 | $file_primary = self::$config_path."/$h.txt"; |
231 | if (isset($file_primary)) { | 231 | $matched_name = $h; |
232 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 232 | break; |
233 | if (!$config_lines || !is_array($config_lines)) return false; | 233 | } |
234 | $config = self::build_from_array($config_lines); | 234 | } |
235 | // if APC caching is available and enabled, mark this for cache | 235 | |
236 | //$config->cache_in_apc = true; | 236 | // if we found site config, process it |
237 | $config->cache_key = $matched_name; | 237 | if (isset($file_primary)) { |
238 | 238 | $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | |
239 | // if autodetec on failure is off (on by default) we do not need to look | 239 | if (!$config_lines || !is_array($config_lines)) return false; |
240 | // in secondary folder | 240 | $config = self::build_from_array($config_lines); |
241 | if (!$config->autodetect_on_failure()) { | 241 | // if APC caching is available and enabled, mark this for cache |
242 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); | 242 | //$config->cache_in_apc = true; |
243 | return $config; | 243 | $config->cache_key = $matched_name; |
244 | } | 244 | |
245 | } | 245 | // if autodetec on failure is off (on by default) we do not need to look |
246 | 246 | // in secondary folder | |
247 | // look for site config file in secondary folder | 247 | if (!$config->autodetect_on_failure()) { |
248 | if (isset(self::$config_path_fallback)) { | 248 | self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); |
249 | self::debug(". looking for site config for $host in secondary folder"); | 249 | return $config; |
250 | foreach ($try as $h) { | 250 | } |
251 | if (file_exists(self::$config_path_fallback."/$h.txt")) { | 251 | } |
252 | self::debug("... found site config in secondary folder ($h.txt)"); | 252 | |
253 | $file_secondary = self::$config_path_fallback."/$h.txt"; | 253 | // look for site config file in secondary folder |
254 | $matched_name = $h; | 254 | if (isset(self::$config_path_fallback)) { |
255 | break; | 255 | self::debug(". looking for site config for $host in secondary folder"); |
256 | } | 256 | foreach ($try as $h) { |
257 | } | 257 | if (file_exists(self::$config_path_fallback."/$h.txt")) { |
258 | if (!isset($file_secondary)) { | 258 | self::debug("... found site config in secondary folder ($h.txt)"); |
259 | self::debug("... no site config match in secondary folder"); | 259 | $file_secondary = self::$config_path_fallback."/$h.txt"; |
260 | } | 260 | $matched_name = $h; |
261 | } | 261 | break; |
262 | 262 | } | |
263 | // return false if no config file found | 263 | } |
264 | if (!isset($file_primary) && !isset($file_secondary)) { | 264 | if (!isset($file_secondary)) { |
265 | self::debug("... no site config match for $host"); | 265 | self::debug("... no site config match in secondary folder"); |
266 | return false; | 266 | } |
267 | } | 267 | } |
268 | 268 | ||
269 | // return primary config if secondary not found | 269 | // return false if no config file found |
270 | if (!isset($file_secondary) && isset($config)) { | 270 | if (!isset($file_primary) && !isset($file_secondary)) { |
271 | return $config; | 271 | self::debug("... no site config match for $host"); |
272 | } | 272 | return false; |
273 | 273 | } | |
274 | // process secondary config file | 274 | |
275 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); | 275 | // return primary config if secondary not found |
276 | if (!$config_lines || !is_array($config_lines)) { | 276 | if (!isset($file_secondary) && isset($config)) { |
277 | // failed to process secondary | 277 | return $config; |
278 | if (isset($config)) { | 278 | } |
279 | // return primary config | 279 | |
280 | return $config; | 280 | // process secondary config file |
281 | } else { | 281 | $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); |
282 | return false; | 282 | if (!$config_lines || !is_array($config_lines)) { |
283 | } | 283 | // failed to process secondary |
284 | } | 284 | if (isset($config)) { |
285 | 285 | // return primary config | |
286 | // merge with primary and return | 286 | return $config; |
287 | if (isset($config)) { | 287 | } else { |
288 | self::debug('. merging config files'); | 288 | return false; |
289 | $config->append(self::build_from_array($config_lines)); | 289 | } |
290 | return $config; | 290 | } |
291 | } else { | 291 | |
292 | // return just secondary | 292 | // merge with primary and return |
293 | $config = self::build_from_array($config_lines); | 293 | if (isset($config)) { |
294 | // if APC caching is available and enabled, mark this for cache | 294 | self::debug('. merging config files'); |
295 | //$config->cache_in_apc = true; | 295 | $config->append(self::build_from_array($config_lines)); |
296 | $config->cache_key = $matched_name; | 296 | return $config; |
297 | return $config; | 297 | } else { |
298 | } | 298 | // return just secondary |
299 | } | 299 | $config = self::build_from_array($config_lines); |
300 | 300 | // if APC caching is available and enabled, mark this for cache | |
301 | public static function build_from_array(array $lines) { | 301 | //$config->cache_in_apc = true; |
302 | $config = new SiteConfig(); | 302 | $config->cache_key = $matched_name; |
303 | foreach ($lines as $line) { | 303 | return $config; |
304 | $line = trim($line); | 304 | } |
305 | 305 | } | |
306 | // skip comments, empty lines | 306 | |
307 | if ($line == '' || $line[0] == '#') continue; | 307 | public static function build_from_array(array $lines) { |
308 | 308 | $config = new SiteConfig(); | |
309 | // get command | 309 | foreach ($lines as $line) { |
310 | $command = explode(':', $line, 2); | 310 | $line = trim($line); |
311 | // if there's no colon ':', skip this line | 311 | |
312 | if (count($command) != 2) continue; | 312 | // skip comments, empty lines |
313 | $val = trim($command[1]); | 313 | if ($line == '' || $line[0] == '#') continue; |
314 | $command = trim($command[0]); | 314 | |
315 | if ($command == '' || $val == '') continue; | 315 | // get command |
316 | 316 | $command = explode(':', $line, 2); | |
317 | // check for commands where we accept multiple statements | 317 | // if there's no colon ':', skip this line |
318 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { | 318 | if (count($command) != 2) continue; |
319 | array_push($config->$command, $val); | 319 | $val = trim($command[1]); |
320 | // check for single statement commands that evaluate to true or false | 320 | $command = trim($command[0]); |
321 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { | 321 | if ($command == '' || $val == '') continue; |
322 | $config->$command = ($val == 'yes'); | 322 | |
323 | // check for single statement commands stored as strings | 323 | // check for commands where we accept multiple statements |
324 | } elseif (in_array($command, array('parser'))) { | 324 | if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { |
325 | $config->$command = $val; | 325 | array_push($config->$command, $val); |
326 | // check for replace_string(find): replace | 326 | // check for single statement commands that evaluate to true or false |
327 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { | 327 | } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { |
328 | if (in_array($match[1], array('replace_string'))) { | 328 | $config->$command = ($val == 'yes'); |
329 | $command = $match[1]; | 329 | // check for single statement commands stored as strings |
330 | array_push($config->find_string, $match[2]); | 330 | } elseif (in_array($command, array('parser'))) { |
331 | array_push($config->$command, $val); | 331 | $config->$command = $val; |
332 | } | 332 | // check for replace_string(find): replace |
333 | } | 333 | } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { |
334 | } | 334 | if (in_array($match[1], array('replace_string'))) { |
335 | return $config; | 335 | $command = $match[1]; |
336 | } | 336 | array_push($config->find_string, $match[2]); |
337 | } | 337 | array_push($config->$command, $val); |
338 | ?> \ No newline at end of file | 338 | } |
339 | } | ||
340 | } | ||
341 | return $config; | ||
342 | } | ||
343 | } \ No newline at end of file | ||