From 3ec62cf95ab4436923d4c665fad7aef226cbb822 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Thu, 22 May 2014 17:16:38 +0300 Subject: update to 3.2 version of full-text-rss, issue #694 --- .../libraries/content-extractor/SiteConfig.php | 681 +++++++++++---------- 1 file changed, 343 insertions(+), 338 deletions(-) (limited to 'inc/3rdparty/libraries/content-extractor/SiteConfig.php') diff --git a/inc/3rdparty/libraries/content-extractor/SiteConfig.php b/inc/3rdparty/libraries/content-extractor/SiteConfig.php index c5e300d7..1f6a7603 100644 --- a/inc/3rdparty/libraries/content-extractor/SiteConfig.php +++ b/inc/3rdparty/libraries/content-extractor/SiteConfig.php @@ -1,338 +1,343 @@ -tidy)) ? $this->tidy : $this->default_tidy; - return $this->tidy; - } - - // return bool or null - public function prune($use_default=true) { - if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; - return $this->prune; - } - - // return string or null - public function parser($use_default=true) { - if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; - return $this->parser; - } - - // return bool or null - public function autodetect_on_failure($use_default=true) { - if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; - return $this->autodetect_on_failure; - } - - public static function set_config_path($path, $fallback=null) { - self::$config_path = $path; - self::$config_path_fallback = $fallback; - } - - public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { - $key = strtolower($key); - if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); - if ($config->cache_key) $key = $config->cache_key; - self::$config_cache[$key] = $config; - if (self::$apc && $use_apc) { - self::debug("Adding site config to APC cache with key sc.$key"); - apc_add("sc.$key", $config); - } - self::debug("Cached site config with key $key"); - } - - public static function is_cached($key) { - $key = strtolower($key); - if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); - if (array_key_exists($key, self::$config_cache)) { - return true; - } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { - return true; - } - return false; - } - - public function append(SiteConfig $newconfig) { - // check for commands where we accept multiple statements (no test_url) - foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { - // append array elements for this config variable from $newconfig to this config - //$this->$var = $this->$var + $newconfig->$var; - $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); - } - // check for single statement commands - // we do not overwrite existing non null values - foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { - if ($this->$var === null) $this->$var = $newconfig->$var; - } - } - - // returns SiteConfig instance if an appropriate one is found, false otherwise - // if $exact_host_match is true, we will not look for wildcard config matches - // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists - public static function build($host, $exact_host_match=false) { - $host = strtolower($host); - if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); - if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; - // check for site configuration - $try = array($host); - // should we look for wildcard matches - if (!$exact_host_match) { - $split = explode('.', $host); - if (count($split) > 1) { - array_shift($split); - $try[] = '.'.implode('.', $split); - } - } - - // look for site config file in primary folder - self::debug(". looking for site config for $host in primary folder"); - foreach ($try as $h) { - if (array_key_exists($h, self::$config_cache)) { - self::debug("... site config for $h already loaded in this request"); - return self::$config_cache[$h]; - } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { - self::debug("... site config for $h in APC cache"); - return $sconfig; - } elseif (file_exists(self::$config_path."/$h.txt")) { - self::debug("... found site config ($h.txt)"); - $file_primary = self::$config_path."/$h.txt"; - $matched_name = $h; - break; - } - } - - // if we found site config, process it - if (isset($file_primary)) { - $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) return false; - $config = self::build_from_array($config_lines); - // if APC caching is available and enabled, mark this for cache - //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - - // if autodetec on failure is off (on by default) we do not need to look - // in secondary folder - if (!$config->autodetect_on_failure()) { - self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); - return $config; - } - } - - // look for site config file in secondary folder - if (isset(self::$config_path_fallback)) { - self::debug(". looking for site config for $host in secondary folder"); - foreach ($try as $h) { - if (file_exists(self::$config_path_fallback."/$h.txt")) { - self::debug("... found site config in secondary folder ($h.txt)"); - $file_secondary = self::$config_path_fallback."/$h.txt"; - $matched_name = $h; - break; - } - } - if (!isset($file_secondary)) { - self::debug("... no site config match in secondary folder"); - } - } - - // return false if no config file found - if (!isset($file_primary) && !isset($file_secondary)) { - self::debug("... no site config match for $host"); - return false; - } - - // return primary config if secondary not found - if (!isset($file_secondary) && isset($config)) { - return $config; - } - - // process secondary config file - $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) { - // failed to process secondary - if (isset($config)) { - // return primary config - return $config; - } else { - return false; - } - } - - // merge with primary and return - if (isset($config)) { - self::debug('. merging config files'); - $config->append(self::build_from_array($config_lines)); - return $config; - } else { - // return just secondary - $config = self::build_from_array($config_lines); - // if APC caching is available and enabled, mark this for cache - //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - return $config; - } - } - - public static function build_from_array(array $lines) { - $config = new SiteConfig(); - foreach ($lines as $line) { - $line = trim($line); - - // skip comments, empty lines - if ($line == '' || $line[0] == '#') continue; - - // get command - $command = explode(':', $line, 2); - // if there's no colon ':', skip this line - if (count($command) != 2) continue; - $val = trim($command[1]); - $command = trim($command[0]); - if ($command == '' || $val == '') continue; - - // check for commands where we accept multiple statements - if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { - array_push($config->$command, $val); - // check for single statement commands that evaluate to true or false - } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { - $config->$command = ($val == 'yes'); - // check for single statement commands stored as strings - } elseif (in_array($command, array('parser'))) { - $config->$command = $val; - // check for replace_string(find): replace - } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { - if (in_array($match[1], array('replace_string'))) { - $command = $match[1]; - array_push($config->find_string, $match[2]); - array_push($config->$command, $val); - } - } - } - return $config; - } -} -?> \ No newline at end of file +tidy)) ? $this->tidy : $this->default_tidy; + return $this->tidy; + } + + // return bool or null + public function prune($use_default=true) { + if ($use_default) return (isset($this->prune)) ? $this->prune : $this->default_prune; + return $this->prune; + } + + // return string or null + public function parser($use_default=true) { + if ($use_default) return (isset($this->parser)) ? $this->parser : $this->default_parser; + return $this->parser; + } + + // return bool or null + public function autodetect_on_failure($use_default=true) { + if ($use_default) return (isset($this->autodetect_on_failure)) ? $this->autodetect_on_failure : $this->default_autodetect_on_failure; + return $this->autodetect_on_failure; + } + + public static function set_config_path($path, $fallback=null) { + self::$config_path = $path; + self::$config_path_fallback = $fallback; + } + + public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { + $key = strtolower($key); + if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); + if ($config->cache_key) $key = $config->cache_key; + self::$config_cache[$key] = $config; + if (self::$apc && $use_apc) { + self::debug("Adding site config to APC cache with key sc.$key"); + apc_add("sc.$key", $config); + } + self::debug("Cached site config with key $key"); + } + + public static function is_cached($key) { + $key = strtolower($key); + if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); + if (array_key_exists($key, self::$config_cache)) { + return true; + } elseif (self::$apc && (bool)apc_fetch("sc.$key")) { + return true; + } + return false; + } + + public function append(SiteConfig $newconfig) { + // check for commands where we accept multiple statements (no test_url) + foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { + // append array elements for this config variable from $newconfig to this config + //$this->$var = $this->$var + $newconfig->$var; + $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); + } + // check for single statement commands + // we do not overwrite existing non null values + foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { + if ($this->$var === null) $this->$var = $newconfig->$var; + } + // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) + foreach (array('find_string', 'replace_string') as $var) { + // append array elements for this config variable from $newconfig to this config + //$this->$var = $this->$var + $newconfig->$var; + $this->$var = array_merge($this->$var, $newconfig->$var); + } + } + + // returns SiteConfig instance if an appropriate one is found, false otherwise + // if $exact_host_match is true, we will not look for wildcard config matches + // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists + public static function build($host, $exact_host_match=false) { + $host = strtolower($host); + if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); + if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; + // check for site configuration + $try = array($host); + // should we look for wildcard matches + if (!$exact_host_match) { + $split = explode('.', $host); + if (count($split) > 1) { + array_shift($split); + $try[] = '.'.implode('.', $split); + } + } + + // look for site config file in primary folder + self::debug(". looking for site config for $host in primary folder"); + foreach ($try as $h) { + if (array_key_exists($h, self::$config_cache)) { + self::debug("... site config for $h already loaded in this request"); + return self::$config_cache[$h]; + } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) { + self::debug("... site config for $h in APC cache"); + return $sconfig; + } elseif (file_exists(self::$config_path."/$h.txt")) { + self::debug("... found site config ($h.txt)"); + $file_primary = self::$config_path."/$h.txt"; + $matched_name = $h; + break; + } + } + + // if we found site config, process it + if (isset($file_primary)) { + $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_lines || !is_array($config_lines)) return false; + $config = self::build_from_array($config_lines); + // if APC caching is available and enabled, mark this for cache + //$config->cache_in_apc = true; + $config->cache_key = $matched_name; + + // if autodetec on failure is off (on by default) we do not need to look + // in secondary folder + if (!$config->autodetect_on_failure()) { + self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); + return $config; + } + } + + // look for site config file in secondary folder + if (isset(self::$config_path_fallback)) { + self::debug(". looking for site config for $host in secondary folder"); + foreach ($try as $h) { + if (file_exists(self::$config_path_fallback."/$h.txt")) { + self::debug("... found site config in secondary folder ($h.txt)"); + $file_secondary = self::$config_path_fallback."/$h.txt"; + $matched_name = $h; + break; + } + } + if (!isset($file_secondary)) { + self::debug("... no site config match in secondary folder"); + } + } + + // return false if no config file found + if (!isset($file_primary) && !isset($file_secondary)) { + self::debug("... no site config match for $host"); + return false; + } + + // return primary config if secondary not found + if (!isset($file_secondary) && isset($config)) { + return $config; + } + + // process secondary config file + $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_lines || !is_array($config_lines)) { + // failed to process secondary + if (isset($config)) { + // return primary config + return $config; + } else { + return false; + } + } + + // merge with primary and return + if (isset($config)) { + self::debug('. merging config files'); + $config->append(self::build_from_array($config_lines)); + return $config; + } else { + // return just secondary + $config = self::build_from_array($config_lines); + // if APC caching is available and enabled, mark this for cache + //$config->cache_in_apc = true; + $config->cache_key = $matched_name; + return $config; + } + } + + public static function build_from_array(array $lines) { + $config = new SiteConfig(); + foreach ($lines as $line) { + $line = trim($line); + + // skip comments, empty lines + if ($line == '' || $line[0] == '#') continue; + + // get command + $command = explode(':', $line, 2); + // if there's no colon ':', skip this line + if (count($command) != 2) continue; + $val = trim($command[1]); + $command = trim($command[0]); + if ($command == '' || $val == '') continue; + + // check for commands where we accept multiple statements + if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { + array_push($config->$command, $val); + // check for single statement commands that evaluate to true or false + } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { + $config->$command = ($val == 'yes'); + // check for single statement commands stored as strings + } elseif (in_array($command, array('parser'))) { + $config->$command = $val; + // check for replace_string(find): replace + } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { + if (in_array($match[1], array('replace_string'))) { + $command = $match[1]; + array_push($config->find_string, $match[2]); + array_push($config->$command, $val); + } + } + } + return $config; + } +} \ No newline at end of file -- cgit v1.2.3