diff options
author | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-25 12:12:53 -0700 |
---|---|---|
committer | Nicolas Lœuillet <nicolas.loeuillet@gmail.com> | 2013-08-25 12:12:53 -0700 |
commit | c51be6b697da573cdcf0788eb8617130ce5517a4 (patch) | |
tree | 642eaf70afb134dee5f274c84bf15b8aab00c117 /inc/3rdparty/content-extractor/SiteConfig.php | |
parent | 7ba37bd91a43321196e6d867caf9e298e82c6d6c (diff) | |
parent | 063fc1a7baaf6f7e1fb08eced058962a6140a471 (diff) | |
download | wallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.tar.gz wallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.tar.zst wallabag-c51be6b697da573cdcf0788eb8617130ce5517a4.zip |
Merge pull request #181 from inthepoche/dev
beta4
Diffstat (limited to 'inc/3rdparty/content-extractor/SiteConfig.php')
-rw-r--r-- | inc/3rdparty/content-extractor/SiteConfig.php | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/inc/3rdparty/content-extractor/SiteConfig.php b/inc/3rdparty/content-extractor/SiteConfig.php new file mode 100644 index 00000000..089e10c6 --- /dev/null +++ b/inc/3rdparty/content-extractor/SiteConfig.php | |||
@@ -0,0 +1,184 @@ | |||
1 | <?php | ||
2 | /** | ||
3 | * Site Config | ||
4 | * | ||
5 | * Each instance of this class should hold extraction patterns and other directives | ||
6 | * for a website. See ContentExtractor class to see how it's used. | ||
7 | * | ||
8 | * @version 0.6 | ||
9 | * @date 2011-10-30 | ||
10 | * @author Keyvan Minoukadeh | ||
11 | * @copyright 2011 Keyvan Minoukadeh | ||
12 | * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 | ||
13 | */ | ||
14 | |||
class SiteConfig
{
	// Use first matching element as title (0 or more xpath expressions)
	public $title = array();

	// Use first matching element as body (0 or more xpath expressions)
	public $body = array();

	// Use first matching element as author (0 or more xpath expressions)
	public $author = array();

	// Use first matching element as date (0 or more xpath expressions)
	public $date = array();

	// Strip elements matching these xpath expressions (0 or more)
	public $strip = array();

	// Strip elements which contain these strings (0 or more) in the id or class attribute
	public $strip_id_or_class = array();

	// Strip images which contain these strings (0 or more) in the src attribute
	public $strip_image_src = array();

	// Additional HTTP headers to send
	// NOT YET USED
	public $http_header = array();

	// Process HTML with tidy before creating DOM
	public $tidy = true;

	// Autodetect title/body if xpath expressions fail to produce results.
	// Note that this applies to title and body separately, ie.
	//   * if we get a body match but no title match, this option will determine whether we autodetect title
	//   * if neither match, this determines whether we autodetect title and body.
	// Also note that this only applies when there is at least one xpath expression in title or body, ie.
	//   * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
	//   * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
	// Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
	public $autodetect_on_failure = true;

	// Clean up content block - attempt to remove elements that appear to be superfluous
	public $prune = true;

	// Test URL - if present, can be used to test the config above
	public $test_url = null;

	// Single-page link - should identify a link element or URL pointing to the page holding the entire article
	// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
	// display the first page with links to the other pages at the bottom. Often there is also a link to a page
	// which displays the entire article on one page (e.g. 'print view').
	// This should be an XPath expression identifying the link to that page. If present and we find a match,
	// we will retrieve that page and the rest of the options in this config will be applied to the new page.
	public $single_page_link = array();

	// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
	public $single_page_link_in_feed = array();

	// TODO: which parser to use for turning raw HTML into a DOMDocument
	public $parser = 'libxml';

	// String replacement to be made on HTML before processing begins.
	// Each entry is a two-element array: array(search, replacement).
	public $replace_string = array();

	// the options below cannot be set in the config files which this class represents

	// When true, debug() echoes progress messages.
	public static $debug = false;
	// Primary directory searched for "<host>.txt" config files.
	protected static $config_path;
	// Optional secondary directory searched when the primary has no match.
	protected static $config_path_fallback;
	// Host (lower-cased, without leading "www.") => SiteConfig instance.
	protected static $config_cache = array();
	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';

	/**
	 * Echo a debug message followed by current/peak memory usage (in KB).
	 * Does nothing unless self::$debug is true.
	 *
	 * @param string $msg
	 */
	protected static function debug($msg) {
		if (!self::$debug) return;
		$mem = round(memory_get_usage()/1024, 2);
		$memPeak = round(memory_get_peak_usage()/1024, 2);
		echo '* ',$msg;
		echo ' - mem used: ',$mem," (peak: $memPeak)\n";
		// ob_flush() raises a notice when no output buffer is active, so only
		// call it when there is actually a buffer to flush.
		if (ob_get_level() > 0) ob_flush();
		flush();
	}

	/**
	 * Canonical form of a hostname as used for cache keys and file lookups:
	 * lower-cased, with a single leading "www." removed.
	 *
	 * @param string $host
	 * @return string
	 */
	protected static function normalise_host($host) {
		$host = strtolower($host);
		if (substr($host, 0, 4) === 'www.') $host = substr($host, 4);
		return $host;
	}

	/**
	 * Set the directories searched by build().
	 *
	 * @param string $path primary config directory
	 * @param string|null $fallback optional directory tried when the primary has no match
	 */
	public static function set_config_path($path, $fallback=null) {
		self::$config_path = $path;
		self::$config_path_fallback = $fallback;
	}

	/**
	 * Store a config in the in-memory cache consulted by build().
	 *
	 * The host is normalised the same way build() normalises its argument
	 * (lower-case, leading "www." stripped); otherwise an entry added for
	 * "www.example.com" could never be found again.
	 *
	 * @param string $host
	 * @param SiteConfig $config
	 */
	public static function add_to_cache($host, SiteConfig $config) {
		self::$config_cache[self::normalise_host($host)] = $config;
	}

	/**
	 * Find and load the site config for a hostname.
	 *
	 * Candidates are tried in order: the exact host, then (for hosts with more
	 * than one label) a wildcard of the form ".parent.tld". For each candidate
	 * the cache is checked first, then "<candidate>.txt" in the primary config
	 * directory. If nothing is found and a fallback directory is set, the same
	 * candidates are tried there.
	 *
	 * @param string $host
	 * @return SiteConfig|false SiteConfig instance if an appropriate one is found, false otherwise
	 */
	public static function build($host) {
		$host = self::normalise_host($host);
		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;

		// candidate names: exact host, then wildcard for the parent domain
		$try = array($host);
		$split = explode('.', $host);
		if (count($split) > 1) {
			array_shift($split);
			$try[] = '.'.implode('.', $split);
		}

		$file = null;
		foreach ($try as $h) {
			if (array_key_exists($h, self::$config_cache)) {
				self::debug("... cached ($h)");
				return self::$config_cache[$h];
			}
			// Without a configured path the lookup would probe "/$h.txt" at the
			// filesystem root, so skip the file check in that case.
			if (isset(self::$config_path) && file_exists(self::$config_path."/$h.txt")) {
				self::debug("... from file ($h)");
				$file = self::$config_path."/$h.txt";
				break;
			}
		}

		if ($file === null) {
			if (!isset(self::$config_path_fallback)) {
				self::debug("... no match ($host)");
				return false;
			}
			self::debug("... trying fallback ($host)");
			foreach ($try as $h) {
				if (file_exists(self::$config_path_fallback."/$h.txt")) {
					self::debug("... from fallback file ($h)");
					$file = self::$config_path_fallback."/$h.txt";
					break;
				}
			}
			if ($file === null) {
				self::debug("... no match in fallback directory");
				return false;
			}
		}

		// An unreadable file and a file with no non-empty lines both yield false.
		$config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
		if (!$config_file || !is_array($config_file)) return false;

		// directives that may appear multiple times; values are appended
		$multi = array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'http_header');
		// directives holding a yes/no flag (anything other than "yes" is false)
		$boolean = array('tidy', 'prune', 'autodetect_on_failure');
		// directives holding a single string
		$string = array('test_url', 'parser');

		$config = new SiteConfig();
		foreach ($config_file as $line) {
			$line = trim($line);

			// skip comments, empty lines
			if ($line === '' || $line[0] === '#') continue;

			// split "command: value" on the first colon; skip lines without one
			$parts = explode(':', $line, 2);
			if (count($parts) != 2) continue;
			$command = trim($parts[0]);
			$val = trim($parts[1]);
			if ($command === '' || $val === '') continue;

			if (in_array($command, $multi, true)) {
				array_push($config->$command, $val);
			} elseif (in_array($command, $boolean, true)) {
				$config->$command = ($val === 'yes');
			} elseif (in_array($command, $string, true)) {
				$config->$command = $val;
			} elseif ((substr($command, -1) === ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
				// parameterised directive, e.g. "replace_string(search): replacement"
				if (in_array($match[1], array('replace_string'), true)) {
					$command = $match[1];
					array_push($config->$command, array($match[2], $val));
				}
			}
		}
		return $config;
	}
}
184 | ?> \ No newline at end of file | ||