aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/config.php
blob: e618117b7190da886aa40731fba02896f4521664 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
<?php
/* Full-Text RSS config */

// ......IMPORTANT......................................
// .....................................................
// Please do not change this file (config.php) directly.
// Save a copy as custom_config.php and make your
// changes to that instead. It will automatically
// override anything in config.php. Because config.php
// always gets loaded anyway, you can simply specify
// options you'd like to override in custom_config.php.
// .....................................................

global $options;

// Create config object
if (!isset($options)) $options = new stdClass();

// Enable service
// ----------------------
// Set this to false if you want to disable the service.
// If set to false, no feed is produced and users will 
// be told that the service is disabled.
$options->enabled = true;

// Debug mode
// ----------------------
// Enable or disable debugging. When enabled debugging works by passing
// &debug to the makefulltextfeed.php querystring.
// Valid values:
// true or 'user' (default) - let user decide
// 'admin' - debug works only for logged in admin users
// false - disabled
$options->debug = true;

// Default entries (without access key)
// ----------------------
// The number of feed items to process when no API key is supplied
// and no &max=x value is supplied in the querystring.
$options->default_entries = 5;

// Max entries (without access key)
// ----------------------
// The maximum number of feed items to process when no access key is supplied.
// This limits the user-supplied &max=x value. For example, if the user
// asks for 20 items to be processed (&max=20), if max_entries is set to 
// 10, only 10 will be processed.
$options->max_entries = 10;

// Rewrite relative URLs
// ----------------------
// With this enabled relative URLs found in the extracted content
// block are automatically rewritten as absolute URLs.
$options->rewrite_relative_urls = true;

// Exclude items if extraction fails
// ---------------------------------
// Excludes items from the resulting feed
// if we cannot extract any content from the
// item URL.
// Possible values...
// Enable: true
// Disable: false (default)
// User decides: 'user' (this option will appear on the form)
$options->exclude_items_on_fail = 'user';

// Enable multi-page support
// -------------------------
// If enabled, we will try to follow next page links on multi-page articles.
// Currently this only happens for sites where next_page_link has been defined 
// in a site config file.
$options->multipage = true;

// Enable caching
// ----------------------
// Enable this if you'd like to cache results
// for 10 minutes. Cache files are written to disk (in cache/ subfolders
// - which must be writable).
// Initially it's best to keep this disabled to make sure everything works
// as expected. If you have APC enabled, please also see smart_cache in the
// advanced section.
$options->caching = false;

// Cache directory
// ----------------------
// Only used if caching is true
$options->cache_dir = dirname(__FILE__).'/cache';

// Message to prepend (without access key)
// ----------------------
// HTML to insert at the beginning of each feed item when no access key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_prepend = '';

// Message to append (without access key)
// ----------------------
// HTML to insert at the end of each feed item when no access key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_append = '';

// Error message when content extraction fails (without access key)
// ----------------------
$options->error_message = '[unable to retrieve full-text content]';

// Keep enclosure in feed items
// If enabled, we will try to preserve enclosures if present.
// ----------------------
$options->keep_enclosures = true;

// Detect language
// ---------------
// Should we try and find/guess the language of the article being processed?
// Values will be placed inside the <dc:language> element inside each <item> element
// Possible values:
// * Ignore language: 0
// * Use article/feed metadata (e.g. HTML lang attribute): 1 (default)
// * As above, but guess if not present: 2
// * Always guess: 3
// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &l=2)
$options->detect_language = 1;

// Registration key
// ---------------
// The registration key is optional. It is not required to use Full-Text RSS, 
// and does not affect the normal operation of Full-Text RSS. It is currently 
// only used on admin pages which help you update site patterns with the 
// latest version offered by FiveFilters.org. For these admin-related 
// tasks to complete, we will require a valid registration key.
// If you would like one, you can purchase the latest version of Full-Text RSS
// at http://fivefilters.org/content-only/
// Your registration key will automatically be sent in the confirmation email.
// Once you have it, simply copy and paste it here.
$options->registration_key = '';

/////////////////////////////////////////////////
/// RESTRICT ACCESS /////////////////////////////
/////////////////////////////////////////////////

// Admin credentials
// ----------------------
// Certain pages/actions, e.g. updating site patterns with our online tool, will require admin credentials.
// To use these pages, enter a password here and you'll be prompted for it when you try to access those pages.
// If no password or username is set, pages requiring admin privelages will be inaccessible. 
// The default username is 'admin'.
// If overriding with an environment variable, separate username and password with a colon, e.g.:
// ftr_admin_credentials: admin:my-secret-password
// Example: $options->admin_credentials = array('username'=>'admin', 'password'=>'my-secret-password');
$options->admin_credentials = array('username'=>'admin', 'password'=>'admin');

// URLs to allow
// ----------------------
// List of URLs (or parts of a URL) which the service will accept.
// If the list is empty, all URLs (except those specified in the blocked list below)
// will be permitted.
// Empty: array();
// Non-empty example: array('example.com', 'anothersite.org');
$options->allowed_urls = array();

// URLs to block
// ----------------------
// List of URLs (or parts of a URL) which the service will not accept.
// Note: this list is ignored if allowed_urls is not empty
$options->blocked_urls = array();

// Key holder(s) only?
// ----------------------
// Set this to true if you want to restrict access only to
// those with a key (see below to specify key(s)).
// If set to true, no feed is produced unless a valid
// key is provided.
$options->key_required = false;

// Favour item titles in feed
// ----------------------
// By default, when processing feeds, we assume item titles in the feed
// have not been truncated. So after processing web pages, the extracted titles
// are not used in the generated feed. If you prefer to have extracted titles in 
// the feed you can either set this to false, in which case we will always favour 
// extracted titles. Alternatively, if set to 'user' (default) we'll use the 
// extracted title if you pass '&use_extracted_title' in the querystring.
// Possible values:
// * Favour feed titles: true 
// * Favour extracted titles: false
// * Favour feed titles with user override: 'user' (default)
// Note: this has no effect when the input URL is to a web page - in these cases
// we always use the extracted title in the generated feed.
$options->favour_feed_titles = 'user';

// Access keys (password protected access)
// ------------------------------------
// NOTE: You do not need an API key from fivefilters.org to run your own 
// copy of the code. This is here if you'd like to restrict access to
// _your_ copy.
// Keys let you group users - those with a key and those without - and
// restrict access to the service to those without a key.
// If you want everyone to access the service in the same way, you can
// leave the array below empty and ignore the access key options further down.
// The options further down let you control how the service should behave 
// in each mode.
// Note: Explicitly including the index number (1 and 2 in the examples below) 
// is highly recommended (when generating feeds, we encode the key and 
// refer to it by index number and hash).
$options->api_keys = array();
// Example:
// $options->api_keys[1] = 'secret-key-1';
// $options->api_keys[2] = 'secret-key-2';

// Default entries (with access key)
// ----------------------
// The number of feed items to process when a valid access key is supplied.
$options->default_entries_with_key = 5;

// Max entries (with access key)
// ----------------------
// The maximum number of feed items to process when a valid access key is supplied.
$options->max_entries_with_key = 10;

/////////////////////////////////////////////////
/// ADVANCED OPTIONS ////////////////////////////
/////////////////////////////////////////////////

// Enable XSS filter?
// ----------------------
// We have not enabled this by default because we assume the majority of
// our users do not display the HTML retrieved by Full-Text RSS
// in a web page without further processing. If you subscribe to our generated
// feeds in your news reader application, it should, if it's good software, already
// filter the resulting HTML for XSS attacks, making it redundant for
// Full-Text RSS do the same. Similarly with frameworks/CMS which display
// feed content - the content should be treated like any other user-submitted content.
// 
// If you are writing an application yourself which is processing feeds generated by
// Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks
// or enable this option. This might be useful if you are processing our generated
// feeds with JavaScript on the client side - although there's client side xss
// filtering available too, e.g. https://code.google.com/p/google-caja/wiki/JsHtmlSanitizer
// 
// If enabled, we'll pass retrieved HTML content through htmLawed with
// safe flag on and style attributes denied, see
// http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawed_README.htm#s3.6
// Note: if enabled this will also remove certain elements you may want to preserve, such as iframes.
//
// Valid values:
// true - enabled, all content will be filtered
// 'user' (default) - user must pass &xss in makefulltextfeed.php querystring to enable
// false - disabled
$options->xss_filter = 'user';

// Allowed parsers
// ----------------------
// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
// While fast, on some sites it may not always produce good results. 
// For these sites, you can specify an alternative HTML parser: 
// parser: html5lib
// The html5lib parser is bundled with Full-Text RSS.
// see http://code.google.com/p/html5lib/
//
// To disable HTML parsing with html5lib, you can remove it from this list.
// By default we allow both: libxml and html5lib.
$options->allowed_parsers = array('libxml', 'html5lib');
//$options->allowed_parsers = array('libxml'); //disable html5lib - forcing libxml in all cases

// Enable Cross-Origin Resource Sharing (CORS)
// ----------------------
// If enabled we'll send the following HTTP header
// Access-Control-Allow-Origin: *
// see http://en.wikipedia.org/wiki/Cross-origin_resource_sharing
$options->cors = false;

// Use APC user cache?
// ----------------------
// If enabled we will store site config files (when requested 
// for the first time) in APC's user cache. Keys prefixed with 'sc.'
// This improves performance by reducing disk access.
// Note: this has no effect if APC is unavailable on your server.
$options->apc = true;

// Smart cache (experimental)
// ----------------------
// With this option enabled we will not cache to disk immediately.
// We will store the cache key in APC and if it's requested again
// we will cache results to disk. Keys prefixed with 'cache.'
// This improves performance by reducing disk access.
// Note: this has no effect if APC is disabled or unavailable on your server,
// or if you have caching disabled.
$options->smart_cache = true;

// Fingerprints
// ----------------------
// key is fingerprint (fragment to find in HTML)
// value is host name to use for site config lookup if fingerprint matches
$options->fingerprints = array(
	// Posterous
	'<meta name="generator" content="Posterous"' => array('hostname'=>'fingerprint.posterous.com', 'head'=>true),
	// Blogger
	'<meta content=\'blogger\' name=\'generator\'' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
	'<meta name="generator" content="Blogger"' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
	// WordPress (hosted)
	// '<meta name="generator" content="WordPress.com"' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true),
	// WordPress (self-hosted and hosted)
	'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
);

// User Agent strings - mapping domain names
// ----------------------
// e.g. $options->user_agents = array('example.org' => 'PHP/5.2');
$options->user_agents = array( 'lifehacker.com' => 'PHP/5.2',
							   'gawker.com' => 'PHP/5.2',
							   'deadspin.com' => 'PHP/5.2',
							   'kotaku.com' => 'PHP/5.2',
							   'jezebel.com' => 'PHP/5.2',
							   'io9.com' => 'PHP/5.2',
							   'jalopnik.com' => 'PHP/5.2',
							   'gizmodo.com' => 'PHP/5.2',
							   '.wikipedia.org' => 'Mozilla/5.2',
							   '.fok.nl' => 'Googlebot/2.1',
							   'getpocket.com' => 'PHP/5.2'
							  );

// URL Rewriting
// ----------------------
// Currently allows simple string replace of URLs.
// Useful for rewriting certain URLs to point to a single page
// or HTML view. Although using the single_page_link site config
// instruction is the preferred way to do this, sometimes, as
// with Google Docs URLs, it's not possible.
// Note: this might move to the site config file at some point.
$options->rewrite_url = array(
	// Rewrite public Google Docs URLs to point to HTML view:
	// if a URL contains docs.google.com, replace /Doc? with /View?
	'docs.google.com' => array('/Doc?' => '/View?'),
	'tnr.com' => array('tnr.com/article/' => 'tnr.com/print/article/'),
	'.m.wikipedia.org' => array('.m.wikipedia.org' => '.wikipedia.org'),
	'm.vanityfair.com' => array('m.vanityfair.com' => 'www.vanityfair.com')
);

// Content-Type exceptions
// -----------------------
// Here you can define different actions based
// on the Content-Type header returned by server.
// MIME type as key, action as value.
// Valid actions:
// * 'exclude' - exclude this item from the result
// * 'link' - create HTML link to the item
$options->content_type_exc = array( 
							   'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
							   'image' => array('action'=>'link', 'name'=>'Image'),
							   'audio' => array('action'=>'link', 'name'=>'Audio'),
							   'video' => array('action'=>'link', 'name'=>'Video')
							  );

// Cache directory level
// ----------------------
// Spread cache files over different directories (only used if caching is enabled).
// Used to prevent large number of files in one directory.
// This corresponds to Zend_Cache's hashed_directory_level
// see http://framework.zend.com/manual/en/zend.cache.backends.html
// It's best not to change this if you're unsure.
$options->cache_directory_level = 0;

// Cache cleanup
// -------------
// 0 = script will not clean cache (rename cachecleanup.php and use it for scheduled (e.g. cron) cache cleanup)
// 1 = clean cache everytime the script runs (not recommended)
// 100 = clean cache roughly once every 100 script runs
// x = clean cache roughly once every x script runs
// ...you get the idea :)
$options->cache_cleanup = 100;

/////////////////////////////////////////////////
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////

if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1');

if (basename(__FILE__) == 'config.php') {
	if (file_exists(dirname(__FILE__).'/custom_config.php')) {
		require_once dirname(__FILE__).'/custom_config.php';
	}
	
	// check for environment variables - often used on cloud platforms
	// environment variables should be prefixed with 'ftr_', e.g.
	// ftr_max_entries: 1
	// will set the max_entries value to 1.
	foreach ($options as $_key=>&$_val) {
		$_key = "ftr_$_key";
		if (($_env = getenv($_key)) !== false) {
			if (is_array($_val)) {
				if ($_key === 'ftr_admin_credentials') {
					$_val = array_combine(array('username', 'password'), array_map('trim', explode(':', $_env, 2)));
					if ($_val === false) $_val = array('username'=>'admin', 'password'=>'');
				}
			} elseif ($_env === 'true' || $_env === 'false') {
				$_val = ($_env === 'true');
			} elseif (is_numeric($_env)) {
				$_val = (int)$_env;
			} else { // string
				$_val = $_env;
			}
		}
	}
	unset($_key, $_val, $_env);
}