- * @param mixed $frontend frontend name (string) or Zend_Cache_Frontend_ object
- * @param mixed $backend backend name (string) or Zend_Cache_Backend_ object
- * @param array $frontendOptions associative array of options for the corresponding frontend constructor
- * @param array $backendOptions associative array of options for the corresponding backend constructor
- * @param boolean $customFrontendNaming if true, the frontend argument is used as a complete class name; if false, it is used as the end of a "Zend_Cache_Frontend_[...]" class name
- * @param boolean $customBackendNaming if true, the backend argument is used as a complete class name; if false, it is used as the end of a "Zend_Cache_Backend_[...]" class name
- * @param boolean $autoload if true, no require_once is issued for the backend and frontend classes (useful only for custom backends/frontends)
- * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
- * @license http://framework.zend.com/license/new-bsd New BSD License
- */
-class Zend_Cache_Core
-{
- /**
- * Messages
- */
- const BACKEND_NOT_SUPPORTS_TAG = 'tags are not supported by the current backend';
- const BACKEND_NOT_IMPLEMENTS_EXTENDED_IF = 'Current backend doesn\'t implement the Zend_Cache_Backend_ExtendedInterface, so this method is not available';
-
- /**
- * Backend Object
- *
- * @var Zend_Cache_Backend_Interface $_backend
- */
- protected $_backend = null;
-
- /**
- * Available options
- *
- * ====> (boolean) write_control :
- * - Enable / disable write control (the cache is read just after writing to detect corrupt entries)
- * - Enabling write control slightly slows cache writes but not cache reads
- * Write control can detect some corrupt cache files, but it is not a perfect check
- *
- * ====> (boolean) caching :
- * - Enable / disable caching
- * (can be very useful for debugging cached scripts)
- *
- * ====> (string) cache_id_prefix :
- * - prefix for cache ids (namespace)
- *
- * ====> (boolean) automatic_serialization :
- * - Enable / disable automatic serialization
- * - It can be used to directly save data which isn't a string (but it's slower)
- *
- * ====> (int) automatic_cleaning_factor :
- * - Disable / Tune the automatic cleaning process
- * - The automatic cleaning process destroys cache files that are too old (past their
- *   given lifetime) when a new cache file is written:
- *     0 => no automatic cache cleaning
- *     1 => systematic cache cleaning
- *     x (integer) > 1 => automatic cleaning runs randomly once every x cache writes
- *
- * ====> (int) lifetime :
- * - Cache lifetime (in seconds)
- * - If null, the cache is valid forever.
- *
- * ====> (boolean) logging :
- * - If set to true, logging is activated (but the system is slower)
- *
- * ====> (boolean) ignore_user_abort :
- * - If set to true, the core will set the ignore_user_abort PHP flag inside the
- *   save() method to avoid cache corruption in some cases (default false)
- *
- * @var array $_options available options
- */
- protected $_options = array(
- 'write_control' => true,
- 'caching' => true,
- 'cache_id_prefix' => null,
- 'automatic_serialization' => false,
- 'automatic_cleaning_factor' => 10,
- 'lifetime' => 3600,
- 'logging' => false,
- 'logger' => null,
- 'ignore_user_abort' => false
- );
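-
- /*
-  * Illustrative usage (a sketch, not code from this class): with the factory
-  * parameters documented at the top of this file, a Core frontend with a File
-  * backend could be created as below. The 'File' backend, its 'cache_dir'
-  * option, the cache id and the helper are assumptions for the example; the
-  * frontend option names map directly to the $_options defaults above.
-  *
-  *   $cache = Zend_Cache::factory('Core', 'File',
-  *       array('lifetime' => 7200, 'automatic_serialization' => true),
-  *       array('cache_dir' => '/tmp/')
-  *   );
-  *   if (($data = $cache->load('my_id')) === false) {
-  *       $data = buildExpensiveData(); // hypothetical helper
-  *       $cache->save($data, 'my_id');
-  *   }
-  */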
-
- /**
- * Array of options which have to be transferred to the backend
- // Use first matching element as title (0 or more xpath expressions)
- public $title = array();
-
- // Use first matching element as body (0 or more xpath expressions)
- public $body = array();
-
- // Use first matching element as author (0 or more xpath expressions)
- public $author = array();
-
- // Use first matching element as date (0 or more xpath expressions)
- public $date = array();
-
- // Strip elements matching these xpath expressions (0 or more)
- public $strip = array();
-
- // Strip elements which contain these strings (0 or more) in the id or class attribute
- public $strip_id_or_class = array();
-
- // Strip images which contain these strings (0 or more) in the src attribute
- public $strip_image_src = array();
-
- // Additional HTTP headers to send
- // NOT YET USED
- public $http_header = array();
-
- // Process HTML with tidy before creating DOM (bool or null if undeclared)
- public $tidy = null;
-
- protected $default_tidy = true; // used if undeclared
-
- // Autodetect title/body if xpath expressions fail to produce results.
- // Note that this applies to title and body separately, i.e.
- // * if we get a body match but no title match, this option will determine whether we autodetect title
- // * if neither match, this determines whether we autodetect title and body.
- // Also note that this only applies when there is at least one xpath expression in title or body, i.e.
- // * if title and body are both empty (no xpath expressions), this option has no effect (both title and body will be auto-detected)
- // * if there's an xpath expression for title and none for body, body will be auto-detected and this option will determine whether we auto-detect title if the xpath expression for it fails to produce results.
- // Usage scenario: you want to extract something specific from a set of URLs, e.g. a table, and if the table is not found, you want to ignore the entry completely. Auto-detection is unlikely to succeed here, so you construct your patterns and set this option to false. Another scenario may be a site where auto-detection has proven to fail (or worse, picked up the wrong content).
- // bool or null if undeclared
- public $autodetect_on_failure = null;
- protected $default_autodetect_on_failure = true; // used if undeclared
-
- // Clean up content block - attempt to remove elements that appear to be superfluous
- // bool or null if undeclared
- public $prune = null;
- protected $default_prune = true; // used if undeclared
-
- // Test URL - if present, can be used to test the config above
- public $test_url = array();
-
- // Single-page link - should identify a link element or URL pointing to the page holding the entire article
- // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
- // display the first page with links to the other pages at the bottom. Often there is also a link to a page
- // which displays the entire article on one page (e.g. 'print view').
- // This should be an XPath expression identifying the link to that page. If present and we find a match,
- // we will retrieve that page and the rest of the options in this config will be applied to the new page.
- public $single_page_link = array();
-
- public $next_page_link = array();
-
- // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
- public $single_page_link_in_feed = array();
-
- // Which parser to use for turning raw HTML into a DOMDocument (either 'libxml' or 'html5lib')
- // string or null if undeclared
- public $parser = null;
- protected $default_parser = 'libxml'; // used if undeclared
-
- // Strings to search for in HTML before processing begins (used with $replace_string)
- public $find_string = array();
- // Strings to replace those found in $find_string before HTML processing begins
- public $replace_string = array();
-
- // the options below cannot be set in the config files which this class represents
-
- //public $cache_in_apc = false; // used to decide if we should cache in apc or not
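-
- // Illustrative mapping (an assumption, not code from this class): each
- // "directive: value" line in a site config file is expected to be appended to
- // the matching array property above, e.g.
- //   title: //h1[@class='headline']   ->  $config->title[] = "//h1[@class='headline']";
- //   strip_id_or_class: related       ->  $config->strip_id_or_class[] = "related";
- // while single-value options such as tidy, prune and parser set the scalar properties.
-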
-// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
-//////////////////////////////////////////////
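-// Illustrative call (the variable names are placeholders):
-//   $html = convert_to_utf8($raw_html, $http_response_headers);
-// where the second argument may be the raw header string or an array of header lines.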
-function convert_to_utf8($html, $header=null)
-{
- $encoding = null;
- if ($html || $header) {
- if (is_array($header)) $header = implode("\n", $header);
- if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
- // error parsing the response
- debug('Could not find Content-Type header in HTTP response');
- } else {
- $match = end($match); // get last matched element (in case of redirects)
- if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
- }
- // TODO: check to see if encoding is supported (can we convert it?)
- // If it's not, result will be empty string.
- // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
- // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
- if (!$encoding || $encoding == 'none') {
- // search for encoding in HTML - only look at the first 50000 characters
- // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
- // TODO: improve this so it looks at smaller chunks first
- $html_head = substr($html, 0, 50000);
- if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
-[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If no rules are found, it tries to detect the content block automatically.
-
-This repository contains the site-specific extraction rules we rely on in Full-Text RSS.
-
-### Contributing changes
-
-We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for.
-
-We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
-
-You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
-
-> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
-
-When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
-
-If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
-
-### How to write a site config file
-
-The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool intended only to create a rule for extracting the correct content block.
-
-For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
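-
-As a rough illustration (the XPath values here are invented; only the directive names are taken from the files in this repository), a minimal site config file looks something like this:
-
-    title: //h1[@class='headline']
-    body: //div[@id='article-body']
-    date: //meta[@name="date"]/@content
-    strip: //div[@class='advert']
-    strip_id_or_class: related
-    test_url: http://example.com/some-article
-
-Each directive takes an XPath expression (or, for strip_id_or_class, a plain string matched against id and class attributes), and directives such as strip can be repeated.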
-
-### Instapaper
-
-When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
-
-Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
-
-> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
-
-Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold).
-
-### Testing site config files
-
-Currently you need a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.
-body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')]
-# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
-#This leaves some crud after the article, but it's better than nothing.
-#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
-# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.
-# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.
-# remove sidebars containing images (I assume this is desired for Instapaper)
-strip: //div[@id="related"]
-strip: //div[contains(@class, "image")]
-
-# note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that with some crap above and below. Thank goodness for that bookmarklet.
-# Since this element has class="clear", the Instapaper stylesheets (at least in this text parser preview) will render it unreadable, with a 1px font size and line height.
-# the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output.
-body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')]
-date: //meta[@name="date"]/@content
-
-strip_id_or_class: enlarge_measure
-strip_id_or_class: enlarge_html
-strip: //a[contains(@class, 'enlargeicon')]
-strip: //div[contains(@class, 'bookedition')]
-strip: //div[@class='textsize']
-strip: //ul[@class='genres']
-strip: //span[@class='bull']
-strip_id_or_class: secondary
-strip_id_or_class: con1col
-strip: //h3[@class='conheader']
-
-replace_string(<a name="more"> </a>): <!-- no more -->
-# Grab the author by finding the first profile pic, then backing up a node and getting the title of the <a> tag, which will hopefully be the author. Sorry, can't test this due to parser errors; thanks, Google :(
-# my section divs seem to interfere with the Instapaper parser, so I ditch 'em
-dissolve: //div[contains(@class, 'section')]
-
-#these don't seem to be necessary, but just in case
-strip_id_or_class: masthead
-strip_id_or_class: footer
-
-#again, Instapaper seems to understand where my content is, but just in case
-body: //div[@id='content']
-
-# in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing
-strip_id_or_class: screen-only
-strip_id_or_class: no-print
-
-#other misc removals and simplifications
-strip_id_or_class: popup
-strip_id_or_class: ZoomSpin
-
-#I have a lot of content in sidebars and "meta" asides that can work inline just fine but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes
-#Their date string is relative, so if you save the page 2 hours after it is posted, it may say 'two hours ago' instead of providing a useful date/time
-# 2012-03-17 [dkless@...] Cut metadata parts at the beginning and end of the content block; copyright entries for pictures removed; author fixed, not sure if the old entries are still valid (I left them); weird problems with some pages addressed (see the last section for removing a hidden section)
-#Removes the copyright statement, which often gets in the way as the first line of the news item
-strip: //p[@class="copyright"]
-strip: //div[@class="copyright"]
-#Removes pagination links at the end
-strip: //div[@class="pagination"]
-
-# Fix picture captions
-wrap_in(small): //p[@class="caption"]/text()
-
-# Fix sub-headlines
-wrap_in(h2): //p/strong
-dissolve: //h2/strong
-
-#Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and can even lead to problems. These sections are removed here.