]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/makefulltextfeedHelpers.php
1c11b8f6b2345ea15adb93496ed97b2686a9cda6
3 // Autoloading of classes allows us to include files only when they're
4 // needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
5 function autoload ( $class_name ) {
7 if ( $dir === null ) $dir = dirname ( __FILE__
). '/libraries/' ;
8 static $mapping = array (
9 // Include FeedWriter for RSS/Atom creation
10 'FeedWriter' => 'feedwriter/FeedWriter.php' ,
11 'FeedItem' => 'feedwriter/FeedItem.php' ,
12 // Include ContentExtractor and Readability for identifying and extracting content from URLs
13 'ContentExtractor' => 'content-extractor/ContentExtractor.php' ,
14 'SiteConfig' => 'content-extractor/SiteConfig.php' ,
15 'Readability' => 'readability/Readability.php' ,
16 // Include Humble HTTP Agent to allow parallel requests and response caching
17 'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php' ,
18 'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php' ,
19 'CookieJar' => 'humble-http-agent/CookieJar.php' ,
20 // Include Zend Cache to improve performance (cache results)
21 'Zend_Cache' => 'Zend/Cache.php' ,
23 'Text_LanguageDetect' => 'language-detect/LanguageDetect.php' ,
25 'HTML5_Parser' => 'html5/Parser.php' ,
26 // htmLawed - used if XSS filter is enabled (xss_filter)
27 'htmLawed' => 'htmLawed/htmLawed.php'
29 if ( isset ( $mapping [ $class_name ])) {
30 debug ( "** Loading class $class_name ( {$mapping[$class_name]} )" );
31 require $dir . $mapping [ $class_name ];
// Register the class-map loader defined above, then pull in the autoloader
// script that ships in the bundled SimplePie library directory.
spl_autoload_register('autoload');
require dirname(__FILE__) . '/libraries/simplepie/autoloader.php';
/**
 * Pseudo-feed that wraps a single URL in one DummySingleItem.
 *
 * Every accessor returns a fixed placeholder value except the description
 * and link, which are derived from the wrapped item's URL.
 * NOTE(review): the get_* method names look like they mirror the feed object
 * the caller normally works with - confirm against the calling script.
 */
class DummySingleItemFeed
{
    /** @var DummySingleItem the only item this feed contains */
    public $item;

    /**
     * @param string $url URL handed to the wrapped DummySingleItem
     */
    public function __construct($url)
    {
        $this->item = new DummySingleItem($url);
    }

    /** @return string always an empty title */
    public function get_title()
    {
        return '';
    }

    /** @return string fixed prefix followed by the item's URL */
    public function get_description()
    {
        return 'Content extracted from ' . $this->item->url;
    }

    /** @return string the item's URL */
    public function get_link()
    {
        return $this->item->url;
    }

    /** @return bool always false */
    public function get_language()
    {
        return false;
    }

    /** @return bool always false */
    public function get_image_url()
    {
        return false;
    }

    /**
     * Both arguments are accepted but ignored: the single item is always returned.
     *
     * @param int $start unused
     * @param int $max   unused
     * @return array one-element array holding the wrapped item at index 0
     */
    public function get_items($start = 0, $max = 1)
    {
        return array(0 => $this->item);
    }
}
/**
 * Pseudo feed item that only knows its URL.
 *
 * get_permalink() returns the URL; every other accessor returns a fixed
 * placeholder (null, false or an empty string) regardless of its arguments.
 */
class DummySingleItem
{
    /** @var string URL this item stands for */
    public $url;

    /**
     * @param string $url URL this item stands for
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /** @return string the URL given to the constructor */
    public function get_permalink()
    {
        return $this->url;
    }

    /** @return null no title is available */
    public function get_title()
    {
        return null;
    }

    /**
     * @param string $format unused
     * @return bool always false
     */
    public function get_date($format = '')
    {
        return false;
    }

    /**
     * @param int $key unused
     * @return null
     */
    public function get_author($key = 0)
    {
        return null;
    }

    /** @return null */
    public function get_authors()
    {
        return null;
    }

    /** @return string always an empty description */
    public function get_description()
    {
        return '';
    }

    /**
     * @param int   $key    unused
     * @param mixed $prefer unused
     * @return null
     */
    public function get_enclosure($key = 0, $prefer = null)
    {
        return null;
    }

    /** @return null */
    public function get_enclosures()
    {
        return null;
    }

    /** @return null */
    public function get_categories()
    {
        return null;
    }
}
65 ///////////////////////////////
67 ///////////////////////////////
69 function url_allowed ( $url ) {
71 if (! empty ( $options- > allowed_urls
)) {
73 foreach ( $options- > allowed_urls
as $allowurl ) {
74 if ( stristr ( $url , $allowurl ) !== false ) {
79 if (! $allowed ) return false ;
81 foreach ( $options- > blocked_urls
as $blockurl ) {
82 if ( stristr ( $url , $blockurl ) !== false ) {
90 //////////////////////////////////////////////
91 // Convert $html to UTF8
92 // (uses HTTP headers and HTML to find encoding)
93 // adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
94 //////////////////////////////////////////////
95 function convert_to_utf8 ( $html , $header = null )
98 if ( $html || $header ) {
99 if ( is_array ( $header )) $header = implode ( " \n " , $header );
100 if (! $header || ! preg_match_all ( '/^Content-Type:\s+([^;]+)(?:;\s*charset=[" \' ]?([^;" \'\n ]*))?/im' , $header , $match , PREG_SET_ORDER
)) {
101 // error parsing the response
102 debug ( 'Could not find Content-Type header in HTTP response' );
104 $match = end ( $match ); // get last matched element (in case of redirects)
105 if ( isset ( $match [ 2 ])) $encoding = trim ( $match [ 2 ], " \" ' \r\n \0 \x0B\t " );
107 // TODO: check to see if encoding is supported (can we convert it?)
108 // If it's not, result will be empty string.
109 // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
110 // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
111 if (! $encoding || $encoding == 'none' ) {
112 // search for encoding in HTML - only look at the first 50000 characters
113 // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
114 // TODO: improve this so it looks at smaller chunks first
115 $html_head = substr ( $html , 0 , 50000 );
116 if ( preg_match ( '/^< \? xml\s+version=(?:"[^"]*"| \' [^ \' ]* \' )\s+encoding=("[^"]*"| \' [^ \' ]* \' )/s' , $html_head , $match )) {
117 $encoding = trim ( $match [ 1 ], '" \' ' );
118 } elseif ( preg_match ( '/<meta\s+http-equiv=[" \' ]?Content-Type[" \' ]? content=[" \' ][^;]+;\s*charset=[" \' ]?([^;" \' >]+)/i' , $html_head , $match )) {
119 $encoding = trim ( $match [ 1 ]);
120 } elseif ( preg_match_all ( '/<meta\s+([^>]+)>/i' , $html_head , $match )) {
121 foreach ( $match [ 1 ] as $_test ) {
122 if ( preg_match ( '/charset=[" \' ]?([^" \' ]+)/i' , $_test , $_m )) {
123 $encoding = trim ( $_m [ 1 ]);
129 if ( isset ( $encoding )) $encoding = trim ( $encoding );
130 // trim is important here!
131 if (! $encoding || ( strtolower ( $encoding ) == 'iso-8859-1' )) {
132 // replace MS Word smart quotes
134 $trans [ chr ( 130 )] = '‚' ; // Single Low-9 Quotation Mark
135 $trans [ chr ( 131 )] = 'ƒ' ; // Latin Small Letter F With Hook
136 $trans [ chr ( 132 )] = '„' ; // Double Low-9 Quotation Mark
137 $trans [ chr ( 133 )] = '…' ; // Horizontal Ellipsis
138 $trans [ chr ( 134 )] = '†' ; // Dagger
139 $trans [ chr ( 135 )] = '‡' ; // Double Dagger
140 $trans [ chr ( 136 )] = 'ˆ' ; // Modifier Letter Circumflex Accent
141 $trans [ chr ( 137 )] = '‰' ; // Per Mille Sign
142 $trans [ chr ( 138 )] = 'Š' ; // Latin Capital Letter S With Caron
143 $trans [ chr ( 139 )] = '‹' ; // Single Left-Pointing Angle Quotation Mark
144 $trans [ chr ( 140 )] = 'Œ' ; // Latin Capital Ligature OE
145 $trans [ chr ( 145 )] = '‘' ; // Left Single Quotation Mark
146 $trans [ chr ( 146 )] = '’' ; // Right Single Quotation Mark
147 $trans [ chr ( 147 )] = '“' ; // Left Double Quotation Mark
148 $trans [ chr ( 148 )] = '”' ; // Right Double Quotation Mark
149 $trans [ chr ( 149 )] = '•' ; // Bullet
150 $trans [ chr ( 150 )] = '–' ; // En Dash
151 $trans [ chr ( 151 )] = '—' ; // Em Dash
152 $trans [ chr ( 152 )] = '˜' ; // Small Tilde
153 $trans [ chr ( 153 )] = '™' ; // Trade Mark Sign
154 $trans [ chr ( 154 )] = 'š' ; // Latin Small Letter S With Caron
155 $trans [ chr ( 155 )] = '›' ; // Single Right-Pointing Angle Quotation Mark
156 $trans [ chr ( 156 )] = 'œ' ; // Latin Small Ligature OE
157 $trans [ chr ( 159 )] = 'Ÿ' ; // Latin Capital Letter Y With Diaeresis
158 $html = strtr ( $html , $trans );
161 debug ( 'No character encoding found, so treating as UTF-8' );
164 debug ( 'Character encoding: ' . $encoding );
165 if ( strtolower ( $encoding ) != 'utf-8' ) {
166 debug ( 'Converting to UTF-8' );
167 $html = SimplePie_Misc
:: change_encoding ( $html , $encoding , 'utf-8' );
169 if (function_exists('iconv')) {
170 // iconv appears to handle certain character encodings better than mb_convert_encoding
171 $html = iconv($encoding, 'utf-8', $html);
173 $html = mb_convert_encoding($html, 'utf-8', $encoding);
/**
 * Make link and image URLs inside a DOM subtree absolute.
 *
 * For every <a> (href) and <img> (src) below $elem, and for $elem itself
 * when it is one of those tags, the attribute is passed to makeAbsoluteAttr()
 * together with the base IRI.
 *
 * @param string $base base URL; wrapped in a SimplePie_IRI before use
 * @param object $elem DOM node providing getElementsByTagName() and tagName
 */
function makeAbsolute($base, $elem) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (used to prevent URLs from resolving properly)
    // TODO: check if this is still the case
    if (isset($base->path)) {
        $base->path = preg_replace('!//+!', '/', $base->path);
    }
    $targets = array('a' => 'href', 'img' => 'src');
    foreach ($targets as $tag => $attr) {
        $nodes = $elem->getElementsByTagName($tag);
        // walk the node list from the last entry down to the first
        $idx = $nodes->length;
        while (--$idx >= 0) {
            makeAbsoluteAttr($base, $nodes->item($idx), $attr);
        }
        // the container itself may be a link or an image
        if (strtolower($elem->tagName) == $tag) {
            makeAbsoluteAttr($base, $elem, $attr);
        }
    }
}
/**
 * Rewrite one URL-bearing attribute of a DOM element to an absolute URL.
 *
 * Nothing happens when the attribute is absent, when its value already
 * starts with http:// or https://, or when SimplePie_IRI::absolutize()
 * returns a falsy result.
 *
 * @param mixed      $base base passed straight to SimplePie_IRI::absolutize()
 *                         (makeAbsolute() supplies a SimplePie_IRI)
 * @param DOMElement $e    element updated in place
 * @param string     $attr attribute name, e.g. 'href' or 'src'
 */
function makeAbsoluteAttr($base, $e, $attr) {
    if (!$e->hasAttribute($attr)) {
        return;
    }
    // Trim leading and trailing white space. I don't really like this but
    // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
    $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
    $url = str_replace(' ', '%20', $url);
    // Anchored at the start: a relative URL that merely contains "http://"
    // (e.g. /go?to=http://example.org/) still has to be resolved.
    if (preg_match('!^https?://!i', $url)) {
        return;
    }
    if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
        $e->setAttribute($attr, $absolute);
    }
}
210 function makeAbsoluteStr ( $base , $url ) {
211 $base = new SimplePie_IRI ( $base );
212 // remove '//' in URL path (causes URLs not to resolve properly)
213 if ( isset ( $base- > path
)) $base- > path
= preg_replace ( '!//+!' , '/' , $base- > path
);
214 if ( preg_match ( '!^https?://!i' , $url )) {
218 if ( $absolute = SimplePie_IRI
:: absolutize ( $base , $url )) {
224 // returns single page response, or false if not found
225 function getSinglePage ( $item , $html , $url ) {
226 global $http , $extractor ;
227 debug ( 'Looking for site config files to see if single page link exists' );
228 $site_config = $extractor- > buildSiteConfig ( $url , $html );
230 if (! empty ( $site_config- > single_page_link
)) {
231 $splink = $site_config- > single_page_link
;
232 } elseif (! empty ( $site_config- > single_page_link_in_feed
)) {
233 // single page link xpath is targeted at feed
234 $splink = $site_config- > single_page_link_in_feed
;
235 // so let's replace HTML with feed item description
236 $html = $item- > get_description ();
238 if ( isset ( $splink )) {
239 // Build DOM tree from HTML
240 $readability = new Readability ( $html , $url );
241 $xpath = new DOMXPath ( $readability- > dom
);
242 // Loop through single_page_link xpath expressions
243 $single_page_url = null ;
244 foreach ( $splink as $pattern ) {
245 $elems = @ $xpath- > evaluate ( $pattern , $readability- > dom
);
246 if ( is_string ( $elems )) {
247 $single_page_url = trim ( $elems );
249 } elseif ( $elems instanceof DOMNodeList
&& $elems- > length
> 0 ) {
250 foreach ( $elems as $item ) {
251 if ( $item instanceof DOMElement
&& $item- > hasAttribute ( 'href' )) {
252 $single_page_url = $item- > getAttribute ( 'href' );
254 } elseif ( $item instanceof DOMAttr
&& $item- > value
) {
255 $single_page_url = $item- > value
;
261 // If we've got URL, resolve against $url
262 if ( isset ( $single_page_url ) && ( $single_page_url = makeAbsoluteStr ( $url , $single_page_url ))) {
263 // check it's not what we have already!
264 if ( $single_page_url != $url ) {
265 // it's not, so let's try to fetch it...
266 $_prev_ref = $http- > referer
;
267 $http- > referer
= $single_page_url ;
268 if (( $response = $http- > get ( $single_page_url , true )) && $response [ 'status_code' ] < 300 ) {
269 $http- > referer
= $_prev_ref ;
272 $http- > referer
= $_prev_ref ;
279 // based on content-type http header, decide what to do
280 // param: HTTP headers string
281 // return: array with keys: 'mime', 'type', 'subtype', 'action', 'name'
282 // e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
283 function get_mime_action_info ( $headers ) {
285 // check if action defined for returned Content-Type
287 if ( preg_match ( '!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im' , $headers , $match )) {
288 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
289 // match[1] = full mime type, e.g. image/jpeg
290 // match[2] = first part, e.g. image
291 // match[3] = last part, e.g. jpeg
292 $info [ 'mime' ] = strtolower ( trim ( $match [ 1 ]));
293 $info [ 'type' ] = strtolower ( trim ( $match [ 2 ]));
294 $info [ 'subtype' ] = strtolower ( trim ( $match [ 3 ]));
295 foreach ( array ( $info [ 'mime' ], $info [ 'type' ]) as $_mime ) {
296 if ( isset ( $options- > content_type_exc
[ $_mime ])) {
297 $info [ 'action' ] = $options- > content_type_exc
[ $_mime ][ 'action' ];
298 $info [ 'name' ] = $options- > content_type_exc
[ $_mime ][ 'name' ];
/**
 * Strip Google Analytics utm_* parameters from a URL.
 *
 * regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
 * https://gist.github.com/758177
 *
 * @param string $url URL to clean
 * @return string|null cleaned URL (whatever preg_replace() returns on a regex error)
 */
function remove_url_cruft($url) {
    $stripped = preg_replace('/(\?|\&)utm_[a-z]+=[^\&]+/', '', $url);
    if (!is_string($stripped)) {
        return $stripped;
    }
    // When the removed parameter was the first one, its '?' went with it and
    // the next parameter is left hanging on '&'. Put the '?' back, but only if
    // everything before the original '?' is untouched.
    $q = strpos($url, '?');
    if ($q !== false
        && strncmp($stripped, $url, $q) === 0
        && isset($stripped[$q])
        && $stripped[$q] === '&'
    ) {
        $stripped[$q] = '?';
    }
    return $stripped;
}
313 function make_substitutions ( $string ) {
314 if ( $string == '' ) return $string ;
315 global $item , $effective_url ;
316 $string = str_replace ( ' {url} ' , htmlspecialchars ( $item- > get_permalink ()), $string );
317 $string = str_replace ( ' {effective-url} ' , htmlspecialchars ( $effective_url ), $string );
321 function get_cache () {
322 global $options , $valid_key ;
323 static $cache = null ;
324 if ( $cache === null ) {
325 $frontendOptions = array (
326 'lifetime' => 10 * 60 , // cache lifetime of 10 minutes
327 'automatic_serialization' => false ,
328 'write_control' => false ,
329 'automatic_cleaning_factor' => $options- > cache_cleanup
,
330 'ignore_user_abort' => false
332 $backendOptions = array (
333 'cache_dir' => ( $valid_key ) ? $options- > cache_dir
. '/rss-with-key/' : $options- > cache_dir
. '/rss/' , // directory where to put the cache files
334 'file_locking' => false ,
335 'read_control' => true ,
336 'read_control_type' => 'strlen' ,
337 'hashed_directory_level' => $options- > cache_directory_level
,
338 'hashed_directory_perm' => 0777 ,
339 'cache_file_perm' => 0664 ,
340 'file_name_prefix' => 'ff'
342 // getting a Zend_Cache_Core object
343 $cache = Zend_Cache
:: factory ( 'Core' , 'File' , $frontendOptions , $backendOptions );
348 function debug ( $msg ) {