]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/makefulltextfeedHelpers.php
/**
 * Lazy class loader for the bundled third-party libraries.
 *
 * Autoloading of classes allows us to include files only when they're
 * needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
 *
 * @param string $class_name Name of the class PHP is trying to resolve.
 * @return bool True when the class is in the map and its file was required,
 *              false when the class is not handled by this loader.
 *              NOTE(review): the return statements were not visible in the
 *              damaged copy of this file; spl_autoload ignores the value.
 */
function autoload($class_name) {
    // Resolved once; every mapped path is relative to this directory.
    static $dir = null;
    if ($dir === null) $dir = dirname(__FILE__) . '/libraries/';
    static $mapping = array(
        // Include FeedCreator for RSS/Atom creation
        'FeedWriter' => 'feedwriter/FeedWriter.php',
        'FeedItem' => 'feedwriter/FeedItem.php',
        // Include ContentExtractor and Readability for identifying and extracting content from URLs
        'ContentExtractor' => 'content-extractor/ContentExtractor.php',
        'SiteConfig' => 'content-extractor/SiteConfig.php',
        'Readability' => 'readability/Readability.php',
        // Include Humble HTTP Agent to allow parallel requests and response caching
        'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
        'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
        'CookieJar' => 'humble-http-agent/CookieJar.php',
        // Include Zend Cache to improve performance (cache results)
        'Zend_Cache' => 'Zend/Cache.php',
        // Language detection
        'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
        // HTML5 parser
        'HTML5_Parser' => 'html5/Parser.php',
        // htmLawed - used if XSS filter is enabled (xss_filter)
        'htmLawed' => 'htmLawed/htmLawed.php'
    );
    if (isset($mapping[$class_name])) {
        debug("** Loading class $class_name ({$mapping[$class_name]})");
        require $dir . $mapping[$class_name];
        return true;
    }
    return false;
}
// Register the lazy loader defined above, then load SimplePie's own autoloader.
spl_autoload_register('autoload');
require dirname(__FILE__) . '/libraries/simplepie/autoloader.php';
/**
 * Minimal stand-in for a feed object that wraps exactly one URL.
 *
 * Exposes the feed-level getters used by the rest of the code (title,
 * description, link, language, image, items) so that a single page can be
 * processed through the same path as a real feed.
 */
class DummySingleItemFeed
{
    /** @var DummySingleItem The only item of this pseudo-feed. */
    public $item;

    /**
     * @param string $url URL of the single page being wrapped.
     */
    public function __construct($url) { $this->item = new DummySingleItem($url); }
    public function get_title() { return ''; }
    public function get_description() { return 'Content extracted from ' . $this->item->url; }
    public function get_link() { return $this->item->url; }
    public function get_language() { return false; }
    public function get_image_url() { return false; }
    // $start and $max are accepted for signature compatibility but ignored:
    // there is always exactly one item.
    public function get_items($start = 0, $max = 1) { return array(0 => $this->item); }
}
/**
 * Minimal stand-in for a feed item that only knows its own URL.
 *
 * Every getter other than get_permalink() returns an "empty" value
 * (null, false or ''), mirroring the item-level getters called elsewhere.
 */
class DummySingleItem
{
    /** @var string URL this item points at. */
    public $url;

    /**
     * @param string $url URL this item represents.
     */
    public function __construct($url) { $this->url = $url; }
    public function get_permalink() { return $this->url; }
    public function get_title() { return null; }
    public function get_date($format = '') { return false; }
    public function get_author($key = 0) { return null; }
    public function get_authors() { return null; }
    public function get_description() { return ''; }
    public function get_enclosure($key = 0, $prefer = null) { return null; }
    public function get_enclosures() { return null; }
    public function get_categories() { return null; }
}
65 ///////////////////////////////
67 ///////////////////////////////
/**
 * Build a plain-text excerpt limited to a number of whitespace-separated words.
 *
 * Adapted from WordPress
 * http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
 *
 * @param string      $text      Source text; HTML tags are stripped first.
 * @param int         $num_words Maximum number of words to keep.
 * @param string|null $more      Suffix appended when the text was truncated;
 *                               defaults to an ellipsis when null.
 * @return string The excerpt with leading/trailing Unicode whitespace removed.
 */
function get_excerpt($text, $num_words = 55, $more = null) {
    if (null === $more) $more = '…';
    $text = strip_tags($text);
    //TODO: Check if word count is based on single characters (East Asian characters)
    /*
    Per-character variant kept for the TODO above:
    $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' ');
    preg_match_all('/./u', $text, $words_array);
    $words_array = array_slice($words_array[0], 0, $num_words + 1);
    $sep = '';
    */
    // Split into at most $num_words + 1 pieces: the extra piece only tells us
    // whether there was more text than we are going to keep.
    $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
    $sep = ' ';
    if (count($words_array) > $num_words) {
        array_pop($words_array);
        $text = implode($sep, $words_array) . $more;
    } else {
        $text = implode($sep, $words_array);
    }
    // trim whitespace at beginning or end of string
    // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
    $trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
    // preg_replace() returns null when $text is not valid UTF-8; fall back to a
    // byte-wise trim instead of handing null back to the caller.
    return ($trimmed === null) ? trim($text) : $trimmed;
}
/**
 * Decide whether a URL may be processed, based on the global $options.
 *
 * If $options->allowed_urls is non-empty it acts as a whitelist: the URL must
 * contain at least one of its entries (case-insensitive substring match).
 * Otherwise $options->blocked_urls acts as a blacklist.
 *
 * @param string $url URL to check.
 * @return bool True when the URL is permitted.
 */
function url_allowed($url) {
    global $options;
    if (!empty($options->allowed_urls)) {
        $allowed = false;
        foreach ($options->allowed_urls as $allowurl) {
            if (stristr($url, $allowurl) !== false) {
                $allowed = true;
                break;
            }
        }
        if (!$allowed) return false;
    } elseif (!empty($options->blocked_urls)) {
        // Guarded so a missing/empty block list does not trigger a foreach warning.
        foreach ($options->blocked_urls as $blockurl) {
            if (stristr($url, $blockurl) !== false) {
                return false;
            }
        }
    }
    return true;
}
//////////////////////////////////////////////
// Convert $html to UTF8
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
/**
 * Detect the character encoding of an HTML document and convert it to UTF-8.
 *
 * Detection order: the charset of the last Content-Type HTTP header, then the
 * XML declaration, then <meta> elements within the first 50000 bytes of $html.
 *
 * @param string            $html   Raw response body.
 * @param string|array|null $header Raw HTTP headers (string, or array of lines).
 * @return string The document, converted to UTF-8 when a different encoding
 *                was detected; returned unchanged when both inputs are empty.
 */
function convert_to_utf8($html, $header = null)
{
    $encoding = null;
    if ($html || $header) {
        if (is_array($header)) $header = implode("\n", $header);
        if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
            // error parsing the response
            debug('Could not find Content-Type header in HTTP response');
        } else {
            $match = end($match); // get last matched element (in case of redirects)
            if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
        }
        // TODO: check to see if encoding is supported (can we convert it?)
        // If it's not, result will be empty string.
        // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
        // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
        if (!$encoding || $encoding == 'none') {
            // search for encoding in HTML - only look at the first 50000 characters
            // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
            // TODO: improve this so it looks at smaller chunks first
            $html_head = substr($html, 0, 50000);
            if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
                $encoding = trim($match[1], '"\'');
            } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
                $encoding = trim($match[1]);
            } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
                // Fall back to any <meta> element carrying a charset; first hit wins.
                foreach ($match[1] as $_test) {
                    if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
                        $encoding = trim($_m[1]);
                        break;
                    }
                }
            }
        }
        if (isset($encoding)) $encoding = trim($encoding);
        // trim is important here!
        if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) {
            // replace MS Word smart quotes
            // The replacements are ASCII HTML entities on purpose: this runs
            // BEFORE the charset conversion below, so raw multi-byte literals
            // would be double-encoded when the document is ISO-8859-1.
            $trans = array(
                chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
                chr(131) => '&fnof;',   // Latin Small Letter F With Hook
                chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
                chr(133) => '&hellip;', // Horizontal Ellipsis
                chr(134) => '&dagger;', // Dagger
                chr(135) => '&Dagger;', // Double Dagger
                chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
                chr(137) => '&permil;', // Per Mille Sign
                chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
                chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
                chr(140) => '&OElig;',  // Latin Capital Ligature OE
                chr(145) => '&lsquo;',  // Left Single Quotation Mark
                chr(146) => '&rsquo;',  // Right Single Quotation Mark
                chr(147) => '&ldquo;',  // Left Double Quotation Mark
                chr(148) => '&rdquo;',  // Right Double Quotation Mark
                chr(149) => '&bull;',   // Bullet
                chr(150) => '&ndash;',  // En Dash
                chr(151) => '&mdash;',  // Em Dash
                chr(152) => '&tilde;',  // Small Tilde
                chr(153) => '&trade;',  // Trade Mark Sign
                chr(154) => '&scaron;', // Latin Small Letter S With Caron
                chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
                chr(156) => '&oelig;',  // Latin Small Ligature OE
                chr(159) => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
            );
            $html = strtr($html, $trans);
        }
        if (!$encoding) {
            debug('No character encoding found, so treating as UTF-8');
            $encoding = 'utf-8';
        } else {
            debug('Character encoding: ' . $encoding);
            if (strtolower($encoding) != 'utf-8') {
                debug('Converting to UTF-8');
                $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
            }
        }
    }
    return $html;
}
/**
 * Rewrite relative link/image URLs inside a DOM element to absolute URLs.
 *
 * Handles the `href` of <a> and the `src` of <img>, both for descendants of
 * $elem and for $elem itself when it is one of those tags.
 *
 * @param string     $base URL the relative references are resolved against.
 * @param DOMElement $elem Element whose subtree is rewritten in place.
 * @return void
 */
function makeAbsolute($base, $elem) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (used to prevent URLs from resolving properly)
    // TODO: check if this is still the case
    if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
    foreach (array('a' => 'href', 'img' => 'src') as $tag => $attr) {
        $elems = $elem->getElementsByTagName($tag);
        // Walk the node list from the end towards the start.
        for ($i = $elems->length - 1; $i >= 0; $i--) {
            $e = $elems->item($i);
            //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
            makeAbsoluteAttr($base, $e, $attr);
        }
        // getElementsByTagName() only returns descendants, so check $elem too.
        if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
    }
}
/**
 * Make a single URL attribute of a DOM element absolute, in place.
 *
 * The attribute is left untouched when it is missing, when its value already
 * contains an http(s):// scheme, or when the URL cannot be resolved.
 *
 * @param SimplePie_IRI|string $base Base the URL is resolved against.
 * @param DOMElement           $e    Element carrying the attribute.
 * @param string               $attr Attribute name, e.g. 'href' or 'src'.
 * @return void
 */
function makeAbsoluteAttr($base, $e, $attr) {
    if ($e->hasAttribute($attr)) {
        // Trim leading and trailing white space. I don't really like this but
        // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
        $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
        $url = str_replace(' ', '%20', $url);
        // NOTE(review): this pattern is unanchored, unlike the '^'-anchored one
        // in makeAbsoluteStr(), so a relative URL that merely contains
        // "http://" (e.g. in a query string) is skipped - confirm intent.
        if (!preg_match('!https?://!i', $url)) {
            if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
                $e->setAttribute($attr, $absolute);
            }
        }
    }
}
/**
 * Resolve a URL string against a base URL.
 *
 * @param string $base Base URL.
 * @param string $url  URL to resolve; returned as-is when it already starts
 *                     with http:// or https://.
 * @return mixed The absolute URL, or false when it cannot be resolved.
 *               NOTE(review): the return statements were not visible in the
 *               damaged copy; they are inferred from the caller in
 *               getSinglePage(), which tests the result for truthiness.
 */
function makeAbsoluteStr($base, $url) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
    if (preg_match('!^https?://!i', $url)) {
        // already absolute
        return $url;
    }
    if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
        return $absolute;
    }
    return false;
}
/**
 * Try to fetch the "single page" version of an article.
 *
 * Looks up the site config for $url; if it defines single_page_link (evaluated
 * against $html) or single_page_link_in_feed (evaluated against the feed item
 * description), the XPath expressions are tried in order until one yields a
 * URL, which is then resolved against $url and fetched.
 *
 * Uses the globals $http (HTTP agent) and $extractor (content extractor).
 *
 * @param object $item Feed item; only get_description() is called on it.
 * @param string $html HTML of the page at $url.
 * @param string $url  URL of the page.
 * @return mixed The response from $http->get() when its status code is below
 *               300; false if no single page was found or fetched.
 */
// returns single page response, or false if not found
function getSinglePage($item, $html, $url) {
    global $http, $extractor;
    debug('Looking for site config files to see if single page link exists');
    $site_config = $extractor->buildSiteConfig($url, $html);
    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (isset($splink)) {
        // Build DOM tree from HTML
        $readability = new Readability($html, $url);
        $xpath = new DOMXPath($readability->dom);
        // Loop through single_page_link xpath expressions
        $single_page_url = null;
        foreach ($splink as $pattern) {
            // '@' kept from the original: the patterns come from site config
            // files and evaluate() warns on an invalid expression.
            $elems = @$xpath->evaluate($pattern, $readability->dom);
            if (is_string($elems)) {
                $single_page_url = trim($elems);
                break;
            } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
                // $node (not $item) so the $item parameter is not clobbered.
                foreach ($elems as $node) {
                    if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                        $single_page_url = $node->getAttribute('href');
                        break 2;
                    } elseif ($node instanceof DOMAttr && $node->value) {
                        $single_page_url = $node->value;
                        break 2;
                    }
                }
            }
        }
        // If we've got URL, resolve against $url
        if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
            // check it's not what we have already!
            if ($single_page_url != $url) {
                // it's not, so let's try to fetch it...
                // The referer is swapped for this one request and restored on
                // both the success and the failure path.
                $_prev_ref = $http->referer;
                $http->referer = $single_page_url;
                if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
                    $http->referer = $_prev_ref;
                    return $response;
                }
                $http->referer = $_prev_ref;
            }
        }
    }
    return false;
}
/**
 * Based on the Content-Type HTTP header, decide what to do.
 *
 * Looks the MIME type up in the global $options->content_type_exc, first by
 * full type (e.g. image/jpeg), then by primary type only (e.g. image).
 *
 * @param string $headers HTTP headers string.
 * @return array Empty when no Content-Type header is found. Otherwise holds
 *               'mime', 'type' and 'subtype', plus 'action' and 'name' when
 *               a matching entry is configured, e.g.
 *               array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
 */
function get_mime_action_info($headers) {
    global $options;
    // check if action defined for returned Content-Type
    $info = array();
    if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $headers, $match)) {
        // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
        // match[1] = full mime type, e.g. image/jpeg
        // match[2] = first part, e.g. image
        // match[3] = last part, e.g. jpeg
        $info['mime'] = strtolower(trim($match[1]));
        $info['type'] = strtolower(trim($match[2]));
        $info['subtype'] = strtolower(trim($match[3]));
        // Most specific key first; stop at the first configured match.
        foreach (array($info['mime'], $info['type']) as $_mime) {
            if (isset($options->content_type_exc[$_mime])) {
                $info['action'] = $options->content_type_exc[$_mime]['action'];
                $info['name'] = $options->content_type_exc[$_mime]['name'];
                break;
            }
        }
    }
    return $info;
}
/**
 * Strip Google Analytics tracking parameters (utm_*) from a URL.
 *
 * NOTE(review): when a utm_ parameter is the first query parameter and is
 * followed by others, its leading '?' is removed with it, leaving e.g.
 * "page&id=1". This is long-standing behaviour and is kept unchanged here.
 *
 * @param string $url URL to clean.
 * @return string URL with every "?utm_x=..." / "&utm_x=..." segment removed.
 */
function remove_url_cruft($url) {
    // remove google analytics for the time being
    // regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
    // https://gist.github.com/758177
    return preg_replace('/(\?|\&)utm_[a-z]+=[^\&]+/', '', $url);
}
/**
 * Replace the {url} and {effective-url} placeholders in a template string.
 *
 * Reads the globals $item (its get_permalink() supplies {url}) and
 * $effective_url. Both values are HTML-escaped before insertion.
 *
 * @param string $string Template text.
 * @return string The text with placeholders substituted; an empty input is
 *                returned unchanged without touching the globals.
 */
function make_substitutions($string) {
    if ($string == '') return $string;
    global $item, $effective_url;
    $string = str_replace('{url}', htmlspecialchars($item->get_permalink()), $string);
    $string = str_replace('{effective-url}', htmlspecialchars($effective_url), $string);
    return $string;
}
/**
 * Return the shared file-backed Zend_Cache instance, creating it on first use.
 *
 * Reads the globals $options (cache_cleanup, cache_dir, cache_directory_level)
 * and $valid_key, which selects the 'rss-with-key' or 'rss' sub-directory.
 *
 * @return mixed Whatever Zend_Cache::factory('Core', 'File', ...) returns;
 *               the same instance is handed back on every later call.
 */
function get_cache() {
    global $options, $valid_key;
    static $cache = null;
    if ($cache === null) {
        $frontendOptions = array(
            'lifetime' => 10 * 60, // cache lifetime of 10 minutes
            'automatic_serialization' => false,
            'write_control' => false,
            'automatic_cleaning_factor' => $options->cache_cleanup,
            'ignore_user_abort' => false
        );
        $backendOptions = array(
            'cache_dir' => ($valid_key) ? $options->cache_dir . '/rss-with-key/' : $options->cache_dir . '/rss/', // directory where to put the cache files
            'file_locking' => false,
            'read_control' => true,
            'read_control_type' => 'strlen',
            'hashed_directory_level' => $options->cache_directory_level,
            'hashed_directory_perm' => 0777,
            'cache_file_perm' => 0664,
            'file_name_prefix' => 'ff'
        );
        // getting a Zend_Cache_Core object
        $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
    }
    return $cache;
}
// Diagnostic hook: called throughout this file with a human-readable status string.
// NOTE(review): only the signature line is present in this copy of the file - the
// function body and closing brace are missing and must be restored from upstream.
372 function debug ( $msg ) {
381 function get_base_url ( $dom ) {
382 $xpath = new DOMXPath ( $dom );
383 $base_url = @ $xpath- > evaluate ( 'string(//head/base/@href)' , $dom );
384 if ( $base_url !== '' ) {