]>
git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/makefulltextfeedHelpers.php
453856dd72f4c71c68b450401e763ed3a6e36ee7
3 // Autoloading of classes allows us to include files only when they're
4 // needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
/**
 * Autoloader for the third-party libraries bundled next to this file.
 *
 * Looks $class_name up in a fixed class => file map and, when it is listed,
 * requires that file from the "libraries" directory beside this file.
 * Classes not in the map are left for any other registered autoloader.
 *
 * @param string $class_name Name of the class PHP is trying to load.
 * @return bool True when a mapped file was required, false otherwise.
 *              (PHP ignores the return value of SPL autoloaders; it is
 *              only useful to direct callers.)
 */
function autoload($class_name) {
    // Base directory is computed once and reused on later calls.
    static $dir = null;
    if ($dir === null) $dir = dirname(__FILE__).'/libraries/';
    static $mapping = array(
        // Include FeedCreator for RSS/Atom creation
        'FeedWriter' => 'feedwriter/FeedWriter.php',
        'FeedItem' => 'feedwriter/FeedItem.php',
        // Include ContentExtractor and Readability for identifying and extracting content from URLs
        'ContentExtractor' => 'content-extractor/ContentExtractor.php',
        'SiteConfig' => 'content-extractor/SiteConfig.php',
        // Include Humble HTTP Agent to allow parallel requests and response caching
        'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
        'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
        'CookieJar' => 'humble-http-agent/CookieJar.php',
        // Include Zend Cache to improve performance (cache results)
        'Zend_Cache' => 'Zend/Cache.php',
        // Language detection
        'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
        // HTML5 parser
        'HTML5_Parser' => 'html5/Parser.php',
        // htmLawed - used if XSS filter is enabled (xss_filter)
        'htmLawed' => 'htmLawed/htmLawed.php'
    );
    if (!isset($mapping[$class_name])) {
        return false;
    }
    debug("** Loading class $class_name ({$mapping[$class_name]})");
    require $dir.$mapping[$class_name];
    return true;
}
36 spl_autoload_register ( 'autoload' );
/**
 * Stand-in feed object wrapping exactly one URL.
 *
 * Provides feed-style get_*() accessors so a single URL can be handled
 * like a one-item feed.
 * NOTE(review): presumably mirrors the SimplePie feed accessors used by the
 * callers of this file - confirm against the caller.
 */
class DummySingleItemFeed
{
    /** @var DummySingleItem The only item of this feed. */
    public $item;

    /**
     * @param string $url URL of the single item to wrap.
     */
    public function __construct($url) { $this->item = new DummySingleItem($url); }

    /** @return string Always an empty title. */
    public function get_title() { return ''; }

    /** @return string Fixed description that names the wrapped URL. */
    public function get_description() { return 'Content extracted from '.$this->item->url; }

    /** @return string The wrapped item's URL. */
    public function get_link() { return $this->item->url; }

    /** @return bool Always false (no language information). */
    public function get_language() { return false; }

    /** @return bool Always false (no feed image). */
    public function get_image_url() { return false; }

    /**
     * Both arguments are accepted for signature compatibility and ignored.
     *
     * @return array Single-element array holding the wrapped item at index 0.
     */
    public function get_items($start=0, $max=1) { return array(0=>$this->item); }
}
/**
 * Stand-in feed item that knows nothing but its own URL.
 *
 * Every accessor other than get_permalink() returns a fixed "no data" value.
 * NOTE(review): presumably mirrors the SimplePie item accessors used by the
 * callers of this file - confirm against the caller.
 */
class DummySingleItem
{
    /** @var string URL this item stands for. */
    public $url;

    /**
     * @param string $url URL this item stands for.
     */
    public function __construct($url) { $this->url = $url; }

    /** @return string The URL given to the constructor. */
    public function get_permalink() { return $this->url; }

    /** @return null No title is available. */
    public function get_title() { return null; }

    /** @return bool Always false; $format is ignored. */
    public function get_date($format='') { return false; }

    /** @return null No author is available; $key is ignored. */
    public function get_author($key=0) { return null; }

    /** @return null No authors are available. */
    public function get_authors() { return null; }

    /** @return string Always an empty description. */
    public function get_description() { return ''; }

    /** @return null No enclosure is available; arguments are ignored. */
    public function get_enclosure($key=0, $prefer=null) { return null; }

    /** @return null No enclosures are available. */
    public function get_enclosures() { return null; }

    /** @return null No categories are available. */
    public function get_categories() { return null; }
}
62 ///////////////////////////////
64 ///////////////////////////////
66 // Adapted from WordPress
67 // http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
68 function get_excerpt ( $text , $num_words = 55 , $more = null ) {
69 if ( null === $more ) $more = '…' ;
70 $text = strip_tags ( $text );
71 //TODO: Check if word count is based on single characters (East Asian characters)
74 $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' ');
75 preg_match_all('/./u', $text, $words_array);
76 $words_array = array_slice($words_array[0], 0, $num_words + 1);
79 $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
83 $words_array = preg_split ( "/[ \n\r\t ]+/" , $text , $num_words +
1 , PREG_SPLIT_NO_EMPTY
);
85 if ( count ( $words_array ) > $num_words ) {
86 array_pop ( $words_array );
87 $text = implode ( $sep , $words_array );
90 $text = implode ( $sep , $words_array );
92 // trim whitespace at beginning or end of string
93 // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
94 $text = preg_replace ( '/^[\pZ\pC]+|[\pZ\pC]+$/u' , '' , $text );
98 function url_allowed ( $url ) {
100 if (! empty ( $options- > allowed_urls
)) {
102 foreach ( $options- > allowed_urls
as $allowurl ) {
103 if ( stristr ( $url , $allowurl ) !== false ) {
108 if (! $allowed ) return false ;
110 foreach ( $options- > blocked_urls
as $blockurl ) {
111 if ( stristr ( $url , $blockurl ) !== false ) {
119 //////////////////////////////////////////////
120 // Convert $html to UTF8
121 // (uses HTTP headers and HTML to find encoding)
122 // adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
123 //////////////////////////////////////////////
124 function convert_to_utf8 ( $html , $header = null )
127 if ( $html || $header ) {
128 if ( is_array ( $header )) $header = implode ( " \n " , $header );
129 if (! $header || ! preg_match_all ( '/^Content-Type:\s+([^;]+)(?:;\s*charset=[" \' ]?([^;" \'\n ]*))?/im' , $header , $match , PREG_SET_ORDER
)) {
130 // error parsing the response
131 debug ( 'Could not find Content-Type header in HTTP response' );
133 $match = end ( $match ); // get last matched element (in case of redirects)
134 if ( isset ( $match [ 2 ])) $encoding = trim ( $match [ 2 ], " \" ' \r\n \0 \x0B\t " );
136 // TODO: check to see if encoding is supported (can we convert it?)
137 // If it's not, result will be empty string.
138 // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
139 // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
140 if (! $encoding || $encoding == 'none' ) {
141 // search for encoding in HTML - only look at the first 50000 characters
142 // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
143 // TODO: improve this so it looks at smaller chunks first
144 $html_head = substr ( $html , 0 , 50000 );
145 if ( preg_match ( '/^< \? xml\s+version=(?:"[^"]*"| \' [^ \' ]* \' )\s+encoding=("[^"]*"| \' [^ \' ]* \' )/s' , $html_head , $match )) {
146 $encoding = trim ( $match [ 1 ], '" \' ' );
147 } elseif ( preg_match ( '/<meta\s+http-equiv=[" \' ]?Content-Type[" \' ]? content=[" \' ][^;]+;\s*charset=[" \' ]?([^;" \' >]+)/i' , $html_head , $match )) {
148 $encoding = trim ( $match [ 1 ]);
149 } elseif ( preg_match_all ( '/<meta\s+([^>]+)>/i' , $html_head , $match )) {
150 foreach ( $match [ 1 ] as $_test ) {
151 if ( preg_match ( '/charset=[" \' ]?([^" \' ]+)/i' , $_test , $_m )) {
152 $encoding = trim ( $_m [ 1 ]);
158 if ( isset ( $encoding )) $encoding = trim ( $encoding );
159 // trim is important here!
160 if (! $encoding || ( strtolower ( $encoding ) == 'iso-8859-1' )) {
161 // replace MS Word smart quotes
163 $trans [ chr ( 130 )] = '‚' ; // Single Low-9 Quotation Mark
164 $trans [ chr ( 131 )] = 'ƒ' ; // Latin Small Letter F With Hook
165 $trans [ chr ( 132 )] = '„' ; // Double Low-9 Quotation Mark
166 $trans [ chr ( 133 )] = '…' ; // Horizontal Ellipsis
167 $trans [ chr ( 134 )] = '†' ; // Dagger
168 $trans [ chr ( 135 )] = '‡' ; // Double Dagger
169 $trans [ chr ( 136 )] = 'ˆ' ; // Modifier Letter Circumflex Accent
170 $trans [ chr ( 137 )] = '‰' ; // Per Mille Sign
171 $trans [ chr ( 138 )] = 'Š' ; // Latin Capital Letter S With Caron
172 $trans [ chr ( 139 )] = '‹' ; // Single Left-Pointing Angle Quotation Mark
173 $trans [ chr ( 140 )] = 'Œ' ; // Latin Capital Ligature OE
174 $trans [ chr ( 145 )] = '‘' ; // Left Single Quotation Mark
175 $trans [ chr ( 146 )] = '’' ; // Right Single Quotation Mark
176 $trans [ chr ( 147 )] = '“' ; // Left Double Quotation Mark
177 $trans [ chr ( 148 )] = '”' ; // Right Double Quotation Mark
178 $trans [ chr ( 149 )] = '•' ; // Bullet
179 $trans [ chr ( 150 )] = '–' ; // En Dash
180 $trans [ chr ( 151 )] = '—' ; // Em Dash
181 $trans [ chr ( 152 )] = '˜' ; // Small Tilde
182 $trans [ chr ( 153 )] = '™' ; // Trade Mark Sign
183 $trans [ chr ( 154 )] = 'š' ; // Latin Small Letter S With Caron
184 $trans [ chr ( 155 )] = '›' ; // Single Right-Pointing Angle Quotation Mark
185 $trans [ chr ( 156 )] = 'œ' ; // Latin Small Ligature OE
186 $trans [ chr ( 159 )] = 'Ÿ' ; // Latin Capital Letter Y With Diaeresis
187 $html = strtr ( $html , $trans );
190 debug ( 'No character encoding found, so treating as UTF-8' );
193 debug ( 'Character encoding: ' . $encoding );
194 if ( strtolower ( $encoding ) != 'utf-8' ) {
195 debug ( 'Converting to UTF-8' );
196 $html = SimplePie_Misc
:: change_encoding ( $html , $encoding , 'utf-8' );
/**
 * Makes the link targets inside a DOM subtree absolute.
 *
 * Every <a href> and <img src> found below $elem is handed to
 * makeAbsoluteAttr(); $elem itself is handled too when it is an <a> or <img>.
 *
 * @param string $base Base URL, parsed with SimplePie_IRI.
 * @param DOMNode $elem Node whose descendants are searched; must provide
 *                      getElementsByTagName() (element or document).
 * @return void
 */
function makeAbsolute($base, $elem) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (used to prevent URLs from resolving properly)
    // TODO: check if this is still the case
    if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
    // Only elements carry a tagName; a document passed as $elem has none.
    $own_tag = ($elem instanceof DOMElement) ? strtolower($elem->tagName) : null;
    foreach (array('a'=>'href', 'img'=>'src') as $tag => $attr) {
        // Attributes are only rewritten, never removed, so the live node
        // list is stable while we walk it.
        foreach ($elem->getElementsByTagName($tag) as $e) {
            makeAbsoluteAttr($base, $e, $attr);
        }
        if ($own_tag === $tag) makeAbsoluteAttr($base, $elem, $attr);
    }
}
/**
 * Rewrites one URL-carrying attribute of an element as an absolute URL.
 *
 * Does nothing when the attribute is missing, when its value already starts
 * with http:// or https://, or when SimplePie_IRI::absolutize() yields a
 * falsy result.
 *
 * @param SimplePie_IRI $base Base IRI to resolve against.
 * @param DOMElement $e Element whose attribute is rewritten in place.
 * @param string $attr Attribute name, e.g. 'href' or 'src'.
 * @return void
 */
function makeAbsoluteAttr($base, $e, $attr) {
    if (!$e->hasAttribute($attr)) return;
    // Trim leading and trailing white space. I don't really like this but
    // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
    // Decoding %20 first lets trim() also drop encoded spaces at either end;
    // spaces left inside the URL are then re-encoded.
    $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
    $url = str_replace(' ', '%20', $url);
    // Anchored on purpose (same test as makeAbsoluteStr): a relative URL that
    // merely contains "http://" somewhere, e.g. "/go?u=http://x", must still
    // be resolved against $base.
    if (preg_match('!^https?://!i', $url)) return;
    if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
        $e->setAttribute($attr, $absolute);
    }
}
/**
 * Resolves a URL string against a base URL.
 *
 * @param string $base Base URL, parsed with SimplePie_IRI.
 * @param string $url URL to resolve; may be relative or absolute.
 * @return mixed $url unchanged when it already starts with http:// or
 *               https://; otherwise the truthy result of
 *               SimplePie_IRI::absolutize(), or false when resolution fails.
 */
function makeAbsoluteStr($base, $url) {
    // Already absolute: nothing to resolve, so $base need not be parsed.
    if (preg_match('!^https?://!i', $url)) {
        return $url;
    }
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
    if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
        return $absolute;
    }
    return false;
}
245 // returns single page response, or false if not found
246 function getSinglePage ( $item , $html , $url ) {
247 global $http , $extractor ;
248 debug ( 'Looking for site config files to see if single page link exists' );
249 $site_config = $extractor- > buildSiteConfig ( $url , $html );
251 if (! empty ( $site_config- > single_page_link
)) {
252 $splink = $site_config- > single_page_link
;
253 } elseif (! empty ( $site_config- > single_page_link_in_feed
)) {
254 // single page link xpath is targeted at feed
255 $splink = $site_config- > single_page_link_in_feed
;
256 // so let's replace HTML with feed item description
257 $html = $item- > get_description ();
259 if ( isset ( $splink )) {
260 // Build DOM tree from HTML
261 $readability = new Readability ( $html , $url );
262 $xpath = new DOMXPath ( $readability- > dom
);
263 // Loop through single_page_link xpath expressions
264 $single_page_url = null ;
265 foreach ( $splink as $pattern ) {
266 $elems = @ $xpath- > evaluate ( $pattern , $readability- > dom
);
267 if ( is_string ( $elems )) {
268 $single_page_url = trim ( $elems );
270 } elseif ( $elems instanceof DOMNodeList
&& $elems- > length
> 0 ) {
271 foreach ( $elems as $item ) {
272 if ( $item instanceof DOMElement
&& $item- > hasAttribute ( 'href' )) {
273 $single_page_url = $item- > getAttribute ( 'href' );
275 } elseif ( $item instanceof DOMAttr
&& $item- > value
) {
276 $single_page_url = $item- > value
;
282 // If we've got URL, resolve against $url
283 if ( isset ( $single_page_url ) && ( $single_page_url = makeAbsoluteStr ( $url , $single_page_url ))) {
284 // check it's not what we have already!
285 if ( $single_page_url != $url ) {
286 // it's not, so let's try to fetch it...
287 $_prev_ref = $http- > referer
;
288 $http- > referer
= $single_page_url ;
289 if (( $response = $http- > get ( $single_page_url , true )) && $response [ 'status_code' ] < 300 ) {
290 $http- > referer
= $_prev_ref ;
293 $http- > referer
= $_prev_ref ;
300 // based on content-type http header, decide what to do
301 // param: HTTP headers string
302 // return: array with keys: 'mime', 'type', 'subtype', 'action', 'name'
303 // e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
304 function get_mime_action_info ( $headers ) {
306 // check if action defined for returned Content-Type
308 if ( preg_match ( '!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im' , $headers , $match )) {
309 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
310 // match[1] = full mime type, e.g. image/jpeg
311 // match[2] = first part, e.g. image
312 // match[3] = last part, e.g. jpeg
313 $info [ 'mime' ] = strtolower ( trim ( $match [ 1 ]));
314 $info [ 'type' ] = strtolower ( trim ( $match [ 2 ]));
315 $info [ 'subtype' ] = strtolower ( trim ( $match [ 3 ]));
316 foreach ( array ( $info [ 'mime' ], $info [ 'type' ]) as $_mime ) {
317 if ( isset ( $options- > content_type_exc
[ $_mime ])) {
318 $info [ 'action' ] = $options- > content_type_exc
[ $_mime ][ 'action' ];
319 $info [ 'name' ] = $options- > content_type_exc
[ $_mime ][ 'name' ];
/**
 * Strips Google Analytics campaign parameters (utm_*) from a URL.
 *
 * When the first query parameter is removed and other parameters follow,
 * the '?' is kept so the remaining query string stays valid.
 *
 * @param string $url URL to clean.
 * @return string URL without utm_* parameters.
 */
function remove_url_cruft($url) {
    // remove google analytics for the time being
    // regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
    // https://gist.github.com/758177
    // 1. utm_* parameters that follow another parameter
    $url = preg_replace('/&utm_[a-z]+=[^&]+/', '', $url);
    // 2. a leading utm_* parameter followed by others: keep the '?'
    $url = preg_replace('/\?utm_[a-z]+=[^&]+&/', '?', $url);
    // 3. a leading utm_* parameter that was the only one: drop the '?' too
    return preg_replace('/\?utm_[a-z]+=[^&]+$/', '', $url);
}
/**
 * Fills the {url} and {effective-url} placeholders in a template string.
 *
 * Reads the globals $item (its get_permalink() supplies {url}) and
 * $effective_url (supplies {effective-url}); both values are HTML-escaped
 * before insertion.
 *
 * @param string $string Template text; an empty value is returned untouched.
 * @return string Template with both placeholders substituted.
 */
function make_substitutions($string) {
    if ($string == '') return $string;
    global $item, $effective_url;
    $string = str_replace('{url}', htmlspecialchars($item->get_permalink()), $string);
    $string = str_replace('{effective-url}', htmlspecialchars($effective_url), $string);
    return $string;
}
/**
 * Returns the shared Zend_Cache instance, creating it on first use.
 *
 * Reads the globals $options (cache_cleanup, cache_dir,
 * cache_directory_level) and $valid_key. Entries are stored under the
 * "rss-with-key" sub-directory of the cache dir when $valid_key is truthy,
 * under "rss" otherwise.
 *
 * @return Zend_Cache_Core The cache object built by Zend_Cache::factory().
 */
function get_cache() {
    global $options, $valid_key;
    // Built once per request and reused on later calls.
    static $cache = null;
    if ($cache === null) {
        $frontendOptions = array(
            'lifetime' => 10*60, // cache lifetime of 10 minutes
            'automatic_serialization' => false,
            'write_control' => false,
            'automatic_cleaning_factor' => $options->cache_cleanup,
            'ignore_user_abort' => false
        );
        $backendOptions = array(
            'cache_dir' => ($valid_key) ? $options->cache_dir.'/rss-with-key/' : $options->cache_dir.'/rss/', // directory where to put the cache files
            'file_locking' => false,
            'read_control' => true,
            'read_control_type' => 'strlen',
            'hashed_directory_level' => $options->cache_directory_level,
            'hashed_directory_perm' => 0777,
            'cache_file_perm' => 0664,
            'file_name_prefix' => 'ff'
        );
        // getting a Zend_Cache_Core object
        $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
    }
    return $cache;
}
369 function debug ( $msg ) {
378 function get_base_url ( $dom ) {
379 $xpath = new DOMXPath ( $dom );
380 $base_url = @ $xpath- > evaluate ( 'string(//head/base/@href)' , $dom );
381 if ( $base_url !== '' ) {