aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/poche
diff options
context:
space:
mode:
Diffstat (limited to 'inc/poche')
-rw-r--r--inc/poche/Database.class.php148
-rw-r--r--inc/poche/Poche.class.php159
-rw-r--r--inc/poche/PocheReadability.php46
-rw-r--r--inc/poche/Tools.class.php65
-rw-r--r--inc/poche/Url.class.php364
-rw-r--r--inc/poche/global.inc.php16
6 files changed, 320 insertions, 478 deletions
diff --git a/inc/poche/Database.class.php b/inc/poche/Database.class.php
index bf67de2a..afe02a41 100644
--- a/inc/poche/Database.class.php
+++ b/inc/poche/Database.class.php
@@ -39,12 +39,79 @@ class Database {
/**
 * Tell whether poche already has at least one user account.
 *
 * Stops the script with an explanatory message when the query on the
 * users table yields a falsy result.
 *
 * @return bool true when the users table holds at least one row
 */
public function isInstalled() {
    $statement = $this->executeQuery("SELECT username FROM users", array());

    if ($statement == false) {
        die(STORAGE . ' database looks empty. You have to create it (you can find database structure in install folder).');
    }

    $userCount = count($statement->fetchAll());

    return $userCount != 0;
}
52
/**
 * Make sure the two tables used by the tagging feature exist.
 *
 * The DDL dialect is picked from the STORAGE constant: 'sqlite', 'mysql',
 * and any other value falls back to the bigserial (PostgreSQL) variant.
 *
 * Every statement uses IF NOT EXISTS so that running this method again
 * once the tables are present is a no-op rather than a failing CREATE
 * (PostgreSQL supports the clause from 9.1 onwards).
 *
 * @return void
 */
public function checkTags() {
    if (STORAGE == 'sqlite') {
        $statements = array(
            '
                CREATE TABLE IF NOT EXISTS tags (
                    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE,
                    value TEXT
                )',
            '
                CREATE TABLE IF NOT EXISTS tags_entries (
                    id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL UNIQUE,
                    entry_id INTEGER,
                    tag_id INTEGER,
                    FOREIGN KEY(entry_id) REFERENCES entries(id) ON DELETE CASCADE,
                    FOREIGN KEY(tag_id) REFERENCES tags(id) ON DELETE CASCADE
                )',
        );
    }
    elseif (STORAGE == 'mysql') {
        $statements = array(
            '
                CREATE TABLE IF NOT EXISTS `tags` (
                    `id` int(11) NOT NULL AUTO_INCREMENT,
                    `value` varchar(255) NOT NULL,
                    PRIMARY KEY (`id`)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
            ',
            '
                CREATE TABLE IF NOT EXISTS `tags_entries` (
                    `id` int(11) NOT NULL AUTO_INCREMENT,
                    `entry_id` int(11) NOT NULL,
                    `tag_id` int(11) NOT NULL,
                    FOREIGN KEY(entry_id) REFERENCES entries(id) ON DELETE CASCADE,
                    FOREIGN KEY(tag_id) REFERENCES tags(id) ON DELETE CASCADE,
                    PRIMARY KEY (`id`)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
            ',
        );
    }
    else {
        $statements = array(
            '
                CREATE TABLE IF NOT EXISTS tags (
                    id bigserial primary key,
                    value varchar(255) NOT NULL
                );
            ',
            '
                CREATE TABLE IF NOT EXISTS tags_entries (
                    id bigserial primary key,
                    entry_id integer NOT NULL,
                    tag_id integer NOT NULL
                )
            ',
        );
    }

    // tags must be created before tags_entries, which references it.
    foreach ($statements as $sql) {
        $this->executeQuery($sql, array());
    }
}
49 116
50 public function install($login, $password) { 117 public function install($login, $password) {
@@ -74,7 +141,7 @@ class Database {
74 return TRUE; 141 return TRUE;
75 } 142 }
76 143
77 private function getConfigUser($id) { 144 public function getConfigUser($id) {
78 $sql = "SELECT * FROM users_config WHERE user_id = ?"; 145 $sql = "SELECT * FROM users_config WHERE user_id = ?";
79 $query = $this->executeQuery($sql, array($id)); 146 $query = $this->executeQuery($sql, array($id));
80 $result = $query->fetchAll(); 147 $result = $query->fetchAll();
@@ -127,10 +194,10 @@ class Database {
127 $config = $this->getConfigUser($userId); 194 $config = $this->getConfigUser($userId);
128 195
129 if (!isset ($user_config[$key])) { 196 if (!isset ($user_config[$key])) {
130 $sql = "INSERT INTO users_config (`value`, `user_id`, `name`) VALUES (?, ?, ?)"; 197 $sql = "INSERT INTO users_config (value, user_id, name) VALUES (?, ?, ?)";
131 } 198 }
132 else { 199 else {
133 $sql = "UPDATE users_config SET `value`=? WHERE `user_id`=? AND `name`=?"; 200 $sql = "UPDATE users_config SET value=? WHERE user_id=? AND name=?";
134 } 201 }
135 202
136 $params = array($value, $userId, $key); 203 $params = array($value, $userId, $key);
@@ -249,4 +316,75 @@ class Database {
/**
 * Return the identifier generated by the last INSERT on this connection.
 *
 * @param string $column forwarded as-is to lastInsertId() (callers pass a
 *                       sequence name when STORAGE is 'postgres')
 * @return mixed whatever the handle's lastInsertId() returns
 */
public function getLastId($column = '') {
    $handle = $this->getHandle();

    return $handle->lastInsertId($column);
}
319
/**
 * Fetch every row of the tags table.
 *
 * @return array all tag rows, as returned by fetchAll()
 */
public function retrieveAllTags() {
    $statement = $this->executeQuery("SELECT * FROM tags", array());

    return $statement->fetchAll();
}
327
/**
 * Fetch a single tag by its numeric id.
 *
 * @param mixed $id tag id; it is passed through intval() before binding
 * @return array|null the first matching row, or null when none matches
 */
public function retrieveTag($id) {
    $statement = $this->executeQuery("SELECT * FROM tags WHERE id=?", array(intval($id)));
    $rows = $statement->fetchAll();

    if (isset($rows[0])) {
        return $rows[0];
    }

    return null;
}
337
/**
 * Fetch the entries that carry a given tag.
 *
 * Only the columns of `entries` are selected. With `SELECT *` the join also
 * returned tags_entries.id under the same column name `id`, so a name-keyed
 * row could expose the link-table id instead of the entry id.
 *
 * NOTE(review): the query is not restricted to one user — confirm whether
 * callers are expected to filter by owner.
 *
 * @param mixed $tag_id id of the tag, bound as a query parameter
 * @return array matching entry rows, as returned by fetchAll()
 */
public function retrieveEntriesByTag($tag_id) {
    $sql =
        "SELECT entries.* FROM entries
        LEFT JOIN tags_entries ON tags_entries.entry_id=entries.id
        WHERE tags_entries.tag_id = ?";
    $query = $this->executeQuery($sql, array($tag_id));

    return $query->fetchAll();
}
348
/**
 * Fetch the tags attached to a given entry.
 *
 * Only the columns of `tags` are selected. With `SELECT *` the join also
 * returned tags_entries.id under the same column name `id`, so a name-keyed
 * row could expose the link-table id instead of the tag id.
 *
 * @param mixed $entry_id id of the entry, bound as a query parameter
 * @return array matching tag rows, as returned by fetchAll()
 */
public function retrieveTagsByEntry($entry_id) {
    $sql =
        "SELECT tags.* FROM tags
        LEFT JOIN tags_entries ON tags_entries.tag_id=tags.id
        WHERE tags_entries.entry_id = ?";
    $query = $this->executeQuery($sql, array($entry_id));

    return $query->fetchAll();
}
359
/**
 * Detach a tag from an entry by deleting the matching tags_entries row(s).
 *
 * @param mixed $entry_id id of the entry
 * @param mixed $tag_id   id of the tag
 * @return mixed the result of executeQuery()
 */
public function removeTagForEntry($entry_id, $tag_id) {
    // Placeholder order in the statement is tag first, then entry.
    $bindings = array($tag_id, $entry_id);

    return $this->executeQuery("DELETE FROM tags_entries WHERE tag_id=? AND entry_id=?", $bindings);
}
366
/**
 * Look a tag up by its textual value.
 *
 * @param mixed $value tag label, bound as a query parameter
 * @return array|null the first matching row, or null when none matches
 */
public function retrieveTagByValue($value) {
    $statement = $this->executeQuery("SELECT * FROM tags WHERE value=?", array($value));
    $rows = $statement->fetchAll();

    if (isset($rows[0])) {
        return $rows[0];
    }

    return null;
}
376
/**
 * Insert a new row into the tags table.
 *
 * @param mixed $value tag label, bound as a query parameter
 * @return mixed the result of executeQuery()
 */
public function createTag($value) {
    return $this->executeQuery('INSERT INTO tags ( value ) VALUES (?)', array($value));
}
383
/**
 * Attach a tag to an entry by inserting a tags_entries row.
 *
 * @param mixed $tag_id   id of the tag
 * @param mixed $entry_id id of the entry
 * @return mixed the result of executeQuery()
 */
public function setTagToEntry($tag_id, $entry_id) {
    $bindings = array($tag_id, $entry_id);

    return $this->executeQuery('INSERT INTO tags_entries ( tag_id, entry_id ) VALUES (?, ?)', $bindings);
}
252} 390}
diff --git a/inc/poche/Poche.class.php b/inc/poche/Poche.class.php
index 3ecaf084..4f70afb7 100644
--- a/inc/poche/Poche.class.php
+++ b/inc/poche/Poche.class.php
@@ -49,6 +49,7 @@ class Poche
49 if (! $this->store->isInstalled()) { 49 if (! $this->store->isInstalled()) {
50 $this->install(); 50 $this->install();
51 } 51 }
52 $this->store->checkTags();
52 } 53 }
53 } 54 }
54 55
@@ -332,9 +333,12 @@ class Poche
332 switch ($action) 333 switch ($action)
333 { 334 {
334 case 'add': 335 case 'add':
335 $content = $url->extract(); 336 $json = file_get_contents(Tools::getPocheUrl() . '/inc/3rdparty/makefulltextfeed.php?url='.urlencode($url->getUrl()).'&max=5&links=preserve&exc=&format=json&submit=Create+Feed');
337 $content = json_decode($json, true);
338 $title = $content['rss']['channel']['item']['title'];
339 $body = $content['rss']['channel']['item']['description'];
336 340
337 if ($this->store->add($url->getUrl(), $content['title'], $content['body'], $this->user->getId())) { 341 if ($this->store->add($url->getUrl(), $title, $body, $this->user->getId())) {
338 Tools::logm('add link ' . $url->getUrl()); 342 Tools::logm('add link ' . $url->getUrl());
339 $sequence = ''; 343 $sequence = '';
340 if (STORAGE == 'postgres') { 344 if (STORAGE == 'postgres') {
@@ -342,7 +346,7 @@ class Poche
342 } 346 }
343 $last_id = $this->store->getLastId($sequence); 347 $last_id = $this->store->getLastId($sequence);
344 if (DOWNLOAD_PICTURES) { 348 if (DOWNLOAD_PICTURES) {
345 $content = filtre_picture($content['body'], $url->getUrl(), $last_id); 349 $content = filtre_picture($body, $url->getUrl(), $last_id);
346 Tools::logm('updating content article'); 350 Tools::logm('updating content article');
347 $this->store->updateContent($last_id, $content, $this->user->getId()); 351 $this->store->updateContent($last_id, $content, $this->user->getId());
348 } 352 }
@@ -394,6 +398,36 @@ class Poche
394 Tools::redirect(); 398 Tools::redirect();
395 } 399 }
396 break; 400 break;
401 case 'add_tag' :
402 $tags = explode(',', $_POST['value']);
403 $entry_id = $_POST['entry_id'];
404 foreach($tags as $key => $tag_value) {
405 $value = trim($tag_value);
406 $tag = $this->store->retrieveTagByValue($value);
407
408 if (is_null($tag)) {
409 # we create the tag
410 $tag = $this->store->createTag($value);
411 $sequence = '';
412 if (STORAGE == 'postgres') {
413 $sequence = 'tags_id_seq';
414 }
415 $tag_id = $this->store->getLastId($sequence);
416 }
417 else {
418 $tag_id = $tag['id'];
419 }
420
421 # we assign the tag to the article
422 $this->store->setTagToEntry($tag_id, $entry_id);
423 }
424 Tools::redirect();
425 break;
426 case 'remove_tag' :
427 $tag_id = $_GET['tag_id'];
428 $this->store->removeTagForEntry($id, $tag_id);
429 Tools::redirect();
430 break;
397 default: 431 default:
398 break; 432 break;
399 } 433 }
@@ -412,7 +446,8 @@ class Poche
412 $compare_prod = version_compare(POCHE, $prod); 446 $compare_prod = version_compare(POCHE, $prod);
413 $themes = $this->getInstalledThemes(); 447 $themes = $this->getInstalledThemes();
414 $languages = $this->getInstalledLanguages(); 448 $languages = $this->getInstalledLanguages();
415 $http_auth = (isset($_SERVER['PHP_AUTH_USER']))?true:false; 449 $token = $this->user->getConfigValue('token');
450 $http_auth = (isset($_SERVER['PHP_AUTH_USER']) || isset($_SERVER['REMOTE_USER'])) ? true : false;
416 $tpl_vars = array( 451 $tpl_vars = array(
417 'themes' => $themes, 452 'themes' => $themes,
418 'languages' => $languages, 453 'languages' => $languages,
@@ -420,10 +455,37 @@ class Poche
420 'prod' => $prod, 455 'prod' => $prod,
421 'compare_dev' => $compare_dev, 456 'compare_dev' => $compare_dev,
422 'compare_prod' => $compare_prod, 457 'compare_prod' => $compare_prod,
458 'token' => $token,
459 'user_id' => $this->user->getId(),
423 'http_auth' => $http_auth, 460 'http_auth' => $http_auth,
424 ); 461 );
425 Tools::logm('config view'); 462 Tools::logm('config view');
426 break; 463 break;
464 case 'edit-tags':
465 # tags
466 $tags = $this->store->retrieveTagsByEntry($id);
467 $tpl_vars = array(
468 'entry_id' => $id,
469 'tags' => $tags,
470 );
471 break;
472 case 'tag':
473 $entries = $this->store->retrieveEntriesByTag($id);
474 $tag = $this->store->retrieveTag($id);
475 $tpl_vars = array(
476 'tag' => $tag,
477 'entries' => $entries,
478 );
479 break;
480 case 'tags':
481 $token = $this->user->getConfigValue('token');
482 $tags = $this->store->retrieveAllTags();
483 $tpl_vars = array(
484 'token' => $token,
485 'user_id' => $this->user->getId(),
486 'tags' => $tags,
487 );
488 break;
427 case 'view': 489 case 'view':
428 $entry = $this->store->retrieveOneById($id, $this->user->getId()); 490 $entry = $this->store->retrieveOneById($id, $this->user->getId());
429 if ($entry != NULL) { 491 if ($entry != NULL) {
@@ -437,12 +499,16 @@ class Poche
437 499
438 # flattr checking 500 # flattr checking
439 $flattr = new FlattrItem(); 501 $flattr = new FlattrItem();
440 $flattr->checkItem($entry['url'],$entry['id']); 502 $flattr->checkItem($entry['url'], $entry['id']);
503
504 # tags
505 $tags = $this->store->retrieveTagsByEntry($entry['id']);
441 506
442 $tpl_vars = array( 507 $tpl_vars = array(
443 'entry' => $entry, 508 'entry' => $entry,
444 'content' => $content, 509 'content' => $content,
445 'flattr' => $flattr 510 'flattr' => $flattr,
511 'tags' => $tags
446 ); 512 );
447 } 513 }
448 else { 514 else {
@@ -584,14 +650,18 @@ class Poche
584 * it redirects the user to the $referer link 650 * it redirects the user to the $referer link
585 * @return array 651 * @return array
586 */ 652 */
private function credentials() {
    // Sources are tried in a fixed order; the first one that matches wins.
    if (isset($_SERVER['PHP_AUTH_USER'])) {
        // The second element is a fixed marker, not a password.
        $pair = array($_SERVER['PHP_AUTH_USER'], 'php_auth');
    }
    elseif (!empty($_POST['login']) && !empty($_POST['password'])) {
        // Values submitted through the login form.
        $pair = array($_POST['login'], $_POST['password']);
    }
    elseif (isset($_SERVER['REMOTE_USER'])) {
        // The second element is a fixed marker, not a password.
        $pair = array($_SERVER['REMOTE_USER'], 'http_auth');
    }
    else {
        // No credentials available.
        $pair = array(false, false);
    }

    return $pair;
}
596 666
597 /** 667 /**
@@ -613,7 +683,8 @@ class Poche
613 $user = $this->store->login($login, Tools::encodeString($password . $login)); 683 $user = $this->store->login($login, Tools::encodeString($password . $login));
614 if ($user != array()) { 684 if ($user != array()) {
615 # Save login into Session 685 # Save login into Session
616 Session::login($user['username'], $user['password'], $login, Tools::encodeString($password . $login), array('poche_user' => new User($user))); 686 $longlastingsession = isset($_POST['longlastingsession']);
687 Session::login($user['username'], $user['password'], $login, Tools::encodeString($password . $login), $longlastingsession, array('poche_user' => new User($user)));
617 $this->messages->add('s', _('welcome to your poche')); 688 $this->messages->add('s', _('welcome to your poche'));
618 Tools::logm('login successful'); 689 Tools::logm('login successful');
619 Tools::redirect($referer); 690 Tools::redirect($referer);
@@ -837,4 +908,58 @@ class Poche
837 } 908 }
838 return $version; 909 return $version;
839 } 910 }
911
/**
 * Generate a fresh random token for the current user, store it in the
 * user's configuration and mirror it into the session user object.
 *
 * The token is 20 hexadecimal characters. Hex is used instead of base64
 * because base64 output may contain '+', '/' and '=', which are altered or
 * need escaping if the token is ever placed in a URL query string.
 *
 * Random sources are tried from strongest to weakest:
 *   1. random_bytes() when the runtime provides it,
 *   2. openssl_random_pseudo_bytes() when the extension is loaded,
 *   3. /dev/urandom when open_basedir is not set and the device is readable,
 *   4. a sha1 of uniqid(mt_rand(), true) as a last, non-cryptographic resort.
 *
 * @return void
 */
public function generateToken()
{
    $length = 20;
    $bytes = false;

    if (function_exists('random_bytes')) {
        try {
            $bytes = random_bytes($length);
        } catch (Exception $e) {
            // No usable entropy source: fall through to the next option.
            $bytes = false;
        }
    }

    if ($bytes === false && function_exists('openssl_random_pseudo_bytes')) {
        $bytes = openssl_random_pseudo_bytes($length);
    }

    if ($bytes === false && ini_get('open_basedir') === '' && is_readable('/dev/urandom')) {
        $bytes = file_get_contents('/dev/urandom', false, null, 0, $length);
    }

    if (!is_string($bytes) || strlen($bytes) < $length) {
        // Weak fallback, kept so a token can still be produced everywhere.
        $bytes = sha1(uniqid(mt_rand(), true), true);
    }

    $token = substr(bin2hex($bytes), 0, 20);

    $this->store->updateUserConfig($this->user->getId(), 'token', $token);

    // Keep the user object held in the session in sync with the stored value.
    $currentConfig = $_SESSION['poche_user']->config;
    $currentConfig['token'] = $token;
    $_SESSION['poche_user']->setConfig($currentConfig);
}
926
/**
 * Output an RSS2 feed of a user's entries and stop the script.
 *
 * Access is granted only when $type is one of the allowed feed types and
 * $token equals the token stored in the user's configuration. A user without
 * a stored token, or a request with an empty token, is always refused: the
 * previous loose `!=` comparison treated '' and a missing config value as
 * equal. The comparison is constant-time when hash_equals() is available.
 *
 * NOTE(review): for $type 'tag' the entries are looked up by tag id only;
 * confirm whether they should also be restricted to $user_id.
 *
 * @param string $token   token supplied by the requester
 * @param mixed  $user_id id of the user whose feed is requested
 * @param mixed  $tag_id  tag id, used only when $type is 'tag'
 * @param string $type    one of 'home', 'fav', 'archive', 'tag'
 * @return void the method never returns (die or exit)
 */
public function generateFeeds($token, $user_id, $tag_id, $type = 'home')
{
    $allowed_types = array('home', 'fav', 'archive', 'tag');
    $config = $this->store->getConfigUser($user_id);

    // Check the token
    $expected = isset($config['token']) ? (string) $config['token'] : '';
    $provided = is_string($token) ? $token : '';
    $token_ok = false;
    if ($expected !== '' && $provided !== '') {
        $token_ok = function_exists('hash_equals')
            ? hash_equals($expected, $provided)
            : $expected === $provided;
    }

    if (!in_array($type, $allowed_types, true) || !$token_ok) {
        die(_('Uh, there is a problem while generating feeds.'));
    }

    $feed = new FeedWriter(RSS2);
    $feed->setTitle('poche - ' . $type . ' feed');
    $feed->setLink(Tools::getPocheUrl());
    $feed->setChannelElement('updated', date(DATE_RSS , time()));
    $feed->setChannelElement('author', 'poche');

    if ($type == 'tag') {
        $entries = $this->store->retrieveEntriesByTag($tag_id);
    }
    else {
        $entries = $this->store->getEntriesByView($type, $user_id);
    }

    if (!empty($entries)) {
        foreach ($entries as $entry) {
            $newItem = $feed->createNewItem();
            $newItem->setTitle(htmlentities($entry['title']));
            $newItem->setLink(Tools::getPocheUrl() . '?view=view&id=' . $entry['id']);
            // Every item is stamped with the generation time.
            $newItem->setDate(time());
            $newItem->setDescription($entry['content']);
            $feed->addItem($newItem);
        }
    }

    // The method name is spelled this way in the original call.
    $feed->genarateFeed();
    exit;
}
840} 965}
diff --git a/inc/poche/PocheReadability.php b/inc/poche/PocheReadability.php
deleted file mode 100644
index 48ae90d0..00000000
--- a/inc/poche/PocheReadability.php
+++ /dev/null
@@ -1,46 +0,0 @@
1<?php
2
3class PocheReadability extends Readability
4{
5 /**
6 * Get the article title as an H1.
7 *
8 * @return DOMElement
9 */
10 protected function getArticleTitle() {
11 $curTitle = '';
12 $origTitle = '';
13
14 try {
15 $curTitle = $origTitle = $this->getInnerText($this->dom->getElementsByTagName('title')->item(0));
16 } catch(Exception $e) {}
17
18 if (preg_match('/ [\|\-] /', $curTitle))
19 {
20 $curTitle = preg_replace('/(.*)[\|\-] .*/i', '$1', $origTitle);
21
22 if (count(explode(' ', $curTitle)) < 3) {
23 $curTitle = preg_replace('/[^\|\-]*[\|\-](.*)/i', '$1', $origTitle);
24 }
25 }
26 else if(strlen($curTitle) > 150 || strlen($curTitle) < 15)
27 {
28 $hOnes = $this->dom->getElementsByTagName('h1');
29 if($hOnes->length == 1)
30 {
31 $curTitle = $this->getInnerText($hOnes->item(0));
32 }
33 }
34
35 $curTitle = trim($curTitle);
36
37 if (count(explode(' ', $curTitle)) <= 4) {
38 $curTitle = $origTitle;
39 }
40
41 $articleTitle = $this->dom->createElement('h1');
42 $articleTitle->innerHTML = $curTitle;
43
44 return $articleTitle;
45 }
46} \ No newline at end of file
diff --git a/inc/poche/Tools.class.php b/inc/poche/Tools.class.php
index 750553f1..63916582 100644
--- a/inc/poche/Tools.class.php
+++ b/inc/poche/Tools.class.php
@@ -88,39 +88,16 @@ class Tools
88 88
/**
 * Map a view name to the Twig template file that renders it.
 *
 * Known views map to "<view>.twig"; anything else maps to 'home.twig'.
 * The whitelist lookup is strict so that non-string input (for example
 * boolean true, which loosely equals every non-empty string) cannot match
 * and produce a bogus file name such as "1.twig".
 *
 * @param mixed $view requested view name
 * @return string template file name
 */
public static function getTplFile($view)
{
    $views = array(
        'install', 'import', 'export', 'config', 'tags',
        'edit-tags', 'view', 'login', 'error', 'tag'
    );

    if (is_string($view) && in_array($view, $views, true)) {
        return $view . '.twig';
    }

    return 'home.twig';
}
125 102
126 public static function getFile($url) 103 public static function getFile($url)
@@ -249,4 +226,28 @@ class Tools
249 $lang = explode('.', $userlanguage); 226 $lang = explode('.', $userlanguage);
250 return str_replace('_', '-', $lang[0]); 227 return str_replace('_', '-', $lang[0]);
251 } 228 }
229
/**
 * Send the HTTP status header in the form matching the current SAPI.
 *
 * When the SAPI name contains 'apache' an "HTTP/1.0 <code>" line is sent;
 * otherwise a "Status: <code>" header is sent.
 *
 * @param mixed $status_code value appended verbatim to the header
 * @return void
 */
public static function status($status_code)
{
    $runsUnderApache = strpos(php_sapi_name(), 'apache') !== false;
    $prefix = $runsUnderApache ? 'HTTP/1.0 ' : 'Status: ';

    header($prefix.$status_code);
}
241
242
/**
 * Send the file at STORAGE_SQLITE to the client as a gzip-compressed
 * attachment named "poche.sqlite.gz", then stop the script.
 *
 * @return void the method never returns (exit)
 */
public static function download_db() {
    header('Content-Disposition: attachment; filename="poche.sqlite.gz"');
    self::status(200);

    $extraHeaders = array(
        'Content-Transfer-Encoding: binary',
        'Content-Type: application/octet-stream',
    );
    foreach ($extraHeaders as $line) {
        header($line);
    }

    $rawDatabase = file_get_contents(STORAGE_SQLITE);
    echo gzencode($rawDatabase);

    exit;
}
252} 253}
diff --git a/inc/poche/Url.class.php b/inc/poche/Url.class.php
index 600a2166..8b3468c3 100644
--- a/inc/poche/Url.class.php
+++ b/inc/poche/Url.class.php
@@ -12,45 +12,6 @@ class Url
12{ 12{
13 public $url; 13 public $url;
14 14
15 private $fingerprints = array(
16 // Posterous
17 '<meta name="generator" content="Posterous"' => array('hostname'=>'fingerprint.posterous.com', 'head'=>true),
18 // Blogger
19 '<meta content=\'blogger\' name=\'generator\'' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
20 '<meta name="generator" content="Blogger"' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
21 // WordPress (self-hosted and hosted)
22 '<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
23 );
24
25 private $user_agents = array( 'lifehacker.com' => 'PHP/5.2',
26 'gawker.com' => 'PHP/5.2',
27 'deadspin.com' => 'PHP/5.2',
28 'kotaku.com' => 'PHP/5.2',
29 'jezebel.com' => 'PHP/5.2',
30 'io9.com' => 'PHP/5.2',
31 'jalopnik.com' => 'PHP/5.2',
32 'gizmodo.com' => 'PHP/5.2',
33 '.wikipedia.org' => 'Mozilla/5.2'
34 );
35
36 private $content_type_exc = array(
37 'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
38 'image' => array('action'=>'link', 'name'=>'Image'),
39 'audio' => array('action'=>'link', 'name'=>'Audio'),
40 'video' => array('action'=>'link', 'name'=>'Video')
41 );
42
43 private $rewrite_url = array(
44 // Rewrite public Google Docs URLs to point to HTML view:
45 // if a URL contains docs.google.com, replace /Doc? with /View?
46 'docs.google.com' => array('/Doc?' => '/View?'),
47 'tnr.com' => array('tnr.com/article/' => 'tnr.com/print/article/'),
48 '.m.wikipedia.org' => array('.m.wikipedia.org' => '.wikipedia.org')
49 );
50
51 private $rewrite_relative_urls = true;
52 private $error_message = '[unable to retrieve full-text content]';
53
54 function __construct($url) 15 function __construct($url)
55 { 16 {
56 $this->url = base64_decode($url); 17 $this->url = base64_decode($url);
@@ -67,329 +28,4 @@ class Url
67 public function isCorrect() { 28 public function isCorrect() {
68 return filter_var($this->url, FILTER_VALIDATE_URL) !== FALSE; 29 return filter_var($this->url, FILTER_VALIDATE_URL) !== FALSE;
69 } 30 }
70
71 public function extract() {
72 global $http, $extractor;
73 $extractor = new ContentExtractor(dirname(__FILE__).'/../3rdparty/site_config/custom', dirname(__FILE__).'/../3rdparty/site_config/standard');
74 $extractor->fingerprints = $this->fingerprints;
75
76 $http = new HumbleHttpAgent();
77 $http->userAgentMap = $this->user_agents;
78 $http->headerOnlyTypes = array_keys($this->content_type_exc);
79 $http->rewriteUrls = $this->rewrite_url;
80 $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
81 // configure SimplePie HTTP extension class to use our HumbleHttpAgent instance
82 SimplePie_HumbleHttpAgent::set_agent($http);
83 $feed = new SimplePie();
84 // some feeds use the text/html content type - force_feed tells SimplePie to process anyway
85 $feed->force_feed(true);
86 $feed->set_file_class('SimplePie_HumbleHttpAgent');
87 $feed->feed_url = $this->url;
88 $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
89 $feed->set_timeout(20);
90 $feed->enable_cache(false);
91 $feed->set_stupidly_fast(true);
92 $feed->enable_order_by_date(false); // we don't want to do anything to the feed
93 $feed->set_url_replacements(array());
94 // initialise the feed
95 // the @ suppresses notices which on some servers causes a 500 internal server error
96 $result = @$feed->init();
97 if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
98 die('Sorry, no feed items found');
99 }
100 // from now on, we'll identify ourselves as a browser
101 $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
102 unset($feed, $result);
103
104 $feed = new DummySingleItemFeed($this->url);
105
106 $items = $feed->get_items(0, 1);
107 // Request all feed items in parallel (if supported)
108 $urls_sanitized = array();
109 $urls = array();
110 foreach ($items as $key => $item) {
111 $permalink = htmlspecialchars_decode($item->get_permalink());
112 // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
113 $permalink = str_replace('%3A', ':', $permalink);
114 if ($permalink) {
115 $urls_sanitized[] = $permalink;
116 }
117 $urls[$key] = $permalink;
118 }
119 $http->fetchAll($urls_sanitized);
120
121 foreach ($items as $key => $item) {
122 $do_content_extraction = true;
123 $extract_result = false;
124 $permalink = $urls[$key];
125
126 // TODO: Allow error codes - some sites return correct content with error status
127 // e.g. prospectmagazine.co.uk returns 403
128
129 if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
130 $effective_url = $response['effective_url'];
131 // check if action defined for returned Content-Type
132 $type = null;
133 if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $response['headers'], $match)) {
134 // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
135 $match[1] = strtolower(trim($match[1]));
136 $match[2] = strtolower(trim($match[2]));
137 foreach (array($match[1], $match[2]) as $_mime) {
138 if (isset($this->content_type_exc[$_mime])) {
139 $type = $match[1];
140 $_act = $this->content_type_exc[$_mime]['action'];
141 $_name = $this->content_type_exc[$_mime]['name'];
142 if ($_act == 'exclude') {
143 continue 2; // skip this feed item entry
144 } elseif ($_act == 'link') {
145 if ($match[2] == 'image') {
146 $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"$_name\" /></a>";
147 } else {
148 $html = "<a href=\"$effective_url\">Download $_name</a>";
149 }
150 $title = $_name;
151 $do_content_extraction = false;
152 break;
153 }
154 }
155 }
156 unset($_mime, $_act, $_name, $match);
157 }
158 if ($do_content_extraction) {
159 $html = $response['body'];
160 // remove strange things
161 $html = str_replace('</[>', '', $html);
162 $html = $this->convert_to_utf8($html, $response['headers']);
163
164 // check site config for single page URL - fetch it if found
165 if ($single_page_response = $this->getSinglePage($item, $html, $effective_url)) {
166 $html = $single_page_response['body'];
167 // remove strange things
168 $html = str_replace('</[>', '', $html);
169 $html = $this->convert_to_utf8($html, $single_page_response['headers']);
170 $effective_url = $single_page_response['effective_url'];
171 unset($single_page_response);
172 }
173 $extract_result = $extractor->process($html, $effective_url);
174 $readability = $extractor->readability;
175 $content_block = ($extract_result) ? $extractor->getContent() : null;
176 }
177 }
178 if ($do_content_extraction) {
179 // if we failed to extract content...
180 if (!$extract_result) {
181 $html = $this->error_message;
182 // keep the original item description
183 $html .= $item->get_description();
184 } else {
185 $readability->clean($content_block, 'select');
186 if ($this->rewrite_relative_urls) $this->makeAbsolute($effective_url, $content_block);
187 if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
188 $html = $content_block->firstChild->innerHTML;
189 } else {
190 $html = $content_block->innerHTML;
191 }
192 // post-processing cleanup
193 $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
194 }
195 }
196 }
197
198 $title = ($extractor->getTitle() != '' ? $extractor->getTitle() : _('Untitled'));
199 $content = array ('title' => $title, 'body' => $html);
200
201 return $content;
202 }
203
204 private function convert_to_utf8($html, $header=null)
205 {
206 $encoding = null;
207 if ($html || $header) {
208 if (is_array($header)) $header = implode("\n", $header);
209 if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
210 // error parsing the response
211 } else {
212 $match = end($match); // get last matched element (in case of redirects)
213 if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
214 }
215 // TODO: check to see if encoding is supported (can we convert it?)
216 // If it's not, result will be empty string.
217 // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
218 // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
219 if (!$encoding || $encoding == 'none') {
220 // search for encoding in HTML - only look at the first 35000 characters
221 $html_head = substr($html, 0, 40000);
222 if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
223 $encoding = trim($match[1], '"\'');
224 } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
225 $encoding = trim($match[1]);
226 } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
227 foreach ($match[1] as $_test) {
228 if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
229 $encoding = trim($_m[1]);
230 break;
231 }
232 }
233 }
234 }
235 if (isset($encoding)) $encoding = trim($encoding);
236 // trim is important here!
237 if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) {
238 // replace MS Word smart qutoes
239 $trans = array();
240 $trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
241 $trans[chr(131)] = '&fnof;'; // Latin Small Letter F With Hook
242 $trans[chr(132)] = '&bdquo;'; // Double Low-9 Quotation Mark
243 $trans[chr(133)] = '&hellip;'; // Horizontal Ellipsis
244 $trans[chr(134)] = '&dagger;'; // Dagger
245 $trans[chr(135)] = '&Dagger;'; // Double Dagger
246 $trans[chr(136)] = '&circ;'; // Modifier Letter Circumflex Accent
247 $trans[chr(137)] = '&permil;'; // Per Mille Sign
248 $trans[chr(138)] = '&Scaron;'; // Latin Capital Letter S With Caron
249 $trans[chr(139)] = '&lsaquo;'; // Single Left-Pointing Angle Quotation Mark
250 $trans[chr(140)] = '&OElig;'; // Latin Capital Ligature OE
251 $trans[chr(145)] = '&lsquo;'; // Left Single Quotation Mark
252 $trans[chr(146)] = '&rsquo;'; // Right Single Quotation Mark
253 $trans[chr(147)] = '&ldquo;'; // Left Double Quotation Mark
254 $trans[chr(148)] = '&rdquo;'; // Right Double Quotation Mark
255 $trans[chr(149)] = '&bull;'; // Bullet
256 $trans[chr(150)] = '&ndash;'; // En Dash
257 $trans[chr(151)] = '&mdash;'; // Em Dash
258 $trans[chr(152)] = '&tilde;'; // Small Tilde
259 $trans[chr(153)] = '&trade;'; // Trade Mark Sign
260 $trans[chr(154)] = '&scaron;'; // Latin Small Letter S With Caron
261 $trans[chr(155)] = '&rsaquo;'; // Single Right-Pointing Angle Quotation Mark
262 $trans[chr(156)] = '&oelig;'; // Latin Small Ligature OE
263 $trans[chr(159)] = '&Yuml;'; // Latin Capital Letter Y With Diaeresis
264 $html = strtr($html, $trans);
265 }
266 if (!$encoding) {
267 $encoding = 'utf-8';
268 } else {
269 if (strtolower($encoding) != 'utf-8') {
270 $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
271 /*
272 if (function_exists('iconv')) {
273 // iconv appears to handle certain character encodings better than mb_convert_encoding
274 $html = iconv($encoding, 'utf-8', $html);
275 } else {
276 $html = mb_convert_encoding($html, 'utf-8', $encoding);
277 }
278 */
279 }
280 }
281 }
282 return $html;
283 }
284
285 private function makeAbsolute($base, $elem) {
286 $base = new SimplePie_IRI($base);
287 // remove '//' in URL path (used to prevent URLs from resolving properly)
288 // TODO: check if this is still the case
289 if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
290 foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
291 $elems = $elem->getElementsByTagName($tag);
292 for ($i = $elems->length-1; $i >= 0; $i--) {
293 $e = $elems->item($i);
294 //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
295 $this->makeAbsoluteAttr($base, $e, $attr);
296 }
297 if (strtolower($elem->tagName) == $tag) $this->makeAbsoluteAttr($base, $elem, $attr);
298 }
299 }
300
301 private function makeAbsoluteAttr($base, $e, $attr) {
302 if ($e->hasAttribute($attr)) {
303 // Trim leading and trailing white space. I don't really like this but
304 // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
305 $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
306 $url = str_replace(' ', '%20', $url);
307 if (!preg_match('!https?://!i', $url)) {
308 if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
309 $e->setAttribute($attr, $absolute);
310 }
311 }
312 }
313 }
314
315 private function makeAbsoluteStr($base, $url) {
316 $base = new SimplePie_IRI($base);
317 // remove '//' in URL path (causes URLs not to resolve properly)
318 if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
319 if (preg_match('!^https?://!i', $url)) {
320 // already absolute
321 return $url;
322 } else {
323 if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
324 return $absolute;
325 }
326 return false;
327 }
328 }
329
330 // returns single page response, or false if not found
331 private function getSinglePage($item, $html, $url) {
332 global $http, $extractor;
333 $host = @parse_url($url, PHP_URL_HOST);
334 $site_config = SiteConfig::build($host);
335 if ($site_config === false) {
336 // check for fingerprints
337 if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
338 $site_config = SiteConfig::build($_fphost);
339 }
340 if ($site_config === false) $site_config = new SiteConfig();
341 SiteConfig::add_to_cache($host, $site_config);
342 return false;
343 } else {
344 SiteConfig::add_to_cache($host, $site_config);
345 }
346 $splink = null;
347 if (!empty($site_config->single_page_link)) {
348 $splink = $site_config->single_page_link;
349 } elseif (!empty($site_config->single_page_link_in_feed)) {
350 // single page link xpath is targeted at feed
351 $splink = $site_config->single_page_link_in_feed;
352 // so let's replace HTML with feed item description
353 $html = $item->get_description();
354 }
355 if (isset($splink)) {
356 // Build DOM tree from HTML
357 $readability = new PocheReadability($html, $url);
358 $xpath = new DOMXPath($readability->dom);
359 // Loop through single_page_link xpath expressions
360 $single_page_url = null;
361 foreach ($splink as $pattern) {
362 $elems = @$xpath->evaluate($pattern, $readability->dom);
363 if (is_string($elems)) {
364 $single_page_url = trim($elems);
365 break;
366 } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
367 foreach ($elems as $item) {
368 if ($item instanceof DOMElement && $item->hasAttribute('href')) {
369 $single_page_url = $item->getAttribute('href');
370 break;
371 } elseif ($item instanceof DOMAttr && $item->value) {
372 $single_page_url = $item->value;
373 break;
374 }
375 }
376 }
377 }
378 // If we've got URL, resolve against $url
379 if (isset($single_page_url) && ($single_page_url = $this->makeAbsoluteStr($url, $single_page_url))) {
380 // check it's not what we have already!
381 if ($single_page_url != $url) {
382 // it's not, so let's try to fetch it...
383 $_prev_ref = $http->referer;
384 $http->referer = $single_page_url;
385 if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
386 $http->referer = $_prev_ref;
387 return $response;
388 }
389 $http->referer = $_prev_ref;
390 }
391 }
392 }
393 return false;
394 }
395} \ No newline at end of file 31} \ No newline at end of file
diff --git a/inc/poche/global.inc.php b/inc/poche/global.inc.php
index 65a026a7..846699d3 100644
--- a/inc/poche/global.inc.php
+++ b/inc/poche/global.inc.php
@@ -20,25 +20,13 @@ require_once INCLUDES . '/poche/Url.class.php';
20require_once INCLUDES . '/3rdparty/class.messages.php'; 20require_once INCLUDES . '/3rdparty/class.messages.php';
21require_once INCLUDES . '/poche/Poche.class.php'; 21require_once INCLUDES . '/poche/Poche.class.php';
22 22
23require_once INCLUDES . '/3rdparty/Readability.php';
24require_once INCLUDES . '/poche/PocheReadability.php';
25
26require_once INCLUDES . '/3rdparty/Encoding.php';
27require_once INCLUDES . '/poche/Database.class.php'; 23require_once INCLUDES . '/poche/Database.class.php';
28require_once INCLUDES . '/3rdparty/simple_html_dom.php'; 24require_once INCLUDES . '/3rdparty/simple_html_dom.php';
29require_once INCLUDES . '/3rdparty/paginator.php'; 25require_once INCLUDES . '/3rdparty/paginator.php';
30require_once INCLUDES . '/3rdparty/Session.class.php'; 26require_once INCLUDES . '/3rdparty/Session.class.php';
31 27
32require_once INCLUDES . '/3rdparty/simplepie/SimplePieAutoloader.php'; 28require_once INCLUDES . '/3rdparty/libraries/feedwriter/FeedItem.php';
33require_once INCLUDES . '/3rdparty/simplepie/SimplePie/Core.php'; 29require_once INCLUDES . '/3rdparty/libraries/feedwriter/FeedWriter.php';
34require_once INCLUDES . '/3rdparty/content-extractor/ContentExtractor.php';
35require_once INCLUDES . '/3rdparty/content-extractor/SiteConfig.php';
36require_once INCLUDES . '/3rdparty/humble-http-agent/HumbleHttpAgent.php';
37require_once INCLUDES . '/3rdparty/humble-http-agent/SimplePie_HumbleHttpAgent.php';
38require_once INCLUDES . '/3rdparty/humble-http-agent/CookieJar.php';
39require_once INCLUDES . '/3rdparty/feedwriter/FeedItem.php';
40require_once INCLUDES . '/3rdparty/feedwriter/FeedWriter.php';
41require_once INCLUDES . '/3rdparty/feedwriter/DummySingleItemFeed.php';
42require_once INCLUDES . '/3rdparty/FlattrItem.class.php'; 30require_once INCLUDES . '/3rdparty/FlattrItem.class.php';
43 31
44# Composer its autoloader for automatically loading Twig 32# Composer its autoloader for automatically loading Twig