about summary refs log tree commit diff homepage
path: root/inc/3rdparty
diff options
context:
space:
mode:
Diffstat (limited to 'inc/3rdparty')
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/libraries/readability/Readability.php17
-rw-r--r--inc/3rdparty/site_config/custom/blogs.faz.net.txt45
-rw-r--r--inc/3rdparty/site_config/standard/.about.com.txt14
-rwxr-xr-xinc/3rdparty/site_config/standard/dn.pt.txt9
-rw-r--r--[-rwxr-xr-x]inc/3rdparty/site_config/standard/faz.net.txt117
-rwxr-xr-xinc/3rdparty/site_config/standard/habrahabr.ru.txt21
6 files changed, 195 insertions, 28 deletions
diff --git a/inc/3rdparty/libraries/readability/Readability.php b/inc/3rdparty/libraries/readability/Readability.php
index d0f09d74..4fa3ba63 100644..100755
--- a/inc/3rdparty/libraries/readability/Readability.php
+++ b/inc/3rdparty/libraries/readability/Readability.php
@@ -679,6 +679,7 @@ class Readability
679 } else { 679 } else {
680 $topCandidate->innerHTML = $page->documentElement->innerHTML; 680 $topCandidate->innerHTML = $page->documentElement->innerHTML;
681 $page->documentElement->innerHTML = ''; 681 $page->documentElement->innerHTML = '';
682 $this->reinitBody();
682 $page->documentElement->appendChild($topCandidate); 683 $page->documentElement->appendChild($topCandidate);
683 } 684 }
684 } else { 685 } else {
@@ -794,8 +795,7 @@ class Readability
794 { 795 {
795 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7 796 // TODO: find out why element disappears sometimes, e.g. for this URL http://www.businessinsider.com/6-hedge-fund-etfs-for-average-investors-2011-7
796 // in the meantime, we check and create an empty element if it's not there. 797 // in the meantime, we check and create an empty element if it's not there.
797 if (!isset($this->body->childNodes)) $this->body = $this->dom->createElement('body'); 798 $this->reinitBody();
798 $this->body->innerHTML = $this->bodyCache;
799 799
800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) { 800 if ($this->flagIsActive(self::FLAG_STRIP_UNLIKELYS)) {
801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS); 801 $this->removeFlag(self::FLAG_STRIP_UNLIKELYS);
@@ -1134,5 +1134,18 @@ class Readability
1134 public function removeFlag($flag) { 1134 public function removeFlag($flag) {
1135 $this->flags = $this->flags & ~$flag; 1135 $this->flags = $this->flags & ~$flag;
1136 } 1136 }
1137
1138 /**
1139 * Will recreate previously deleted body property
1140 *
1141 * @return void
1142 */
1143 protected function reinitBody() {
1144 if (!isset($this->body->childNodes)) {
1145 $this->body = $this->dom->createElement('body');
1146 $this->body->innerHTML = $this->bodyCache;
1147 }
1148 }
1149
1137} 1150}
1138?> \ No newline at end of file 1151?> \ No newline at end of file
diff --git a/inc/3rdparty/site_config/custom/blogs.faz.net.txt b/inc/3rdparty/site_config/custom/blogs.faz.net.txt
new file mode 100644
index 00000000..4f2626f1
--- /dev/null
+++ b/inc/3rdparty/site_config/custom/blogs.faz.net.txt
@@ -0,0 +1,45 @@
1# Author: zinnober
2
3tidy: no
4prune: no
5
6# Set author
7author: //a[@rel='author']
8
9# Set date
10date: //span[@class='Datum']
11
12# Content is here
13body: //div[@class='Artikel']
14
15# Tidy up before article
16strip: //div[@id='FAZHeaderNeu']
17strip: //h2[@itemprop='headline']
18strip: //span[@class='Datum']
19strip: //span[@class='Autor']
20strip_id_or_class: ArticlePagerTop
21strip: //div[@class='FAZArtikelEinleitung']/h2
22
23# General cleanup
24strip: //div[@class='clear']
25strip: //span[@class='Bildnachweis']
26strip: //iframe
27strip_id_or_class: Community
28strip: ' · '
29
30# Remove tracking and ads
31strip_image_src: /l.gif?
32strip: //img[@width='1']
33strip_id_or_class: invisible
34strip_id_or_class: Anzeige
35strip_id_or_class: billboard
36
37# Remove clutter after article
38strip_id_or_class: Tagline
39strip_id_or_class: ArtikelAbbinder
40strip_id_or_class: FAZArtikelKommentare
41strip_id_or_class: ArtikelKommentieren
42strip_id_or_class: FAZContentRight
43
44# Try it yourself
45test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/
diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt
new file mode 100644
index 00000000..e1ebaee3
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/.about.com.txt
@@ -0,0 +1,14 @@
1body: //div[@id='articlebody']
2title: //h1
3author: //p[@id='by']//a
4
5next_page_link: //span[@class='next']/a
6# Not the same as below!
7
8prune: yes
9tidy: no
10
11# Annoying 'next' links plainly inside the article body
12strip: //*[text()[contains(.,'Next: ')]]
13
14test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm
diff --git a/inc/3rdparty/site_config/standard/dn.pt.txt b/inc/3rdparty/site_config/standard/dn.pt.txt
new file mode 100755
index 00000000..051b8cb9
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/dn.pt.txt
@@ -0,0 +1,9 @@
1single_page_link: concat('http://www.dn.pt/Common/print.aspx?content_id=', //input[@type='hidden' and @name='link-comments']/@value)
2#<input type="hidden" name="link-comments" class="link-comments" value="3972244">
3
4title: //h1
5author: //div[@class="Author"]
6
7strip: //div[@class="Patrocinio"]
8
9test_url: http://www.dn.pt/inicio/opiniao/interior.aspx?content_id=3972244&seccao=Alberto%20Gon%E7alves&tag=Opini%E3o%20-%20Em%20Foco&page=1 \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt
index d087d2aa..47048a1b 100755..100644
--- a/inc/3rdparty/site_config/standard/faz.net.txt
+++ b/inc/3rdparty/site_config/standard/faz.net.txt
@@ -1,36 +1,101 @@
1# Author: zinnober
2# Complete rewrite of the faz.net template as the standard one is broken
3# I tried to consider as many page variants as possible, which was some serious work
4
5tidy: no
6prune: no
7
1# Title 8# Title
2title: //p[@class='Content HeadlineShort'] 9title: //p[@class='Content HeadlineShort']
3 10
4# Authors 11# Set author
5# some are known and have a link, others don't 12author: substring-after(//span[@class='Autor'], 'von ')
6author: substring-after(//span[@class='Autor'], 'Von') 13author: //span[@class='caps last']/span[@class='caps last']
14author: //a[@rel='author']
7 15
8# Date 16# Set date
9date: //span[@class='Datum'] 17date: //span[@class='Datum']
18date: //span[@class='Datum'],/span
19
20# Fetch full multipage articles
21next_page_link: //a[@title='Nächste Seite']
10 22
11# Body 23# Content is here
12body: //div[@class='Artikel'] 24body: //div[@class='Artikel']
13 25
14# Removements before body text 26# Tidy up before article
15strip: //div[@class='Breadcrumbs'] 27strip: //div[@id='FAZHeaderNeu']
16strip: //div[@class='QuickSearchBox'] 28strip: //h2[@itemprop='headline']
17strip: //div[@class='FAZArtikelEinleitung'] 29strip: //span[@class='Datum']
18strip: //div[@class='FAZArtikelReiter'] 30strip: //span[@class='Autor']
31strip_id_or_class: ArticlePagerTop
32
33# General cleanup
19strip: //div[@class='clear'] 34strip: //div[@class='clear']
35strip: //a[@title='Zur Homepage FAZ.NET']
36strip: //iframe
37replace_string( · ):
38
39# Remove tracking and ads
40strip_image_src: /l.gif?
41strip: //div[contains(@style, 'background-image')]
42strip: //img[@width='1']
43strip_id_or_class: invisible
44strip_id_or_class: Anzeige
45strip_id_or_class: billboard
46
47# Remove various text boxes and social media foo
48strip_id_or_class: WeitereBeitraege
49strip_id_or_class: WBListe
50strip_id_or_class: AutorenModul
51strip_id_or_class: Community
52strip_id_or_class: SocialMediaStatus
53strip_id_or_class: RelatedLinkBox
54strip_id_or_class: MultimediaNavigation
55strip_id_or_class: IndexTitel
56
57# Fix picture caps and pictures (use better resolution and remove clutter)
58strip_id_or_class: LightBoxOverlay
59strip_id_or_class: exitLarge
60strip_id_or_class: PagerBox
61strip_id_or_class: Bildnachweis
62strip_id_or_class: Bildueberschrift
63strip_id_or_class: Bildbeschreibung
64strip_id_or_class: ArtikelBild610
65strip_id_or_class: MediaLink
66strip_id_or_class: FotoBoxInnerLeft
67strip_id_or_class: BilderRelatedLinks
68
69# Remove clutter after article
70strip_id_or_class: ArticlePagerBottom
71strip_id_or_class: backToHome
72strip_id_or_class: ArtikelAbbinder
73strip_id_or_class: lesermeinungscontainer
74strip_id_or_class: ThemenLinks
75strip_id_or_class: rechtehinweis
76strip_id_or_class: FAZArtikelMap
77strip_id_or_class: FAZArtikelKommentare
78strip_id_or_class: ArtikelKommentieren
79strip_id_or_class: FAZArtikelFunktionen
80strip_id_or_class: mailLB
81strip_id_or_class: FAZContentRight
82strip_id_or_class: stageModule
83strip_id_or_class: ContentFooter
84strip_id_or_class: ServicesFooter
85strip_id_or_class: FAZFooter
86
87# Clean up stuff present just in some articles
88strip_id_or_class: Teaser620
89strip_id_or_class: TeaserMultimedia
90strip_id_or_class: VideoBox
91
92# Remove as soon as Wallabag maight be able to embed flash video
93strip_id_or_class: mmoObjectAsTeaserInArticle
94strip_id_or_class: additionalStylesAudioVideo
95strip_id_or_class: hideMMElements
96
97# Try it yourself
98test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken
99test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html
100test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html
20 101
21# General removements
22strip: //span[@class='Bildnachweis']
23strip: //img[@class='MediaIcon']
24strip: //div[@class='ArtikelMediaLink']
25dissolve: //a[img]
26
27# Removements after body text
28strip: //div[@class='ArtikelAbbinder']
29strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content']
30strip: //div[@class='FAZArtikelKommentare FAZArtikelContent']
31strip: //div[@class='FAZArtikelFunktionen']
32strip: //div[@id='FAZContentRight']
33
34# Fix picture captions
35wrap_in(small): //span[@class='Bildunterschrift']/text()
36test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/habrahabr.ru.txt b/inc/3rdparty/site_config/standard/habrahabr.ru.txt
new file mode 100755
index 00000000..67538359
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/habrahabr.ru.txt
@@ -0,0 +1,21 @@
1title: //span[@class="post_title"]
2author: //div[@class="author"]
3date: //div[@class="published"]
4
5body: //div[@class='content html_format'] | //div[@id='comments']
6
7strip: //a[@class="link_to_comment"]
8strip: //div[@class="show_tree"]
9strip: //a[@class="to_parent"]
10
11
12replace_string(class="reply_comments"): style="padding-left: 20px"
13replace_string(class="voting "): style="float: right"
14replace_string(src="//habrastorage.org/getpro/habr/avatars/): style="width:24px; height:24px;" class="123" src="//habrastorage.org/getpro/habr/avatars/
15replace_string(class="info "): style="padding-top:5px;font-size:0.85em;line-height:24px;"
16
17
18prune: no
19tidy: no
20
21test_url: http://habrahabr.ru/post/229883/ \ No newline at end of file