diff options
Diffstat (limited to 'inc/3rdparty/site_config')
-rw-r--r-- | inc/3rdparty/site_config/custom/blogs.faz.net.txt | 45 | ||||
-rw-r--r-- | inc/3rdparty/site_config/standard/.about.com.txt | 14 | ||||
-rwxr-xr-x | inc/3rdparty/site_config/standard/dn.pt.txt | 9 | ||||
-rw-r--r--[-rwxr-xr-x] | inc/3rdparty/site_config/standard/faz.net.txt | 117 | ||||
-rwxr-xr-x | inc/3rdparty/site_config/standard/habrahabr.ru.txt | 21 |
5 files changed, 180 insertions, 26 deletions
diff --git a/inc/3rdparty/site_config/custom/blogs.faz.net.txt b/inc/3rdparty/site_config/custom/blogs.faz.net.txt new file mode 100644 index 00000000..4f2626f1 --- /dev/null +++ b/inc/3rdparty/site_config/custom/blogs.faz.net.txt | |||
@@ -0,0 +1,45 @@ | |||
1 | # Author: zinnober | ||
2 | |||
3 | tidy: no | ||
4 | prune: no | ||
5 | |||
6 | # Set author | ||
7 | author: //a[@rel='author'] | ||
8 | |||
9 | # Set date | ||
10 | date: //span[@class='Datum'] | ||
11 | |||
12 | # Content is here | ||
13 | body: //div[@class='Artikel'] | ||
14 | |||
15 | # Tidy up before article | ||
16 | strip: //div[@id='FAZHeaderNeu'] | ||
17 | strip: //h2[@itemprop='headline'] | ||
18 | strip: //span[@class='Datum'] | ||
19 | strip: //span[@class='Autor'] | ||
20 | strip_id_or_class: ArticlePagerTop | ||
21 | strip: //div[@class='FAZArtikelEinleitung']/h2 | ||
22 | |||
23 | # General cleanup | ||
24 | strip: //div[@class='clear'] | ||
25 | strip: //span[@class='Bildnachweis'] | ||
26 | strip: //iframe | ||
27 | strip_id_or_class: Community | ||
28 | strip: ' · ' | ||
29 | |||
30 | # Remove tracking and ads | ||
31 | strip_image_src: /l.gif? | ||
32 | strip: //img[@width='1'] | ||
33 | strip_id_or_class: invisible | ||
34 | strip_id_or_class: Anzeige | ||
35 | strip_id_or_class: billboard | ||
36 | |||
37 | # Remove clutter after article | ||
38 | strip_id_or_class: Tagline | ||
39 | strip_id_or_class: ArtikelAbbinder | ||
40 | strip_id_or_class: FAZArtikelKommentare | ||
41 | strip_id_or_class: ArtikelKommentieren | ||
42 | strip_id_or_class: FAZContentRight | ||
43 | |||
44 | # Try it yourself | ||
45 | test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/ | ||
diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt new file mode 100644 index 00000000..e1ebaee3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/.about.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | body: //div[@id='articlebody'] | ||
2 | title: //h1 | ||
3 | author: //p[@id='by']//a | ||
4 | |||
5 | next_page_link: //span[@class='next']/a | ||
6 | # Not the same as below! | ||
7 | |||
8 | prune: yes | ||
9 | tidy: no | ||
10 | |||
11 | # Annoying 'next' links plainly inside the article body | ||
12 | strip: //*[text()[contains(.,'Next: ')]] | ||
13 | |||
14 | test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm | ||
diff --git a/inc/3rdparty/site_config/standard/dn.pt.txt b/inc/3rdparty/site_config/standard/dn.pt.txt new file mode 100755 index 00000000..051b8cb9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dn.pt.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | single_page_link: concat('http://www.dn.pt/Common/print.aspx?content_id=', //input[@type='hidden' and @name='link-comments']/@value) | ||
2 | #<input type="hidden" name="link-comments" class="link-comments" value="3972244"> | ||
3 | |||
4 | title: //h1 | ||
5 | author: //div[@class="Author"] | ||
6 | |||
7 | strip: //div[@class="Patrocinio"] | ||
8 | |||
9 | test_url: http://www.dn.pt/inicio/opiniao/interior.aspx?content_id=3972244&seccao=Alberto%20Gon%E7alves&tag=Opini%E3o%20-%20Em%20Foco&page=1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt index d087d2aa..47048a1b 100755..100644 --- a/inc/3rdparty/site_config/standard/faz.net.txt +++ b/inc/3rdparty/site_config/standard/faz.net.txt | |||
@@ -1,36 +1,101 @@ | |||
1 | # Author: zinnober | ||
2 | # Complete rewrite of the faz.net template as the standard one is broken | ||
3 | # I tried to consider as many page variants as possible, which was some serious work | ||
4 | |||
5 | tidy: no | ||
6 | prune: no | ||
7 | |||
1 | # Title | 8 | # Title |
2 | title: //p[@class='Content HeadlineShort'] | 9 | title: //p[@class='Content HeadlineShort'] |
3 | 10 | ||
4 | # Authors | 11 | # Set author |
5 | # some are known and have a link, others don't | 12 | author: substring-after(//span[@class='Autor'], 'von ') |
6 | author: substring-after(//span[@class='Autor'], 'Von') | 13 | author: //span[@class='caps last']/span[@class='caps last'] |
14 | author: //a[@rel='author'] | ||
7 | 15 | ||
8 | # Date | 16 | # Set date |
9 | date: //span[@class='Datum'] | 17 | date: //span[@class='Datum'] |
18 | date: //span[@class='Datum'],/span | ||
19 | |||
20 | # Fetch full multipage articles | ||
21 | next_page_link: //a[@title='Nächste Seite'] | ||
10 | 22 | ||
11 | # Body | 23 | # Content is here |
12 | body: //div[@class='Artikel'] | 24 | body: //div[@class='Artikel'] |
13 | 25 | ||
14 | # Removements before body text | 26 | # Tidy up before article |
15 | strip: //div[@class='Breadcrumbs'] | 27 | strip: //div[@id='FAZHeaderNeu'] |
16 | strip: //div[@class='QuickSearchBox'] | 28 | strip: //h2[@itemprop='headline'] |
17 | strip: //div[@class='FAZArtikelEinleitung'] | 29 | strip: //span[@class='Datum'] |
18 | strip: //div[@class='FAZArtikelReiter'] | 30 | strip: //span[@class='Autor'] |
31 | strip_id_or_class: ArticlePagerTop | ||
32 | |||
33 | # General cleanup | ||
19 | strip: //div[@class='clear'] | 34 | strip: //div[@class='clear'] |
35 | strip: //a[@title='Zur Homepage FAZ.NET'] | ||
36 | strip: //iframe | ||
37 | replace_string( · ): | ||
38 | |||
39 | # Remove tracking and ads | ||
40 | strip_image_src: /l.gif? | ||
41 | strip: //div[contains(@style, 'background-image')] | ||
42 | strip: //img[@width='1'] | ||
43 | strip_id_or_class: invisible | ||
44 | strip_id_or_class: Anzeige | ||
45 | strip_id_or_class: billboard | ||
46 | |||
47 | # Remove various text boxes and social media foo | ||
48 | strip_id_or_class: WeitereBeitraege | ||
49 | strip_id_or_class: WBListe | ||
50 | strip_id_or_class: AutorenModul | ||
51 | strip_id_or_class: Community | ||
52 | strip_id_or_class: SocialMediaStatus | ||
53 | strip_id_or_class: RelatedLinkBox | ||
54 | strip_id_or_class: MultimediaNavigation | ||
55 | strip_id_or_class: IndexTitel | ||
56 | |||
57 | # Fix picture caps and pictures (use better resolution and remove clutter) | ||
58 | strip_id_or_class: LightBoxOverlay | ||
59 | strip_id_or_class: exitLarge | ||
60 | strip_id_or_class: PagerBox | ||
61 | strip_id_or_class: Bildnachweis | ||
62 | strip_id_or_class: Bildueberschrift | ||
63 | strip_id_or_class: Bildbeschreibung | ||
64 | strip_id_or_class: ArtikelBild610 | ||
65 | strip_id_or_class: MediaLink | ||
66 | strip_id_or_class: FotoBoxInnerLeft | ||
67 | strip_id_or_class: BilderRelatedLinks | ||
68 | |||
69 | # Remove clutter after article | ||
70 | strip_id_or_class: ArticlePagerBottom | ||
71 | strip_id_or_class: backToHome | ||
72 | strip_id_or_class: ArtikelAbbinder | ||
73 | strip_id_or_class: lesermeinungscontainer | ||
74 | strip_id_or_class: ThemenLinks | ||
75 | strip_id_or_class: rechtehinweis | ||
76 | strip_id_or_class: FAZArtikelMap | ||
77 | strip_id_or_class: FAZArtikelKommentare | ||
78 | strip_id_or_class: ArtikelKommentieren | ||
79 | strip_id_or_class: FAZArtikelFunktionen | ||
80 | strip_id_or_class: mailLB | ||
81 | strip_id_or_class: FAZContentRight | ||
82 | strip_id_or_class: stageModule | ||
83 | strip_id_or_class: ContentFooter | ||
84 | strip_id_or_class: ServicesFooter | ||
85 | strip_id_or_class: FAZFooter | ||
86 | |||
87 | # Clean up stuff present just in some articles | ||
88 | strip_id_or_class: Teaser620 | ||
89 | strip_id_or_class: TeaserMultimedia | ||
90 | strip_id_or_class: VideoBox | ||
91 | |||
92 | # Remove as soon as Wallabag maight be able to embed flash video | ||
93 | strip_id_or_class: mmoObjectAsTeaserInArticle | ||
94 | strip_id_or_class: additionalStylesAudioVideo | ||
95 | strip_id_or_class: hideMMElements | ||
96 | |||
97 | # Try it yourself | ||
98 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken | ||
99 | test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html | ||
100 | test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html | ||
20 | 101 | ||
21 | # General removements | ||
22 | strip: //span[@class='Bildnachweis'] | ||
23 | strip: //img[@class='MediaIcon'] | ||
24 | strip: //div[@class='ArtikelMediaLink'] | ||
25 | dissolve: //a[img] | ||
26 | |||
27 | # Removements after body text | ||
28 | strip: //div[@class='ArtikelAbbinder'] | ||
29 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] | ||
30 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] | ||
31 | strip: //div[@class='FAZArtikelFunktionen'] | ||
32 | strip: //div[@id='FAZContentRight'] | ||
33 | |||
34 | # Fix picture captions | ||
35 | wrap_in(small): //span[@class='Bildunterschrift']/text() | ||
36 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/habrahabr.ru.txt b/inc/3rdparty/site_config/standard/habrahabr.ru.txt new file mode 100755 index 00000000..67538359 --- /dev/null +++ b/inc/3rdparty/site_config/standard/habrahabr.ru.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | title: //span[@class="post_title"] | ||
2 | author: //div[@class="author"] | ||
3 | date: //div[@class="published | ||
4 | |||
5 | body: //div[@class='content html_format'] | //div[@id='comments'] | ||
6 | |||
7 | strip: //a[@class="link_to_comment"] | ||
8 | strip: //div[@class="show_tree"] | ||
9 | strip: //a[@class="to_parent"] | ||
10 | |||
11 | |||
12 | replace_string(class="reply_comments"): style="padding-left: 20px" | ||
13 | replace_string(class="voting "): style="float: right" | ||
14 | replace_string(src="//habrastorage.org/getpro/habr/avatars/): style="width:24px; height:24px;" class="123" src="//habrastorage.org/getpro/habr/avatars/ | ||
15 | replace_string(class="info "): style="padding-top:5px;font-size:0.85em;line-height:24px;" | ||
16 | |||
17 | |||
18 | prune: no | ||
19 | tidy: no | ||
20 | |||
21 | test_url: http://habrahabr.ru/post/229883/ \ No newline at end of file | ||