diff options
Diffstat (limited to 'inc')
-rw-r--r-- | inc/3rdparty/site_config/custom/blogs.faz.net.txt | 45 | ||||
-rw-r--r--[-rwxr-xr-x] | inc/3rdparty/site_config/standard/faz.net.txt | 117 |
2 files changed, 136 insertions, 26 deletions
diff --git a/inc/3rdparty/site_config/custom/blogs.faz.net.txt b/inc/3rdparty/site_config/custom/blogs.faz.net.txt new file mode 100644 index 00000000..4f2626f1 --- /dev/null +++ b/inc/3rdparty/site_config/custom/blogs.faz.net.txt | |||
@@ -0,0 +1,45 @@ | |||
1 | # Author: zinnober | ||
2 | |||
3 | tidy: no | ||
4 | prune: no | ||
5 | |||
6 | # Set author | ||
7 | author: //a[@rel='author'] | ||
8 | |||
9 | # Set date | ||
10 | date: //span[@class='Datum'] | ||
11 | |||
12 | # Content is here | ||
13 | body: //div[@class='Artikel'] | ||
14 | |||
15 | # Tidy up before article | ||
16 | strip: //div[@id='FAZHeaderNeu'] | ||
17 | strip: //h2[@itemprop='headline'] | ||
18 | strip: //span[@class='Datum'] | ||
19 | strip: //span[@class='Autor'] | ||
20 | strip_id_or_class: ArticlePagerTop | ||
21 | strip: //div[@class='FAZArtikelEinleitung']/h2 | ||
22 | |||
23 | # General cleanup | ||
24 | strip: //div[@class='clear'] | ||
25 | strip: //span[@class='Bildnachweis'] | ||
26 | strip: //iframe | ||
27 | strip_id_or_class: Community | ||
28 | strip: ' · ' | ||
29 | |||
30 | # Remove tracking and ads | ||
31 | strip_image_src: /l.gif? | ||
32 | strip: //img[@width='1'] | ||
33 | strip_id_or_class: invisible | ||
34 | strip_id_or_class: Anzeige | ||
35 | strip_id_or_class: billboard | ||
36 | |||
37 | # Remove clutter after article | ||
38 | strip_id_or_class: Tagline | ||
39 | strip_id_or_class: ArtikelAbbinder | ||
40 | strip_id_or_class: FAZArtikelKommentare | ||
41 | strip_id_or_class: ArtikelKommentieren | ||
42 | strip_id_or_class: FAZContentRight | ||
43 | |||
44 | # Try it yourself | ||
45 | test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/ | ||
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt index d087d2aa..47048a1b 100755..100644 --- a/inc/3rdparty/site_config/standard/faz.net.txt +++ b/inc/3rdparty/site_config/standard/faz.net.txt | |||
@@ -1,36 +1,101 @@ | |||
1 | # Author: zinnober | ||
2 | # Complete rewrite of the faz.net template as the standard one is broken | ||
3 | # I tried to consider as many page variants as possible, which was some serious work | ||
4 | |||
5 | tidy: no | ||
6 | prune: no | ||
7 | |||
1 | # Title | 8 | # Title |
2 | title: //p[@class='Content HeadlineShort'] | 9 | title: //p[@class='Content HeadlineShort'] |
3 | 10 | ||
4 | # Authors | 11 | # Set author |
5 | # some are known and have a link, others don't | 12 | author: substring-after(//span[@class='Autor'], 'von ') |
6 | author: substring-after(//span[@class='Autor'], 'Von') | 13 | author: //span[@class='caps last']/span[@class='caps last'] |
14 | author: //a[@rel='author'] | ||
7 | 15 | ||
8 | # Date | 16 | # Set date |
9 | date: //span[@class='Datum'] | 17 | date: //span[@class='Datum'] |
18 | date: //span[@class='Datum'],/span | ||
19 | |||
20 | # Fetch full multipage articles | ||
21 | next_page_link: //a[@title='Nächste Seite'] | ||
10 | 22 | ||
11 | # Body | 23 | # Content is here |
12 | body: //div[@class='Artikel'] | 24 | body: //div[@class='Artikel'] |
13 | 25 | ||
14 | # Removements before body text | 26 | # Tidy up before article |
15 | strip: //div[@class='Breadcrumbs'] | 27 | strip: //div[@id='FAZHeaderNeu'] |
16 | strip: //div[@class='QuickSearchBox'] | 28 | strip: //h2[@itemprop='headline'] |
17 | strip: //div[@class='FAZArtikelEinleitung'] | 29 | strip: //span[@class='Datum'] |
18 | strip: //div[@class='FAZArtikelReiter'] | 30 | strip: //span[@class='Autor'] |
31 | strip_id_or_class: ArticlePagerTop | ||
32 | |||
33 | # General cleanup | ||
19 | strip: //div[@class='clear'] | 34 | strip: //div[@class='clear'] |
35 | strip: //a[@title='Zur Homepage FAZ.NET'] | ||
36 | strip: //iframe | ||
37 | replace_string( · ): | ||
38 | |||
39 | # Remove tracking and ads | ||
40 | strip_image_src: /l.gif? | ||
41 | strip: //div[contains(@style, 'background-image')] | ||
42 | strip: //img[@width='1'] | ||
43 | strip_id_or_class: invisible | ||
44 | strip_id_or_class: Anzeige | ||
45 | strip_id_or_class: billboard | ||
46 | |||
47 | # Remove various text boxes and social media foo | ||
48 | strip_id_or_class: WeitereBeitraege | ||
49 | strip_id_or_class: WBListe | ||
50 | strip_id_or_class: AutorenModul | ||
51 | strip_id_or_class: Community | ||
52 | strip_id_or_class: SocialMediaStatus | ||
53 | strip_id_or_class: RelatedLinkBox | ||
54 | strip_id_or_class: MultimediaNavigation | ||
55 | strip_id_or_class: IndexTitel | ||
56 | |||
57 | # Fix picture caps and pictures (use better resolution and remove clutter) | ||
58 | strip_id_or_class: LightBoxOverlay | ||
59 | strip_id_or_class: exitLarge | ||
60 | strip_id_or_class: PagerBox | ||
61 | strip_id_or_class: Bildnachweis | ||
62 | strip_id_or_class: Bildueberschrift | ||
63 | strip_id_or_class: Bildbeschreibung | ||
64 | strip_id_or_class: ArtikelBild610 | ||
65 | strip_id_or_class: MediaLink | ||
66 | strip_id_or_class: FotoBoxInnerLeft | ||
67 | strip_id_or_class: BilderRelatedLinks | ||
68 | |||
69 | # Remove clutter after article | ||
70 | strip_id_or_class: ArticlePagerBottom | ||
71 | strip_id_or_class: backToHome | ||
72 | strip_id_or_class: ArtikelAbbinder | ||
73 | strip_id_or_class: lesermeinungscontainer | ||
74 | strip_id_or_class: ThemenLinks | ||
75 | strip_id_or_class: rechtehinweis | ||
76 | strip_id_or_class: FAZArtikelMap | ||
77 | strip_id_or_class: FAZArtikelKommentare | ||
78 | strip_id_or_class: ArtikelKommentieren | ||
79 | strip_id_or_class: FAZArtikelFunktionen | ||
80 | strip_id_or_class: mailLB | ||
81 | strip_id_or_class: FAZContentRight | ||
82 | strip_id_or_class: stageModule | ||
83 | strip_id_or_class: ContentFooter | ||
84 | strip_id_or_class: ServicesFooter | ||
85 | strip_id_or_class: FAZFooter | ||
86 | |||
87 | # Clean up stuff present just in some articles | ||
88 | strip_id_or_class: Teaser620 | ||
89 | strip_id_or_class: TeaserMultimedia | ||
90 | strip_id_or_class: VideoBox | ||
91 | |||
92 | # Remove as soon as Wallabag might be able to embed flash video | ||
93 | strip_id_or_class: mmoObjectAsTeaserInArticle | ||
94 | strip_id_or_class: additionalStylesAudioVideo | ||
95 | strip_id_or_class: hideMMElements | ||
96 | |||
97 | # Try it yourself | ||
98 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken | ||
99 | test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html | ||
100 | test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html | ||
20 | 101 | ||
21 | # General removements | ||
22 | strip: //span[@class='Bildnachweis'] | ||
23 | strip: //img[@class='MediaIcon'] | ||
24 | strip: //div[@class='ArtikelMediaLink'] | ||
25 | dissolve: //a[img] | ||
26 | |||
27 | # Removements after body text | ||
28 | strip: //div[@class='ArtikelAbbinder'] | ||
29 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] | ||
30 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] | ||
31 | strip: //div[@class='FAZArtikelFunktionen'] | ||
32 | strip: //div[@id='FAZContentRight'] | ||
33 | |||
34 | # Fix picture captions | ||
35 | wrap_in(small): //span[@class='Bildunterschrift']/text() | ||
36 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken \ No newline at end of file | ||