diff options
Diffstat (limited to 'inc/3rdparty/site_config/standard/faz.net.txt')
-rw-r--r-- | inc/3rdparty/site_config/standard/faz.net.txt | 131 |
1 files changed, 101 insertions, 30 deletions
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt index 4fe5968b..47048a1b 100644 --- a/inc/3rdparty/site_config/standard/faz.net.txt +++ b/inc/3rdparty/site_config/standard/faz.net.txt | |||
@@ -1,30 +1,101 @@ | |||
1 | # Title | 1 | # Author: zinnober |
2 | title: //p[@class='Content HeadlineShort'] | 2 | # Complete rewrite of the faz.net template as the standard one is broken |
3 | 3 | # I tried to consider as many page variants as possible, which was some serious work | |
4 | # Authors | 4 | |
5 | # some are known and have a link, others don't | 5 | tidy: no |
6 | author: substring-after(//span[@class='Autor'], 'Von') | 6 | prune: no |
7 | 7 | ||
8 | # Date | 8 | # Title |
9 | date: //span[@class='Datum'] | 9 | title: //p[@class='Content HeadlineShort'] |
10 | 10 | ||
11 | # Body | 11 | # Set author |
12 | body: //div[@class='Artikel'] | 12 | author: substring-after(//span[@class='Autor'], 'von ') |
13 | 13 | author: //span[@class='caps last']/span[@class='caps last'] | |
14 | # Removements before body text | 14 | author: //a[@rel='author'] |
15 | strip: //div[@class='Breadcrumbs'] | 15 | |
16 | strip: //div[@class='QuickSearchBox'] | 16 | # Set date |
17 | strip: //div[@class='FAZArtikelEinleitung'] | 17 | date: //span[@class='Datum'] |
18 | strip: //div[@class='FAZArtikelReiter'] | 18 | date: //span[@class='Datum'],/span |
19 | strip: //div[@class='clear'] | 19 | |
20 | 20 | # Fetch full multipage articles | |
21 | # General removements | 21 | next_page_link: //a[@title='Nächste Seite'] |
22 | strip: //span[@class='Bildnachweis'] | 22 | |
23 | 23 | # Content is here | |
24 | # Removements after body text | 24 | body: //div[@class='Artikel'] |
25 | strip: //div[@class='ArtikelAbbinder'] | 25 | |
26 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] | 26 | # Tidy up before article |
27 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] | 27 | strip: //div[@id='FAZHeaderNeu'] |
28 | strip: //div[@class='FAZArtikelFunktionen'] | 28 | strip: //h2[@itemprop='headline'] |
29 | strip: //div[@id='FAZContentRight'] | 29 | strip: //span[@class='Datum'] |
30 | test_url: http://www.faz.net/aktuell/gesellschaft/ehe-haltbarkeitsformel-verliebe-dich-oft-verlobe-dich-selten-heirate-vielleicht-11685306.html \ No newline at end of file | 30 | strip: //span[@class='Autor'] |
31 | strip_id_or_class: ArticlePagerTop | ||
32 | |||
33 | # General cleanup | ||
34 | strip: //div[@class='clear'] | ||
35 | strip: //a[@title='Zur Homepage FAZ.NET'] | ||
36 | strip: //iframe | ||
37 | replace_string( · ): | ||
38 | |||
39 | # Remove tracking and ads | ||
40 | strip_image_src: /l.gif? | ||
41 | strip: //div[contains(@style, 'background-image')] | ||
42 | strip: //img[@width='1'] | ||
43 | strip_id_or_class: invisible | ||
44 | strip_id_or_class: Anzeige | ||
45 | strip_id_or_class: billboard | ||
46 | |||
47 | # Remove various text boxes and social media foo | ||
48 | strip_id_or_class: WeitereBeitraege | ||
49 | strip_id_or_class: WBListe | ||
50 | strip_id_or_class: AutorenModul | ||
51 | strip_id_or_class: Community | ||
52 | strip_id_or_class: SocialMediaStatus | ||
53 | strip_id_or_class: RelatedLinkBox | ||
54 | strip_id_or_class: MultimediaNavigation | ||
55 | strip_id_or_class: IndexTitel | ||
56 | |||
57 | # Fix picture caps and pictures (use better resolution and remove clutter) | ||
58 | strip_id_or_class: LightBoxOverlay | ||
59 | strip_id_or_class: exitLarge | ||
60 | strip_id_or_class: PagerBox | ||
61 | strip_id_or_class: Bildnachweis | ||
62 | strip_id_or_class: Bildueberschrift | ||
63 | strip_id_or_class: Bildbeschreibung | ||
64 | strip_id_or_class: ArtikelBild610 | ||
65 | strip_id_or_class: MediaLink | ||
66 | strip_id_or_class: FotoBoxInnerLeft | ||
67 | strip_id_or_class: BilderRelatedLinks | ||
68 | |||
69 | # Remove clutter after article | ||
70 | strip_id_or_class: ArticlePagerBottom | ||
71 | strip_id_or_class: backToHome | ||
72 | strip_id_or_class: ArtikelAbbinder | ||
73 | strip_id_or_class: lesermeinungscontainer | ||
74 | strip_id_or_class: ThemenLinks | ||
75 | strip_id_or_class: rechtehinweis | ||
76 | strip_id_or_class: FAZArtikelMap | ||
77 | strip_id_or_class: FAZArtikelKommentare | ||
78 | strip_id_or_class: ArtikelKommentieren | ||
79 | strip_id_or_class: FAZArtikelFunktionen | ||
80 | strip_id_or_class: mailLB | ||
81 | strip_id_or_class: FAZContentRight | ||
82 | strip_id_or_class: stageModule | ||
83 | strip_id_or_class: ContentFooter | ||
84 | strip_id_or_class: ServicesFooter | ||
85 | strip_id_or_class: FAZFooter | ||
86 | |||
87 | # Clean up stuff present just in some articles | ||
88 | strip_id_or_class: Teaser620 | ||
89 | strip_id_or_class: TeaserMultimedia | ||
90 | strip_id_or_class: VideoBox | ||
91 | |||
92 | # Remove as soon as Wallabag might be able to embed flash video | ||
93 | strip_id_or_class: mmoObjectAsTeaserInArticle | ||
94 | strip_id_or_class: additionalStylesAudioVideo | ||
95 | strip_id_or_class: hideMMElements | ||
96 | |||
97 | # Try it yourself | ||
98 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken | ||
99 | test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html | ||
100 | test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html | ||
101 | |||