blob: b9f9a12b4cffed38b530724ae844c0f71f94bbfd (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
# fforst@...
# Content-extraction rules for cicero.de article pages (see test_url at the end).
# Each directive value below is an XPath expression or an id/class name.
# Fetch the print version of the article (link with class "print") so that
# the whole article is available as a single page.
single_page_link: //a[@class="print"]
# Tidy is switched off for this site.
# NOTE(review): the reason for disabling tidy is not recorded here.
tidy: no
# Article body: the div with class 'artikel-content'.
body: //div[@class='artikel-content']
# Strip the title and subtitle from the body, since they have already been extracted.
strip: //div[@class='issue']
strip: //div[@class='artikel-content']/h2
# Author: matches links whose href contains 'autor?'. Some authors are known
# and have such a link, others don't, so this may match nothing.
author: //a[contains(@href, 'autor?')]
# Article date: the span with class 'article-date'.
date: //span[@class='article-date']
# Strip the element with id or class "author" from the body, since the
# author has already been extracted above.
strip_id_or_class: author
# Strip image captions (image credit and image subtitle fields).
strip_id_or_class: field-name-field-image-credit
strip_id_or_class: field-name-field-article-image-subtitle
# Remove community functions (the 'meta' div and the comments div).
strip: //div[@class='meta']
strip: //div[@id='comments']
# Remove the "continue on the next page" placeholder: paragraphs whose text
# is exactly "[SEITE]" ("Seite" is German for "page").
strip: //p[text()="[SEITE]"]
# Sample article URL for testing these rules.
test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049
|