1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
# Full-text extraction rules for spiegel.de articles and photo galleries ("Fotostrecke").
# Changelog:
# A. Niepel, narya.de@...
# - added single_page_link
# - added author for default and single page view
# - added date for single page view
# fforst@...
# - Fixed it
# bode2104@...
# - Fixed single_page_link
# - Included intro text in single page view
# - Added body in default view
# stesie@
# - removed copyright box
# - removed "print more" box
# do not pre-process the page with HTML Tidy
tidy: no
# body in single page view
body: //div[@id="spArticleContent"]
# body in default view (older markup first, then the current article sections, top asset, intro and image boxes)
body: //div[@id="spArticleSection"]
body: //div[contains(@class, 'article-section')] | //div[@id='js-article-top-wide-asset'] | //p[contains(@class, 'article-intro')] | //div[contains(@class, 'js-module-box-image')]
# body in "Fotostrecke"
body: //div[@id="spBigaContent"]
# set date in single page view
date: //div[@id="spArticleContent"]/h3
# strip the date heading so it is not repeated in the body
strip: //div[@id="spArticleContent"]/h3
# set date in "Fotostrecke"
date: //div[@id="spBigaDatum"]
# title in default view
title: //h2[contains(@class, 'article-title')]
# set title in single page view
title: //div[@id='spArticleContent']/h2
# strip title headings so they are not repeated in the body
strip: //div[@id='spArticleContent']/h1
strip: //div[@id='spArticleContent']/h2
# set title in "Fotostrecke"
title: //div[@class='spBigaHeadline']
# set author: prefer the linked author name, otherwise take the text after the "Von " prefix
author: //p[@class="spAuthor"]/a
author: substring-after(//p[@class="spAuthor"], 'Von ')
# strip author line from the body
strip: //p[@class='spAuthor']
# remove captions
strip: //*/span[@class='spPicLayerText']
strip: //*/div[@class='spPanoPlayerPaneControl']
# The sibling rule must run BEFORE the credit box itself is stripped: strip rules
# are applied in order, and once div.spCredit is gone this XPath has no anchor left.
strip: //*/div[@class='spCredit']/following-sibling::p
strip: //*/div[@class='spCredit']
# remove ads
strip: //div[@class='spMInline']
# remove photogalleries and extras
# Same ordering constraint: remove the <br> trailing a gallery while the gallery
# div still exists, then remove the gallery divs.
strip: //div[@class='spPhotoGallery']/following-sibling::br
strip: //div[contains(@class, 'spPhotoGallery')]
# NOTE(review): the next rule is subsumed by the contains(@class,'spAsset') rule below; kept for compatibility
strip: //div[@class='spAssetAlignleft']
strip: //div[contains(@class,'spAsset')]
strip: //br[@clear='all']
# remove community functions
strip: //div[@id='spSocialBookmark']
strip: //div[contains(@class, 'spCommunityBox')]
strip: //div[contains(@class, 'spArticleNewsfeedBox')]
strip: //div[@class='spArticleCredit']
# remove clutter in "Fotostrecke"
strip: //div[@id='spBreadcrumb']
strip: //div[@id='spBigaLatestEntries']
strip: //div[contains(@class, 'spBigaNavi')]
strip: //div[@class='spDottedLine']
# remove "print more" box, copyright box and image buttons
strip: //div[@class='asset-box article-print-more']
strip: //div[@class='article-copyright']
strip: //span[@class='image-buttons']
# Use link to print article for single page view.
# if_page_contains must directly follow single_page_link: it restricts that rule
# to pages that actually show a multi-page pager.
single_page_link: //a[contains(@href, '-druck')]
if_page_contains: //div[contains(@class, 'multi-pager-control')]
# Clean up title in print view (drop the "Druckversion -" prefix)
find_string: <title>Druckversion -
replace_string: <title>
# use next link in "Fotostrecke"
next_page_link: //a[@class='spBigaControlForw']
# article with old-style numeric URL
test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html
# regular article
test_url: http://www.spiegel.de/wirtschaft/soziales/griechenland-was-den-griechischen-buergern-nun-droht-a-1042682.html
# multipage article
test_url: http://www.spiegel.de/spiegel/a-710880.html
|