2 # Should work with "normal" articles as well as with image galleries
# Metadata and body extraction. Each value is an XPath expression evaluated
# against the fetched page.
# Headline: the hcf-headline span directly inside an h1.
7 title: //h1/span[@class='hcf-headline']
# Author: any link carrying rel="author".
10 author: //a[@rel='author']
# Date: span whose class attribute is exactly "date hcf-atlas".
13 date: //span[@class='date hcf-atlas']
15 # Fetch full multipage articles
# Link to the following page: any anchor whose class contains hcf-forward.
16 next_page_link: //a[contains(@class, 'hcf-forward')]
# Article content: any div whose class contains hcf-screen.
20 body: //div[contains(@class, 'hcf-screen')]
22 # Remove tracking and ads
# Each entry names an id/class value; matching elements are removed from the page.
# NOTE(review): all three values contain "hcf-ad"; if matching is by substring
# the first entry may already cover the other two — confirm against the
# Full-Text RSS site-config documentation before consolidating.
23 strip_id_or_class: hcf-ad
24 strip_id_or_class: hcf-autoload-ad
25 strip_id_or_class: hcf-content-ad
27 # Tidy up before article
# Remove the author/date elements from the body (presumably because the
# author and date rules already capture them as metadata).
29 strip_id_or_class: hcf-atlas
30 strip_id_or_class: hcf-author
# NOTE(review): if strip_id_or_class matches by substring, this entry is already
# covered by "hcf-atlas" above — confirm before removing it as well.
# (An exact duplicate of this line was dropped; it had no additional effect.)
31 strip_id_or_class: date hcf-atlas
# Drop any h1 inside the body container (the title rule extracts the headline
# separately) and the list inside the sub-page titles box.
35 strip: //div[contains(@class, 'hcf-screen')]//h1
36 strip: //div[@class='hcf-subpage-titles']//ul
# Remove embedded media, inline galleries, videos, link boxes, mini navigation,
# media controls and hidden elements (judging by the class names).
37 strip_id_or_class: hcf-doctype-media
38 strip_id_or_class: hcf-inline-gallery
39 strip_id_or_class: hcf-doctype-video
40 strip_id_or_class: hcf-links
41 strip_id_or_class: hcf-mini-navi
42 strip_id_or_class: hcf-media-control
43 strip_id_or_class: hcf-hidden
# Render the "Update" badge as bold inline text followed by a colon.
44 replace_string(<span class="hcf-update">Update</span>): <strong>Update: </strong>
46 # Fix pictures and captions
# Literal string replacements on the raw HTML: replace_string(FIND): REPLACEMENT.
# Turn the opening tag of gallery/enlarge link wrappers into a paragraph tag.
# NOTE(review): only the opening "<a" is rewritten; no rule here rewrites the
# matching "</a>" — presumably the HTML parser tolerates this; confirm.
47 replace_string(<a class="hcf-doctype-gallery): <p class="hcf-doctype-gallery
48 replace_string(<a class="hcf-doctype-enlarge): <p class="hcf-doctype-enlarge
# Show figure captions as small italic text on a new line.
49 replace_string(<figcaption class="hcf-caption">): <br><small><em>
50 replace_string(</figcaption>): </em></small>
# Same link-to-paragraph rewrite for "ajaxify" links; the leading space inside
# the matched class attribute is part of the search string.
53 replace_string(<a class=" ajaxify): <p class="ajaxify
# Div-based captions: open small italic text.
# NOTE(review): no rule in this section closes the <small><em> opened here
# (the </figcaption> rule only applies to figcaption markup) — confirm the
# rendered output is acceptable.
54 replace_string(<div class="hcf-caption"><div><p>): <small><em>
# Sample pages for checking this config: two article URLs and one photo
# gallery URL (judging by the "mediacenter/fotostrecken" path).
57 test_url: http://www.tagesspiegel.de/berlin/bezirke/wedding/wedding-jetzt/auf-der-suche-nach-einem-stadtteil-wilder-weiter-wedding/8757156.html
58 test_url: http://www.tagesspiegel.de/berlin/olympia-in-berlin-der-flughafen-tegel-soll-das-olympische-dorf-werden/10645036.html
59 test_url: http://www.tagesspiegel.de/mediacenter/fotostrecken/berlin/bildergalerie-kreuzberger-der-woche/9305534.html