1 title: //h1[@id="headline"]
2 author: //div[contains(@class, "editorial-byline-author")]/a
3 date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ")
5 # The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed
6 body: //div[@id="template"]
7 strip_id_or_class: editorial-byline-pic
8 strip_id_or_class: editorial-byline
9 strip_id_or_class: headline
11 # Include the lead-in paragraph in the body text, but remove pull quotes because they're out of context
12 dissolve: //div[contains(@id, "leadin")]
13 strip_id_or_class: pullquote
15 # Image captions removed because they're confusing in body text
16 strip_id_or_class: image-caption-content
18 # Remove header and footer
19 strip_id_or_class: header
20 strip_id_or_class: footer
22 # Remove the hidden logo that seems to be there to make Facebook show the logo instead of a random article image
23 strip: /html/body/span[contains(@style, "display: none")]
26 strip_id_or_class: searchContainer
27 strip: //div[contains(@class, "searchInstruction")]
28 strip: //div[contains(@class, "searchResults")]/h4
30 # Remove the 'Letters to the Editor' section
31 strip_id_or_class: letter-text
32 strip_id_or_class: letter-from
33 strip_id_or_class: letter-date
35 # Remove Like/Tweet links
36 strip_id_or_class: social-tab
38 # Remove 'divider' which causes an inexplicable slash to appear in the article body
39 strip_id_or_class: divider
41 test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/