]>
Commit | Line | Data |
---|---|---|
ac4d1142 NL |
1 | title: //h1[@id="headline"]\r |
2 | author: //div[contains(@class, "editorial-byline-author")]/a\r | |
3 | date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ")\r | |
4 | \r | |
5 | # The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed\r | |
6 | body: //div[@id="template"]\r | |
7 | strip_id_or_class: editorial-byline-pic\r | |
8 | strip_id_or_class: editorial-byline\r | |
9 | strip_id_or_class: headline\r | |
10 | \r | |
11 | # Include the lead-in paragraph in the body text, but remove pull quotes because they're out of context\r | |
12 | dissolve: //div[contains(@id, "leadin")]\r | |
13 | strip_id_or_class: pullquote\r | |
14 | \r | |
15 | # Image captions removed because they're confusing in body text\r | |
16 | strip_id_or_class: image-caption-content\r | |
17 | \r | |
18 | # Remove header and footer\r | |
19 | strip_id_or_class: header\r | |
20 | strip_id_or_class: footer\r | |
21 | \r | |
22 | # Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image\r | |
23 | strip: /html/body/span[contains(@style, "display: none")]\r | |
24 | \r | |
25 | # Remove search box\r | |
26 | strip_id_or_class: searchContainer\r | |
27 | strip: //div[contains(@class, "searchInstruction")]\r | |
28 | strip: //div[contains(@class, "searchResults")]/h4\r | |
29 | \r | |
30 | # Remove the 'Letters to the Editor' section\r | |
31 | strip_id_or_class: letter-text\r | |
32 | strip_id_or_class: letter-from\r | |
33 | strip_id_or_class: letter-date\r | |
34 | \r | |
35 | # Remove Like/Tweet links \r | |
36 | strip_id_or_class: social-tab\r | |
37 | \r | |
38 | # Remove 'divider' which causes an inexplicable slash to appear in the article body\r | |
39 | strip_id_or_class: divider\r | |
40 | ||
41 | test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ |