diff options
author | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
---|---|---|
committer | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
commit | 7f667839764621b5aa01c9db8ce5dde2a29ef18f (patch) | |
tree | 93d8241ee81c87e18494325ae02f0589a8e328a2 /inc/3rdparty/site_config/standard/theglobalmail.org.txt | |
parent | a84f77d6ba15a64ff00453f5d5190c021ce460ed (diff) | |
parent | 2abcccb37180c17318f5226f5d4bc28f30b621ea (diff) | |
download | wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.gz wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.zst wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.zip |
Merge pull request #1 from inthepoche/dev
Dev
Diffstat (limited to 'inc/3rdparty/site_config/standard/theglobalmail.org.txt')
-rw-r--r-- | inc/3rdparty/site_config/standard/theglobalmail.org.txt | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/inc/3rdparty/site_config/standard/theglobalmail.org.txt b/inc/3rdparty/site_config/standard/theglobalmail.org.txt new file mode 100644 index 00000000..fae0fb29 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theglobalmail.org.txt | |||
@@ -0,0 +1,41 @@ | |||
1 | title: //h1[@id="headline"] | ||
2 | author: //div[contains(@class, "editorial-byline-author")]/a | ||
3 | date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") | ||
4 | |||
5 | # The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed | ||
6 | body: //div[@id="template"] | ||
7 | strip_id_or_class: editorial-byline-pic | ||
8 | strip_id_or_class: editorial-byline | ||
9 | strip_id_or_class: headline | ||
10 | |||
11 | # Include the leadin paragraph in the body text, but remove quotes because they're out of context | ||
12 | dissolve: //div[contains(@id, "leadin")] | ||
13 | strip_id_or_class: pullquote | ||
14 | |||
15 | # Image captions removed because they're confusing in body text | ||
16 | strip_id_or_class: image-caption-content | ||
17 | |||
18 | # Remove header and footer | ||
19 | strip_id_or_class: header | ||
20 | strip_id_or_class: footer | ||
21 | |||
22 | # Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image | ||
23 | strip: /html/body/span[contains(@style, "display: none")] | ||
24 | |||
25 | # Remove search box | ||
26 | strip_id_or_class: searchContainer | ||
27 | strip: //div[contains(@class, "searchInstruction")] | ||
28 | strip: //div[contains(@class, "searchResults")]/h4 | ||
29 | |||
30 | # Remove the 'Letters to the Editor' section | ||
31 | strip_id_or_class: letter-text | ||
32 | strip_id_or_class: letter-from | ||
33 | strip_id_or_class: letter-date | ||
34 | |||
35 | # Remove Like/Tweet links | ||
36 | strip_id_or_class: social-tab | ||
37 | |||
38 | # Remove 'divider' which causes an inexplicable slash to appear in the article body | ||
39 | strip_id_or_class: divider | ||
40 | |||
41 | test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ \ No newline at end of file | ||