From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- .../site_config/standard/theglobalmail.org.txt | 78 +++++++++++----------- 1 file changed, 39 insertions(+), 39 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/theglobalmail.org.txt (limited to 'inc/3rdparty/site_config/standard/theglobalmail.org.txt') diff --git a/inc/3rdparty/site_config/standard/theglobalmail.org.txt b/inc/3rdparty/site_config/standard/theglobalmail.org.txt old mode 100644 new mode 100755 index fae0fb29..da1c84f9 --- a/inc/3rdparty/site_config/standard/theglobalmail.org.txt +++ b/inc/3rdparty/site_config/standard/theglobalmail.org.txt @@ -1,41 +1,41 @@ -title: //h1[@id="headline"] -author: //div[contains(@class, "editorial-byline-author")]/a -date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") - -# The article body contains a mix or article and non-article elements, so lot of manual tweaks are needed -body: //div[@id="template"] -strip_id_or_class: editorial-byline-pic -strip_id_or_class: editorial-byline -strip_id_or_class: headline - -# Include the leadin paragraph in the body text, but remove quotes because they're out of context -dissolve: //div[contains(@id, "leadin")] -strip_id_or_class: pullquote - -# Image captions removed because they're confusing in body text -strip_id_or_class: image-caption-content - -# Remove header and footer -strip_id_or_class: header -strip_id_or_class: footer - -# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image -strip: /html/body/span[contains(@style, "display: none")] - -# Remove search box -strip_id_or_class: searchContainer -strip: //div[contains(@class, "searchInstruction")] -strip: //div[contains(@class, "searchResults")]/h4 - -# Remove the 'Letters to the Editor' section 
-strip_id_or_class: letter-text -strip_id_or_class: letter-from -strip_id_or_class: letter-date - -# Remove Like/Tweet links -strip_id_or_class: social-tab - -# Remove 'divider' which causes an inexplicable slash to appear in the article body -strip_id_or_class: divider +title: //h1[@id="headline"] +author: //div[contains(@class, "editorial-byline-author")]/a +date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") + +# The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed +body: //div[@id="template"] +strip_id_or_class: editorial-byline-pic +strip_id_or_class: editorial-byline +strip_id_or_class: headline + +# Include the leadin paragraph in the body text, but remove quotes because they're out of context +dissolve: //div[contains(@id, "leadin")] +strip_id_or_class: pullquote + +# Image captions removed because they're confusing in body text +strip_id_or_class: image-caption-content + +# Remove header and footer +strip_id_or_class: header +strip_id_or_class: footer + +# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image +strip: /html/body/span[contains(@style, "display: none")] + +# Remove search box +strip_id_or_class: searchContainer +strip: //div[contains(@class, "searchInstruction")] +strip: //div[contains(@class, "searchResults")]/h4 + +# Remove the 'Letters to the Editor' section +strip_id_or_class: letter-text +strip_id_or_class: letter-from +strip_id_or_class: letter-date + +# Remove Like/Tweet links +strip_id_or_class: social-tab + +# Remove 'divider' which causes an inexplicable slash to appear in the article body +strip_id_or_class: divider test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ \ No newline at end of file -- cgit v1.2.3