From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- .../site_config/standard/washingtonpost.com.txt | 51 +++++++++++++--------- 1 file changed, 31 insertions(+), 20 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/washingtonpost.com.txt (limited to 'inc/3rdparty/site_config/standard/washingtonpost.com.txt') diff --git a/inc/3rdparty/site_config/standard/washingtonpost.com.txt b/inc/3rdparty/site_config/standard/washingtonpost.com.txt old mode 100644 new mode 100755 index 2931ca5f..0aa9f1d8 --- a/inc/3rdparty/site_config/standard/washingtonpost.com.txt +++ b/inc/3rdparty/site_config/standard/washingtonpost.com.txt @@ -1,21 +1,32 @@ -body: //div[@class="article_body"] -author://meta[@name='DC.creator']/@content -title://meta[@name='title']/@content -date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title -date://meta[@name="DC.date.issued"]/@content -strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] -strip://div[@id="wp-column six end"] -strip://div[contains(@class,'hidden')] -strip://div[@id='article-side-rail'] -strip://div[@class="module component todays-paper-module curved"] -strip://div[@class="module component live-qa curved img-border"] -strip://div[@class="module component newsletter-signup curved"] -strip://div[@class="module featured-stories component curved img-border"] - -strip_id_or_class: carousel -strip_id_or_class: toolbar -strip_id_or_class: module - -test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 -test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html +# Seems to be redirecting to articles.washingtonpost.com for many users + +body: //div[contains(@class, "article_body")] +# print view +body: //div[@id='print_facet']//div[@id='body'] + +author://meta[@name='DC.creator']/@content +title://meta[@name='title']/@content +date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title +date://meta[@name="DC.date.issued"]/@content +strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] +strip://div[@id="wp-column six end"] +strip://div[contains(@class,'hidden')] +strip://div[@id='article-side-rail'] +strip://div[@class="module component todays-paper-module curved"] +strip://div[@class="module component live-qa curved img-border"] +strip://div[@class="module component newsletter-signup curved"] +strip://div[@class="module featured-stories component curved img-border"] + +strip_id_or_class: carousel +strip_id_or_class: toolbar +strip_id_or_class: module + +# Change gJQAwdJG4U_story.html to gJQAwdJG4U_print.html +single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html") + +# [OLD] Change gJQAwdJG4U_story.html to gJQAwdJG4U_story_print.html +#single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_story_print.html") + +test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 +test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html \ No newline at end of file -- cgit v1.2.3