From ac4d114214d820b20e18518a2dbc809337e39043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Fri, 6 Dec 2013 10:13:03 +0100 Subject: [add] new specific configuration files --- inc/3rdparty/site_config/standard/bostonglobe.com.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 inc/3rdparty/site_config/standard/bostonglobe.com.txt (limited to 'inc/3rdparty/site_config/standard/bostonglobe.com.txt') diff --git a/inc/3rdparty/site_config/standard/bostonglobe.com.txt b/inc/3rdparty/site_config/standard/bostonglobe.com.txt new file mode 100644 index 00000000..d3e6f43f --- /dev/null +++ b/inc/3rdparty/site_config/standard/bostonglobe.com.txt @@ -0,0 +1,16 @@ +# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. + +title: //div[@class="header"]/h1 +author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") +date: //div[@class="byline"]/p[last()] +body: //div[@class="article-body"] + +strip_id_or_class: aside +strip_id_or_class: promo +strip_id_or_class: skip-nav +strip_id_or_class: article-more +strip_id_or_class: article-bar + +# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. +strip_id_or_class: figure +test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html \ No newline at end of file -- cgit v1.2.3