From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- .../site_config/standard/bostonglobe.com.txt | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/bostonglobe.com.txt (limited to 'inc/3rdparty/site_config/standard/bostonglobe.com.txt') diff --git a/inc/3rdparty/site_config/standard/bostonglobe.com.txt b/inc/3rdparty/site_config/standard/bostonglobe.com.txt old mode 100644 new mode 100755 index d3e6f43f..4c74a34e --- a/inc/3rdparty/site_config/standard/bostonglobe.com.txt +++ b/inc/3rdparty/site_config/standard/bostonglobe.com.txt @@ -1,16 +1,16 @@ -# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. - -title: //div[@class="header"]/h1 -author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") -date: //div[@class="byline"]/p[last()] -body: //div[@class="article-body"] - -strip_id_or_class: aside -strip_id_or_class: promo -strip_id_or_class: skip-nav -strip_id_or_class: article-more -strip_id_or_class: article-bar - -# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. +# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. + +title: //div[@class="header"]/h1 +author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") +date: //div[@class="byline"]/p[last()] +body: //div[@class="article-body"] + +strip_id_or_class: aside +strip_id_or_class: promo +strip_id_or_class: skip-nav +strip_id_or_class: article-more +strip_id_or_class: article-bar + +# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. strip_id_or_class: figure test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html \ No newline at end of file -- cgit v1.2.3