From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- .../site_config/standard/businessweek.com.txt | 58 +++++++++++----------- 1 file changed, 29 insertions(+), 29 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/businessweek.com.txt (limited to 'inc/3rdparty/site_config/standard/businessweek.com.txt') diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt old mode 100644 new mode 100755 index 7b3d063b..03085593 --- a/inc/3rdparty/site_config/standard/businessweek.com.txt +++ b/inc/3rdparty/site_config/standard/businessweek.com.txt @@ -1,30 +1,30 @@ -# story has several pages, should be detected -body: //div[@id='storyBody'] -body: //div[@id='article_body'] -body: //div[@id='story_body'] - -title://h1[@id='article_headline'] - -# article author -author: //p[@class='author']/a -# story author(s) -author: substring-after(//p[@class='byline'], 'By ') - -# article date -date: //span[@class='published_date'] -# story date -date: //span[@class='date'] - -date: substring-after(//div[contains(@class,'attributor')],'on') -strip_id_or_class: inset -strip: //p/span[@class='photoCredit'] -strip: //h1 - -strip_id_or_class: page_count -strip_id_or_class: tools -strip_id_or_class: pagination - -single_page_link: //li[@id='stPrint']/a - -test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html +# story has several pages, should be detected +body: //div[@id='storyBody'] +body: //div[@id='article_body'] +body: //div[@id='story_body'] + +title://h1[@id='article_headline'] + +# article author +author: //p[@class='author']/a +# story author(s) +author: substring-after(//p[@class='byline'], 'By ') + +# article date +date: //span[@class='published_date'] +# story date +date: //span[@class='date'] + +date: substring-after(//div[contains(@class,'attributor')],'on') +strip_id_or_class: inset +strip: //p/span[@class='photoCredit'] +strip: //h1 + +strip_id_or_class: page_count +strip_id_or_class: tools +strip_id_or_class: pagination + +single_page_link: //li[@id='stPrint']/a + +test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall \ No newline at end of file -- cgit v1.2.3