From ac4d114214d820b20e18518a2dbc809337e39043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20L=C5=93uillet?=
Date: Fri, 6 Dec 2013 10:13:03 +0100
Subject: [add] new specific configuration files

---
 .../site_config/standard/businessweek.com.txt | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 inc/3rdparty/site_config/standard/businessweek.com.txt

(limited to 'inc/3rdparty/site_config/standard/businessweek.com.txt')

diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt
new file mode 100644
index 00000000..7b3d063b
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/businessweek.com.txt
@@ -0,0 +1,30 @@
+# story has several pages, should be detected
+body: //div[@id='storyBody']
+body: //div[@id='article_body']
+body: //div[@id='story_body']
+
+title://h1[@id='article_headline']
+
+# article author
+author: //p[@class='author']/a
+# story author(s)
+author: substring-after(//p[@class='byline'], 'By ')
+
+# article date
+date: //span[@class='published_date']
+# story date
+date: //span[@class='date']
+
+date: substring-after(//div[contains(@class,'attributor')],'on')
+strip_id_or_class: inset
+strip: //p/span[@class='photoCredit']
+strip: //h1
+
+strip_id_or_class: page_count
+strip_id_or_class: tools
+strip_id_or_class: pagination
+
+single_page_link: //li[@id='stPrint']/a
+
+test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html
+test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
\ No newline at end of file
-- 
cgit v1.2.3