From 90a1a78b1e2f4d40e1d9b8e6f46aca129a9d7bcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20L=C5=93uillet?=
Date: Mon, 27 Oct 2014 06:46:13 +0100
Subject: updated site_config

---
 .../site_config/standard/businessweek.com.txt | 41 ++++++++--------------
 1 file changed, 14 insertions(+), 27 deletions(-)

(limited to 'inc/3rdparty/site_config/standard/businessweek.com.txt')

diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt
index 03085593..f546b708 100755
--- a/inc/3rdparty/site_config/standard/businessweek.com.txt
+++ b/inc/3rdparty/site_config/standard/businessweek.com.txt
@@ -1,30 +1,17 @@
-# story has several pages, should be detected
-body: //div[@id='storyBody']
-body: //div[@id='article_body']
-body: //div[@id='story_body']
+# include the lead graphic in the body, if available
+body: //div[contains(concat(' ', normalize-space(@id), ' '), ' lead_graphic ')] | //div[contains(concat(' ', normalize-space(@itemprop), ' '), ' articleBody ')]
+title: //h1[contains(concat(' ', normalize-space(@itemprop), ' '), ' headline ')]
+date: //time[contains(concat(' ', normalize-space(@itemprop), ' '), ' datePublished ')]
 
-title://h1[@id='article_headline']
-
-# article author
-author: //p[@class='author']/a
-# story author(s)
-author: substring-after(//p[@class='byline'], 'By ')
-
-# article date
-date: //span[@class='published_date']
-# story date
-date: //span[@class='date']
-
-date: substring-after(//div[contains(@class,'attributor')],'on')
-strip_id_or_class: inset
-strip: //p/span[@class='photoCredit']
-strip: //h1
-
-strip_id_or_class: page_count
-strip_id_or_class: tools
-strip_id_or_class: pagination
-
-single_page_link: //li[@id='stPrint']/a
+strip_id_or_class: photo_credit
+strip_id_or_class: photo_caption
+strip_id_or_class: inline_gallery
+# pull quote, often inside a blockquote element
+strip_id_or_class: pq
+strip_id_or_class: credit
+strip_id_or_class: figcaption
+strip_id_or_class: related_item
 
 test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html
-test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
\ No newline at end of file
+test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
+test_url: http://www.businessweek.com/articles/2014-07-09/american-apparel-dov-charneys-sleazy-struggle-for-control
-- 
cgit v1.2.3