From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20L=C5=93uillet?=
Date: Sun, 13 Jul 2014 10:15:40 +0200
Subject: updated specific configuration for parsing

---
 .../site_config/standard/online.wsj.com.txt | 48 +++++++++++-----------
 1 file changed, 25 insertions(+), 23 deletions(-)
 mode change 100644 => 100755 inc/3rdparty/site_config/standard/online.wsj.com.txt

(limited to 'inc/3rdparty/site_config/standard/online.wsj.com.txt')

diff --git a/inc/3rdparty/site_config/standard/online.wsj.com.txt b/inc/3rdparty/site_config/standard/online.wsj.com.txt
old mode 100644
new mode 100755
index edb52855..448bb7e1
--- a/inc/3rdparty/site_config/standard/online.wsj.com.txt
+++ b/inc/3rdparty/site_config/standard/online.wsj.com.txt
@@ -1,23 +1,25 @@
-title: //meta[@property="og:title"]/@content
-body: //div[@id='article_story_body']
-
-author: //h3[@class='byline']/a
-# for slid show content
-body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]
-date: //li[@class='dateStamp']/small
-
-strip_id_or_class: insetFullBracket
-strip_id_or_class: insettipBox
-#strip_id_or_class: legacyInset
-strip_id_or_class: recipeACShopAndBuyText
-
-strip: //div[contains(@class, 'insetContent')]//cite
-strip: //*[contains(@style, 'visibility: hidden;')]
-strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]
-
-prune: no
-tidy: no
-
-test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html
-# slide show
-test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html
\ No newline at end of file
+title: //meta[@property="og:title"]/@content
+body: //div[@id='article_story_body']
+
+author: //h3[@class='byline']/a
+# for slide show content
+body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]
+date: //li[@class='dateStamp']/small
+
+strip_id_or_class: insetFullBracket
+strip_id_or_class: insettipBox
+#strip_id_or_class: legacyInset
+strip_id_or_class: recipeACShopAndBuyText
+
+strip: //div[contains(@class, 'insetContent')]//cite
+strip: //*[contains(@style, 'visibility: hidden;')]
+strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]
+strip: //div[contains(@class, 'carousel')]
+
+prune: no
+tidy: no
+
+test_url: http://online.wsj.com/news/articles/SB10001424052702304626304579509100018004342
+test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html
+# slide show
+test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html
--
cgit v1.2.3