From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- inc/3rdparty/site_config/standard/domusweb.it.txt | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/domusweb.it.txt (limited to 'inc/3rdparty/site_config/standard/domusweb.it.txt') diff --git a/inc/3rdparty/site_config/standard/domusweb.it.txt b/inc/3rdparty/site_config/standard/domusweb.it.txt old mode 100644 new mode 100755 index 81683f02..20566ee3 --- a/inc/3rdparty/site_config/standard/domusweb.it.txt +++ b/inc/3rdparty/site_config/standard/domusweb.it.txt @@ -1,21 +1,21 @@ -# TODO: clean up the extra junk at the end of articles - -# general text formatting -prune: no -convert_double_br_tags:yes - -# where to find the basic metadata -author://a[@class='articleauthor'] -date://a[starts-with(@href,'/en/search/published/')] -title:substring-before(//h2[@class='title'],'—') -body://div[@id='maincontainer'] - -dissolve://div[starts-with(@id,'commentableblock')] - -# clean up the crap -strip://div[contains(@class,'domusnetwork')] -strip://div[contains(@class,'relative_wrapper')] - -strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] +# TODO: clean up the extra junk at the end of articles + +# general text formatting +prune: no +convert_double_br_tags:yes + +# where to find the basic metadata +author://a[@class='articleauthor'] +date://a[starts-with(@href,'/en/search/published/')] +title:substring-before(//h2[@class='title'],'—') +body://div[@id='maincontainer'] + +dissolve://div[starts-with(@id,'commentableblock')] + +# clean up the crap +strip://div[contains(@class,'domusnetwork')] +strip://div[contains(@class,'relative_wrapper')] + +strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] wrap_in(em): 
//div[contains(@class,'captionsubimage')]/span test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/ \ No newline at end of file -- cgit v1.2.3