From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- inc/3rdparty/site_config/standard/welt.de.txt | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/welt.de.txt (limited to 'inc/3rdparty/site_config/standard/welt.de.txt') diff --git a/inc/3rdparty/site_config/standard/welt.de.txt b/inc/3rdparty/site_config/standard/welt.de.txt old mode 100644 new mode 100755 index 6e4f828f..42e65e97 --- a/inc/3rdparty/site_config/standard/welt.de.txt +++ b/inc/3rdparty/site_config/standard/welt.de.txt @@ -1,22 +1,22 @@ -# set body -tidy: no -body: //div[contains(@class, 'articleContent')] - -# remove clutter -strip: //div[@class='advertising'] -strip: //div[@class='themenalarm'] -strip: //div[contains(@class, 'inTextTeaser')] - -# remove captions -strip: //span[@class='copyRight'] - -# remove photo galleries and extras -strip: //div[contains(@class, 'textGallery')] -strip: //div[contains(@class, 'videoGallery')] -strip: //div[contains(@class, 'imageGallery')] -strip: //div[contains(@class, 'openContent')] - -# remove comments -strip: //div[@id = 'writeComment'] - +# set body +tidy: no +body: //div[contains(@class, 'articleContent')] + +# remove clutter +strip: //div[@class='advertising'] +strip: //div[@class='themenalarm'] +strip: //div[contains(@class, 'inTextTeaser')] + +# remove captions +strip: //span[@class='copyRight'] + +# remove photo galleries and extras +strip: //div[contains(@class, 'textGallery')] +strip: //div[contains(@class, 'videoGallery')] +strip: //div[contains(@class, 'imageGallery')] +strip: //div[contains(@class, 'openContent')] + +# remove comments +strip: //div[@id = 'writeComment'] + test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html \ No newline at end of file -- cgit v1.2.3