From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Sun, 13 Jul 2014 10:15:40 +0200 Subject: updated specific configuration for parsing --- inc/3rdparty/site_config/standard/wired.com.txt | 47 +++++++++++++------------ 1 file changed, 25 insertions(+), 22 deletions(-) mode change 100644 => 100755 inc/3rdparty/site_config/standard/wired.com.txt (limited to 'inc/3rdparty/site_config/standard/wired.com.txt') diff --git a/inc/3rdparty/site_config/standard/wired.com.txt b/inc/3rdparty/site_config/standard/wired.com.txt old mode 100644 new mode 100755 index 69bbf5b7..f5a72d14 --- a/inc/3rdparty/site_config/standard/wired.com.txt +++ b/inc/3rdparty/site_config/standard/wired.com.txt @@ -1,22 +1,25 @@ -title: //meta[@property="og:title"]/@content -title: //h1 -title: //*[@class='posttitle'] -author: //*[@class='entryAuthor']/a[1] -author://*[@class='member-title'] -author://li[@class='author']/a[contains(@href, '/author/')] -date: substring-after(//div[@class='entryAuthor'], '·') -date: substring-before(//*[@class='entryDate'], '|') -body: //div[@class='entry'] -strip: //span[contains(@class, 'nextprev')] -#strip_id_or_class: ngg-galleryoverview -# ngg-galleryoverview is the whole content sometimes, e.g. http://www.wired.com/underwire/2011/12/best-mixtapes-of-2011/?pid=5736&viewall=true - -strip: //p[span[contains(@class, 'contentjump')]] -strip: //text()[contains(., 'nextpage')] - -prune: no - -single_page_link: //a[contains(@href, '/all/1') and contains(@class, 'contentjumpall')] - -test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ -test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/1 \ No newline at end of file +title: //meta[@name='Title']/@content +author: //meta[@name='Author']/@content +date: //meta[@name='DisplayDate']/@content +body: //div[@class='entry'] +strip: //p[contains(., 'Pages:') and contains(., 'View All')] +strip: //p[@class='caption'] +strip: //div[@class='desc' or @class='slide' or @id='slide-info'] + +strip_id_or_class: pullquote +strip_id_or_class: left_rail +strip_id_or_class: related-container +strip_id_or_class: radvert-caption-wrap + +# Remove gallery? +strip_id_or_class: wpgallery + +#strip: //text()[contains(., 'nextpage')] + +prune: no + +single_page_link: //a[.='View All' and contains(@href, '/all/')] + +test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ +test_url: http://www.wired.com/wiredenterprise/2013/09/docker/ +test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/ -- cgit v1.2.3