From 4e067ceabd705201a16b4c92cf4b23f3b990326c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20L=C5=93uillet?=
Date: Sun, 13 Jul 2014 10:15:40 +0200
Subject: updated specific configuration for parsing

---
 .../site_config/standard/lifehacker.com.txt | 89 ++++++++++++----------
 1 file changed, 47 insertions(+), 42 deletions(-)
 mode change 100644 => 100755 inc/3rdparty/site_config/standard/lifehacker.com.txt

(limited to 'inc/3rdparty/site_config/standard/lifehacker.com.txt')

diff --git a/inc/3rdparty/site_config/standard/lifehacker.com.txt b/inc/3rdparty/site_config/standard/lifehacker.com.txt
old mode 100644
new mode 100755
index 32ade14a..ec97f06c
--- a/inc/3rdparty/site_config/standard/lifehacker.com.txt
+++ b/inc/3rdparty/site_config/standard/lifehacker.com.txt
@@ -1,42 +1,47 @@
-# Adds author text: Gawker sites commonly show as "Author: View Profile"
-author://a[@class="plus-icon modfont"]
-
-# Add date and time
-date: //span[@class="date"]
-
-# Remove date and time from article text
-strip: //span[@class="date"]
-
-# Remove login/comment text
-strip: //*[(@class="presence_control_external smalltype")]
-
-strip: //div[@class="nodebyline modfont"]
-
-# Remove right sidebar
-strip: //div[@id="rightwrapper"]
-
-# Remove print header
-strip: //div[@id='printhead']/h1
-
-# Remove 'content is restricted'
-strip: //div[@id='agegate_IDHERE']
-
-# Remove follow text
-strip: //*[(@class="permalink_ads")]
-
-# Remove view/comment count
-strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line']
-
-# Remove contact text
-strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo']
-
-# Remove medium duplicates of the article image
-strip_image_src: medium.jpg
-
-# Remove "arrow" class at bottom of page
-strip: //p[@class="arrow"]
-
-# Remove "track" image from article body
-strip: //img[@alt="track"]
-test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos
-test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
\ No newline at end of file
+# Adds author text: Gawker sites commonly show as "Author: View Profile"
+author://a[@class="plus-icon modfont"]
+
+# Add date and time
+date: //span[@class="date"]
+
+body: //div[contains(@class, 'marquee-asset-wrapper') or contains(@class, 'post-content')]
+
+# Remove date and time from article text
+strip: //span[@class="date"]
+
+# Remove login/comment text
+strip: //*[(@class="presence_control_external smalltype")]
+
+strip: //div[@class="nodebyline modfont"]
+
+# Remove right sidebar
+strip: //div[@id="rightwrapper"]
+
+# Remove print header
+strip: //div[@id='printhead']/h1
+
+# Remove 'content is restricted'
+strip: //div[@id='agegate_IDHERE']
+
+# Remove follow text
+strip: //*[(@class="permalink_ads")]
+
+strip_id_or_class: inset_groups
+
+# Remove view/comment count
+strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line']
+
+# Remove contact text
+strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo']
+
+# Remove medium duplicates of the article image
+strip_image_src: medium.jpg
+
+# Remove "arrow" class at bottom of page
+strip: //p[@class="arrow"]
+
+# Remove "track" image from article body
+strip: //img[@alt="track"]
+test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos
+test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
+test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314
\ No newline at end of file
--
cgit v1.2.3