git.immae.eu Git - github/wallabag/wallabag.git/commitdiff
update config from @fivefilters 1266/head
author: Nicolas Lœuillet <nicolas@loeuillet.org>
Sat, 1 Aug 2015 19:20:43 +0000 (21:20 +0200)
committer: Nicolas Lœuillet <nicolas@loeuillet.org>
Sat, 1 Aug 2015 19:20:43 +0000 (21:20 +0200)
112 files changed:
inc/3rdparty/site_config/standard/20min.ch.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/24.ae.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/9gag.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/ad.nl.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/albayan.ae.txt
inc/3rdparty/site_config/standard/androidpolice.com.txt
inc/3rdparty/site_config/standard/artofmanliness.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/au.businessinsider.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/au.news.yahoo.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/bbc.co.uk.txt
inc/3rdparty/site_config/standard/bbc.com.txt
inc/3rdparty/site_config/standard/blog.cloudflare.com.txt
inc/3rdparty/site_config/standard/blogs.msdn.com.txt
inc/3rdparty/site_config/standard/brandeins.de.txt
inc/3rdparty/site_config/standard/brokernews.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/business.time.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/choice.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/cnet.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/computerbase.de.txt
inc/3rdparty/site_config/standard/computerworld.com.txt
inc/3rdparty/site_config/standard/contrepoints.org.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/cooper.com.txt
inc/3rdparty/site_config/standard/cwnp.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/dailymail.co.uk.txt
inc/3rdparty/site_config/standard/dailytelegraph.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/deadspin.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/derbund.ch.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/designbuildsource.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/dilbert.com.txt
inc/3rdparty/site_config/standard/dn.se.txt
inc/3rdparty/site_config/standard/economie.gouv.fr.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/entwickler.de.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/explosm.net.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/facebook.com.txt
inc/3rdparty/site_config/standard/fastcompany.com.txt
inc/3rdparty/site_config/standard/fok.nl.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/forsvaret.no.txt
inc/3rdparty/site_config/standard/france24.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/galwayindependent.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/gameblog.fr.txt
inc/3rdparty/site_config/standard/gawker.com.txt
inc/3rdparty/site_config/standard/getpocket.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/gist.github.com.txt
inc/3rdparty/site_config/standard/gizmodo.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/gizmodo.com.txt
inc/3rdparty/site_config/standard/globalgrind.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/gocomics.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/help.fivefilters.org.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/heraldsun.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/hiiraan.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/independent.co.uk.txt
inc/3rdparty/site_config/standard/io9.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/ippmedia.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/itnews.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/jalopnik.com.txt
inc/3rdparty/site_config/standard/jezebel.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/kotaku.com.txt
inc/3rdparty/site_config/standard/lefigaro.fr.txt
inc/3rdparty/site_config/standard/lifehacker.com.txt
inc/3rdparty/site_config/standard/linuxjournal.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/m.bbc.co.uk.txt
inc/3rdparty/site_config/standard/m.facebook.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/m.theregister.co.uk.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/marketingmag.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/medium.com.txt
inc/3rdparty/site_config/standard/menshealth.com.sg.txt
inc/3rdparty/site_config/standard/mitchellrepublic.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/mobile.nytimes.com.txt
inc/3rdparty/site_config/standard/moneymanagement.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/nbnnews.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/news.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/news.menshealth.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/news.ninemsn.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/parool.nl.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/perthnow.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/planetsave.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/presseportal.de.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/quora.com.txt
inc/3rdparty/site_config/standard/reddit.com.txt
inc/3rdparty/site_config/standard/rockpapershotgun.com.txt
inc/3rdparty/site_config/standard/saadaalnews.net.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/smh.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/smh.drive.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/smithsonianmag.com.txt
inc/3rdparty/site_config/standard/snip.ly.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/soundcity.tv.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/spiegel.de.txt
inc/3rdparty/site_config/standard/srf.ch.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/sueddeutsche.de.txt
inc/3rdparty/site_config/standard/sunshinecoastdaily.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/sz.de.txt
inc/3rdparty/site_config/standard/tagesanzeiger.ch.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/tagesschau.de.txt
inc/3rdparty/site_config/standard/taz.de.txt
inc/3rdparty/site_config/standard/theatlantic.com.txt
inc/3rdparty/site_config/standard/theaustralian.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/thebostonchannel.com.txt
inc/3rdparty/site_config/standard/theguardian.com.txt
inc/3rdparty/site_config/standard/theregister.co.uk.txt
inc/3rdparty/site_config/standard/theverge.com.txt
inc/3rdparty/site_config/standard/titanic-magazin.de.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/truongtx.me.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/utdailybeacon.com.txt
inc/3rdparty/site_config/standard/volkskrant.nl.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/warnerbros.fr.txt
inc/3rdparty/site_config/standard/washingtonpost.com.txt
inc/3rdparty/site_config/standard/watoday.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/weeklytimesnow.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/westernadvocate.com.au.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/wsj.com.txt [new file with mode: 0755]
inc/3rdparty/site_config/standard/yourerie.com.txt
inc/3rdparty/site_config/standard/zeit.de.txt

diff --git a/inc/3rdparty/site_config/standard/20min.ch.txt b/inc/3rdparty/site_config/standard/20min.ch.txt
new file mode 100755 (executable)
index 0000000..cd8e3fc
--- /dev/null
@@ -0,0 +1,24 @@
+# Author: cirnod@gmail.com
+
+tidy: no
+prune: no
+
+title: //h1
+date: /html/body/div[3]/div[1]/div[6]/div/div[1]/div[2]/div[1]/div/p
+body: //div[@class='published clearfix'] | //div[@class='story_titles']/h3 | //div[@class='story_text']
+
+# General Cleanup
+strip_id_or_class: info_panel 
+strip_id_or_class: info_poll 
+strip_id_or_class: teaser 
+strip_id_or_class: panelbox
+strip_id_or_class: polls
+strip_id_or_class: warning 
+strip_id_or_class: vplaceholder
+
+# visual removal only -> complete removal doesn't work
+replace_string(Print</a>): </a>
+
+# Try yourself
+test_url: http://www.20min.ch/wissen/news/story/31588952
+test_url: http://www.20min.ch/digital/dossier/apple/story/So-einfach-laesst-sich-das-iPhone-6-Plus-verbiegen-24651169
diff --git a/inc/3rdparty/site_config/standard/24.ae.txt b/inc/3rdparty/site_config/standard/24.ae.txt
new file mode 100755 (executable)
index 0000000..6e51507
--- /dev/null
@@ -0,0 +1,8 @@
+title: //div[@id='DivTitle']
+body: //div[@id='divImages' or @id='Divkhabarcontent']
+author: //div[@id='DivAuthor']
+
+prune: no
+
+test_url: http://24.ae/article.aspx?ArticleId=123304
+test_url: http://24.ae/rss.aspx?pageId=30
diff --git a/inc/3rdparty/site_config/standard/9gag.com.txt b/inc/3rdparty/site_config/standard/9gag.com.txt
new file mode 100755 (executable)
index 0000000..4ebb62a
--- /dev/null
@@ -0,0 +1,6 @@
+# Generated by FiveFilters.org's web-based selection tool
+# Place this file inside your site_config/custom/ folder
+# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2F9gag.com%2Fgag%2FaDwQnO7
+
+body: //div[contains(concat(' ',normalize-space(@class),' '),' badge-post-container ')]
+test_url: http://9gag.com/gag/aDwQnO7
diff --git a/inc/3rdparty/site_config/standard/ad.nl.txt b/inc/3rdparty/site_config/standard/ad.nl.txt
new file mode 100755 (executable)
index 0000000..422faa5
--- /dev/null
@@ -0,0 +1,7 @@
+#bypass cookie check
+single_page_link: //a[contains(@href, '/acceptCookieCheck.do?url=')]
+
+test_url: http://www.ad.nl/ad/nl/10444/Offside/article/detail/4043834/2015/05/31/Dani-Alves-voetbalt-met-drol-op-zijn-hoofd.dhtml
+test_contains: De nieuwe coupe van Alves
+
+test_url: http://www.ad.nl/digitaal/rss.xml
\ No newline at end of file
index f6c093d2bcc10b2c8cd8501939369dc77e2f8cd0..d52700b3739ccad49ea7d01cacafdb79343e892e 100755 (executable)
@@ -1,5 +1,7 @@
 body: //div[@id='main-column']//div[@class='content']
 
+strip_id_or_class: social-buttons
+
 prune: no
 
 test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645
index 8f9b1a216881b9e8b111ed5925d18c07e43e9940..660f29d9116bdfcbdf02a222e612c1430e7a778f 100755 (executable)
@@ -1,5 +1,6 @@
 body: //div[@class='post_content']
 date: //div[@class='date_day'] | div[@class='date_month']
+strip_id_or_class: author-box
+author: //h2[@class='author-box-heading']/a
 
 test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/
-
diff --git a/inc/3rdparty/site_config/standard/artofmanliness.com.txt b/inc/3rdparty/site_config/standard/artofmanliness.com.txt
new file mode 100755 (executable)
index 0000000..b29ea0d
--- /dev/null
@@ -0,0 +1,6 @@
+parser: html5php
+date: //article/p[contains(@class, 'single-date')]
+author: //article/p[contains(@class, 'byline')]
+
+test_url: http://www.artofmanliness.com/2013/01/31/relationship-red-flags/
+test_contains: It seems that once we get close to a person
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/au.businessinsider.com.txt b/inc/3rdparty/site_config/standard/au.businessinsider.com.txt
new file mode 100755 (executable)
index 0000000..46bcddf
--- /dev/null
@@ -0,0 +1,12 @@
+title://div[@class="sl-layout-post"]/h1
+body: //div[@id='content_post']
+strip: //div[contains(@class, "post-sidebar")]
+strip: //div[@id='related-links']
+strip: //img[@class='size_xlarge']
+author://div[@class="byline"]/a
+date://div[@class="byline"]/span[@class="date"]
+prune: no
+tidy: no
+
+
+test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1
diff --git a/inc/3rdparty/site_config/standard/au.news.yahoo.com.txt b/inc/3rdparty/site_config/standard/au.news.yahoo.com.txt
new file mode 100755 (executable)
index 0000000..8e84cbb
--- /dev/null
@@ -0,0 +1,4 @@
+strip: //a[contains(text(), "RELATED:")]
+author: //div[@class="info"]//span[@class="association printer-source"]
+date: //div[@class="info"]//span[@class="stamp printer-date"]
+
index bad77654b02c29981afcbb54079f77d60ab4096e..7bef73aded09f6e33ed684a365f906fcd12ad495 100755 (executable)
@@ -30,6 +30,12 @@ strip: //div[contains(@class, 'comment-introduction')]
 strip: //div[contains(@class, 'share-tools')]
 strip: //div[@id='also-related-links']
 
+find_string: http://ichef.bbci.co.uk/news/200/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
+find_string: http://ichef.bbci.co.uk/news/304/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
 strip_id_or_class: share-help
 strip_id_or_class: comments_module
 
index c04a683e985d90ba94001a575daa7ea9b8c0e0de..200dba63c8e28b58393631129b20e2938f57b460 100755 (executable)
@@ -33,6 +33,12 @@ strip: //div[@id='also-related-links']
 strip_id_or_class: share-help
 strip_id_or_class: comments_module
 
+find_string: http://ichef.bbci.co.uk/news/200/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
+find_string: http://ichef.bbci.co.uk/news/304/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
 replace_string(<noscript>): <div>
 replace_string(</noscript>): </div>
 
index 9b7cf25c3eb0d65328969c699267491f369221d3..db80a35f71277f98a8d637169a0e76ae3221f855 100755 (executable)
@@ -3,11 +3,7 @@ title: substring-before(//title, '-')
 
 author: //a[ contains(@href, '/people') ]
 
-body: //article[contains(concat(' ',normalize-space(@class),' '),' post ')]
-
-strip_id_or_class: section learn-more
-strip_id_or_class: section comments
-strip_id_or_class: disqus_thread
+body: //div[ @class='post' ]
 
 # Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
-test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
+test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
\ No newline at end of file
index b2ff8332b3bac9ee9358b9440c0205416932103c..11b8d42d1cf83b070bce6143f52779eb9de15f19 100755 (executable)
@@ -1,6 +1,6 @@
 title: //h3[@class="post-name"]
 author: //span[@class="user-name"]
-date: //div[@class="post-date"]
+date: //div[@class="post-date"]/span[@class="value"]
 body: //div[@class="post-content user-defined-markup"]
 footnotes: no
-test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx
\ No newline at end of file
+test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx
index 36aa2efac6437a41b59f53f2dc6ea81f5cce0caf..be3263469745716bd810b52bb9eda643151fbe41 100755 (executable)
@@ -1,7 +1,9 @@
-# set body
-body: //div[@id='theContent']
 
-# set title
-title: //div[@id='theContent']/h3
-strip: //div[@id='theContent']/h3
-test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html
\ No newline at end of file
+body: //div[@class="articleTeaser"] | //section[@class="contentSection"]
+
+strip: //section[@class="greenBox italic"]
+
+author: //div[@class="articleAuthor"]
+# no publish date on page (the articles are from a monthly periodical)
+
+test_url: http://www.brandeins.de/archiv/2015/fuehrung/ministry-group-mach-doch-mal-ne-ansage/
diff --git a/inc/3rdparty/site_config/standard/brokernews.com.au.txt b/inc/3rdparty/site_config/standard/brokernews.com.au.txt
new file mode 100755 (executable)
index 0000000..814da38
--- /dev/null
@@ -0,0 +1,2 @@
+author: //span[@itemprop="author"]
+date: //span[@itemprop="datePublished"]
diff --git a/inc/3rdparty/site_config/standard/business.time.com.txt b/inc/3rdparty/site_config/standard/business.time.com.txt
new file mode 100755 (executable)
index 0000000..5502bea
--- /dev/null
@@ -0,0 +1,17 @@
+# 2011-10-25 - carlo@... - Initial setup.
+
+single_page_link: //li[@class='print']/a/@href
+
+title: //h1
+author: //meta[@name="byline"]/@content
+date: //meta[@name="date"]/@content
+
+strip: //span[@class="see"]
+strip: //div[@class="byline"]
+strip: //div[@id="date2"]
+strip: //h1
+strip: //div[@class='post-rail-ad']
+strip: //div[@class='post-rail-content']
+strip: //aside[@class='post-rail']
+
+test_url: http://www.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html
diff --git a/inc/3rdparty/site_config/standard/choice.com.au.txt b/inc/3rdparty/site_config/standard/choice.com.au.txt
new file mode 100755 (executable)
index 0000000..0271475
--- /dev/null
@@ -0,0 +1,4 @@
+
+body: //div[@id='content']//div[@id='mainBlogContentWrapper']//*[self::p or self::img or self::ul] | //div[@class='mainArticleIntro']
+
+date: //span[@class='date']
diff --git a/inc/3rdparty/site_config/standard/cnet.com.au.txt b/inc/3rdparty/site_config/standard/cnet.com.au.txt
new file mode 100755 (executable)
index 0000000..d5719d4
--- /dev/null
@@ -0,0 +1,17 @@
+title: //meta[@property="og:title"]/@content
+body: //div[contains(@class, 'postBody')]
+date: //div[@id='nameAndTime']/time
+author: //div[@id='nameAndTime']/span[@class='author']
+
+strip_id_or_class: image-credit
+strip_id_or_class: noAutolink
+strip_id_or_class: related
+strip_id_or_class: cite
+
+prune: no
+tidy: no
+
+# early end
+replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html>
+
+test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/
index 5973c50b7a9f0ee33c3316a4221d9037ef15411c..214fccebd3c0d03bdb3ce3de66ae236b34fcf734 100755 (executable)
@@ -2,7 +2,7 @@ title://h1
 
 author://div[@id="news-meta"]/a
 
-body://*[@id="main"]/div[1]
+body: //div[contains(@class, 'text-content')]
 
 strip://*[@id="main"]/div[2]
 strip://*[@id="main"]/div[3]
@@ -15,4 +15,4 @@ strip://img
 
 #figures are not displayed in instapaper...
 strip://figure | //figcaption
-test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/
\ No newline at end of file
+test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/
index 1f55e2cb87f504ba5181822e6a03cf1fdc7bfb54..7f20a4dac7781d4c30f2df4c2a47ea84317d8586 100755 (executable)
@@ -1,5 +1,8 @@
 title: //meta[@name='headline']/@content
-body://div[@id="drr-container"]
+date: //meta[@name='date']/@content
+author: //meta[@name='author']/@content
+body: //div[contains(@class, 'article')]
+body://div[@id="article_body"]
 
 strip_id_or_class: banner
 strip: //noscript
@@ -16,4 +19,4 @@ next_page_link://div[@id="next_page"]/a
 single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/'))
 
 test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware
-test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy
+test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/contrepoints.org.txt b/inc/3rdparty/site_config/standard/contrepoints.org.txt
new file mode 100755 (executable)
index 0000000..8a6a125
--- /dev/null
@@ -0,0 +1,21 @@
+# Contrepoints.org
+# As of 2015-04, it's a wordpress-powered website.
+
+title: //h1[contains(concat(' ',normalize-space(@class),' '),' page-title ')]//span[contains(concat(' ',normalize-space(@class),' '),' inner-text ')]
+date: //time[contains(concat(' ',normalize-space(@class),' '),' art-date ')]
+author: //h1[contains(concat(' ',normalize-space(@class),' '),' author-name ')]
+body: //article[contains(concat(' ',normalize-space(@class),' '),' plain-art ')]
+
+# no toolbar, meta, etc, but misses excerpt
+# body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ')]
+
+# Thus, we need to strip useless elements from the "plain-art"
+strip: //div[contains(concat(' ',normalize-space(@class),' '),' plain-post-topbar ')]
+strip: //div[contains(concat(' ',normalize-space(@class),' '),' single-type-block ')]
+strip: //header[contains(concat(' ',normalize-space(@class),' '),' entry-header ')]
+
+# And no pruning is needed because we stripped unwanted elements.
+prune: no
+
+test_url: http://www.contrepoints.org/2015/04/25/205709-leconomie-selon-ray-dalio
+test_url: http://www.contrepoints.org/2015/04/25/205734-huile-et-gaz-de-schiste-revolution-durable
\ No newline at end of file
index a4244097e9b7c636852c5dd988620c7f120dbe62..fc156f7b753d94bb2a8ead38b5e26dcfa47240f4 100755 (executable)
@@ -1,4 +1,4 @@
-body: //*[contains(@class,'body')]
+body: //div[contains(@class,'post-body')]
 date: //abbr[@class='published']
 
-test_url: http://www.cooper.com/journal/2012/08/2-weeks-left-to-win-your-way-to-the-woodstock-of-ux-coopers-ux-boot-camp.html/
\ No newline at end of file
+test_url: http://www.cooper.com/journal/2015/6/creating-personas
diff --git a/inc/3rdparty/site_config/standard/cwnp.com.txt b/inc/3rdparty/site_config/standard/cwnp.com.txt
new file mode 100755 (executable)
index 0000000..169fdf8
--- /dev/null
@@ -0,0 +1,14 @@
+title: //div[@class='entry-pad']//h2
+body: //div[contains(concat(' ',normalize-space(@class),' '),' entry-pad ')]
+strip: //h1
+strip: //p
+strip: //h2
+strip: //div[@class='clear']
+
+prune: no
+tidy: no
+
+autodetect_on_failure: no
+
+test_url: https://www.cwnp.com/wotd.php
+test_url: https://www.cwnp.com/qotd.php
index cd29a4d4334ec5d51ade46155023d97557ebcdf8..8535b19fa04f5e54750bc2c2fb4526b01005a032 100755 (executable)
@@ -7,6 +7,15 @@ strip_id_or_class: googleAds
 strip_id_or_class: digg-button
 strip_id_or_class: article-icon-links-container
 strip_id_or_class: clickToEnlarge
+strip_id_or_class: articleIconLinksContainer
+strip_id_or_class: related-carousel
+strip_id_or_class: reader-comments
+strip_id_or_class: most-watched
+strip_id_or_class: most-read
+
+find_string:blkBorder img-share
+replace_string: nothing
+
 tidy: no
 
 test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/dailytelegraph.com.au.txt b/inc/3rdparty/site_config/standard/dailytelegraph.com.au.txt
new file mode 100755 (executable)
index 0000000..571e811
--- /dev/null
@@ -0,0 +1,5 @@
+title: //h1[@class="heading"]
+author: //cite[@class='author']
+date: //li[contains(@class, 'date-and-time')]
+
+
diff --git a/inc/3rdparty/site_config/standard/deadspin.com.txt b/inc/3rdparty/site_config/standard/deadspin.com.txt
new file mode 100755 (executable)
index 0000000..e6ca16a
--- /dev/null
@@ -0,0 +1 @@
+http_header(user-agent): PHP/5.3
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/derbund.ch.txt b/inc/3rdparty/site_config/standard/derbund.ch.txt
new file mode 100755 (executable)
index 0000000..1363eff
--- /dev/null
@@ -0,0 +1,13 @@
+# Author: cirnod@gmail.com
+
+tidy: no
+prune: no
+
+body: //div[@id="article"]/h3 | //*[@id="mainContent"]
+
+# General Cleanup
+#strip_id_or_class: info_panel 
+
+
+# Try yourself
+test_url: http://www.derbund.ch/bern/nachrichten/Fossilienforscher-stehen-auf-Heavy-Metal/story/20919522
diff --git a/inc/3rdparty/site_config/standard/designbuildsource.com.au.txt b/inc/3rdparty/site_config/standard/designbuildsource.com.au.txt
new file mode 100755 (executable)
index 0000000..93d3507
--- /dev/null
@@ -0,0 +1,2 @@
+date: substring-after(//p[@class='post_date'], 'on')
+
index 85cc78e5fa440beb54fe4986c1d214cdef6447a9..b8788553267a91b8ced1220daad2885f5c467dcd 100755 (executable)
@@ -1,11 +1,9 @@
-#title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10)
-title: //div[contains(@class, 'SB_Title')]//a
-body: //div[contains(@class, 'STR_Image')]
-body: //*[contains(@class, 'SB_Content')]
+title: //a[@class="post-title"]/text()
+title: //meta[@name="twitter:title"]/@content
+body: //img[@class="img-responsive img-comic"]
 author: string('Scott Adams')
-date: //*[contains(@class, 'SB_Detail')]/text()[1]
-
+date: //meta[@property="article:publish_date"]/@content
 
 test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/
 test_url: http://dilbert.com/strips/comic/2013-10-22
-test_url: http://feed.dilbert.com/dilbert/daily_strip
\ No newline at end of file
+test_url: http://feed.dilbert.com/dilbert/daily_strip
index 5283a0cdea17950573bcd1a5e0c34eaaebae24ac..a2ad609b08af193bd174dedbfdba0df6e3e2340b 100755 (executable)
@@ -15,6 +15,9 @@ strip_id_or_class: hook
 strip_id_or_class: right
 strip_id_or_class: footer
 
+strip_id_or_class: ad-head
+strip_id_or_class: atc-share-title
+
 # Other news
 strip: //div[@id="mirrors"]
 
@@ -25,4 +28,5 @@ author: //div[@id="byline"]/div/p/strong
 date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11)
 
 test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade
-test_url: http://www.dn.se/m/rss/senaste-nytt
\ No newline at end of file
+test_contains: Ett tekniskt haveri tvingade
+test_url: http://www.dn.se/rss/senaste-nytt
diff --git a/inc/3rdparty/site_config/standard/economie.gouv.fr.txt b/inc/3rdparty/site_config/standard/economie.gouv.fr.txt
new file mode 100755 (executable)
index 0000000..b0db03c
--- /dev/null
@@ -0,0 +1,4 @@
+body: //div[contains(@class, 'txtVisu')]
+prune: no
+
+test_url: http://www.economie.gouv.fr/dgccrf/Publications/Vie-pratique/Fiches-pratiques/Assurance
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/entwickler.de.txt b/inc/3rdparty/site_config/standard/entwickler.de.txt
new file mode 100755 (executable)
index 0000000..316f399
--- /dev/null
@@ -0,0 +1,7 @@
+title: //h1[@class="post-title"]
+body: //section[@class="article-content"]
+author: //div[@class="post-bottom-meta"]/span[@class="post-author"]
+date: //div[@class="post-date"]/time/@datetime
+
+test_url: https://entwickler.de/online/mobile-welt-offline-welt-was-der-offline-first-ansatz-fuer-app-entwickler-heisst-140602.html
+test_url: https://entwickler.de/online/development/plex-docker-joomla-165345.html
diff --git a/inc/3rdparty/site_config/standard/explosm.net.txt b/inc/3rdparty/site_config/standard/explosm.net.txt
new file mode 100755 (executable)
index 0000000..f2d0a20
--- /dev/null
@@ -0,0 +1,4 @@
+body: //img[@id='main-comic']
+author: substring(//small[@class="author-credit-name"], 4)
+
+test_url: http://explosm.net/comics/3954/
index 26d4f90594043445744ee6d9627ac3b5b97eb4d7..2641a0b295aa937c07c3f273145e0ad10ef1e31d 100755 (executable)
@@ -1,12 +1,14 @@
 body: //div[@id='imagestage']
 body: //div[contains(@class, 'userContentWrapper')]
-
+body: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')]
 strip_id_or_class: commentable
+strip: //div[contains(@data-sigil, 'm-mentions-expand')]
 
 prune: no
 tidy: no
 
-# single_page_link: replace(substring-after(//noscript//meta[@http-equiv="refresh"]/@content, 'URL='), "&amp;", "&")
+single_page_link: concat("https://m.", substring-after(//link[@rel="alternate" and @media="handheld"]/@href, "//www."))
+if_page_contains: //link[@rel="alternate" and @media="handheld"]
 
 test_url: https://www.facebook.com/permalink.php?story_fbid=10154584776550183&id=294468630182
 test_contains: holding an extraordinary session in Brussels this month
index a641723721d6dae77074771278efc39fea879b75..bf8375ee27458fed38849a50e2171de51eee623c 100755 (executable)
@@ -1,16 +1,20 @@
-title: //h1
-author: //h5[@class='byline']//a
-date: //h5[@class='date']
-body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")]
-strip_id_or_class: article-top-wrapper
-strip_id_or_class: footer-message
-strip_id_or_class: print-logo
-strip: //cite
-strip://*[@class='timestamp']
-strip://div[@id='page_right']
-strip://section[@id='header_region']
-strip://h1[@class='node-title']
-strip://div[@class='node-submitted']
-strip_id_or_class: skipnav
+author: //div[@class='byline']//a
+date: //meta[@property='article:published_time']/@content
+body: //figure[@class='jumbotron'] | //div[@itemprop='body']
+
+prune: no
+
+#strip_id_or_class: article-top-wrapper
+#strip_id_or_class: footer-message
+#strip_id_or_class: print-logo
+#strip: //cite
+#strip://*[@class='timestamp']
+#strip://div[@id='page_right']
+#strip://section[@id='header_region']
+#strip://h1[@class='node-title']
+#strip://div[@class='node-submitted']
+#strip_id_or_class: skipnav
+
 test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity
-test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day
\ No newline at end of file
+test_contains: Some of you may have tried to reach me this morning
+test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day
diff --git a/inc/3rdparty/site_config/standard/fok.nl.txt b/inc/3rdparty/site_config/standard/fok.nl.txt
new file mode 100755 (executable)
index 0000000..012f07d
--- /dev/null
@@ -0,0 +1,4 @@
+# skip cookie warning
+single_page_link: concat(//form/@action, '?allowcookies=yes')
+
+test_url: http://fok.nl/687116
\ No newline at end of file
index c1bd2bac4b7cf77d680c3cac9b60958312183afa..ec9e580702a6aa64ca8eb876dd66a643524ada0f 100755 (executable)
@@ -6,4 +6,5 @@ strip: //div[contains(@class,"aside")]
 # remove some SharePoint webpart label junk
 strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"]
 strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"]
-test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx
\ No newline at end of file
+test_url: https://forsvaret.no/aktuelt/historisk-medaljeutdeling
+test_contains: Samarbeidet med Marinen har vært en sann glede
diff --git a/inc/3rdparty/site_config/standard/france24.com.txt b/inc/3rdparty/site_config/standard/france24.com.txt
new file mode 100755 (executable)
index 0000000..6356e04
--- /dev/null
@@ -0,0 +1,14 @@
+# Generated by FiveFilters.org's web-based selection tool
+# Place this file inside your site_config/custom/ folder
+# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.france24.com%2Fen%2F20150427-togo-gnassingbe-poised-extend-power-election%2F
+
+body: //article[contains(concat(' ',normalize-space(@class),' '),' article-long ')]//div[contains(concat(' ',normalize-space(@class),' '),' bd ')]
+title: //h1[@class="title"] 
+author://p[@class="author"]
+date://p[@class="modification"]
+
+find_string: <p class="modification">Latest update : 
+replace_string: <p class="modification">
+
+
+test_url: http://www.france24.com/en/20150427-togo-gnassingbe-poised-extend-power-election/
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/galwayindependent.com.txt b/inc/3rdparty/site_config/standard/galwayindependent.com.txt
new file mode 100755 (executable)
index 0000000..d45b7ac
--- /dev/null
@@ -0,0 +1,3 @@
+title: //div[@class='leftCol']/h1
+
+prune: no
index 73f8342fa8ae549ac969ce8dc20fda1e0f822a42..227d39ac24ffdb3bfba069c765af8bd8c7b9fc12 100755 (executable)
@@ -1,5 +1,7 @@
 title: //meta[@property="og:title"]/@content
 body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')]
+author: //span[contains(concat(' ',normalize-space(@class),' '),' author ')]
+date: //header[@id='gbArticleHeader']//div//time/@datetime
 
 prune: no
 
@@ -7,4 +9,4 @@ strip_id_or_class: noprint
 strip: //div[@id='gbNewsTextContent']/following-sibling::*
 
 test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video
-test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible
\ No newline at end of file
+test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible
index 9bc5613a9697627a8cf209d6f25cbf366c0daeb1..27e4b4bb13cc14b3518bc124cf07e6b2b78d44ae 100755 (executable)
@@ -3,4 +3,6 @@ body: //div[@class="post-body"]
 # Remove 'content is restricted'
 strip: //div[@id='agegate_IDHERE']
 
+http_header(user-agent): PHP/5.3
+
 test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/getpocket.com.txt b/inc/3rdparty/site_config/standard/getpocket.com.txt
new file mode 100755 (executable)
index 0000000..e6ca16a
--- /dev/null
@@ -0,0 +1 @@
+http_header(user-agent): PHP/5.3
\ No newline at end of file
index f11b7b426d3e01591d0f9832c09f099e31ad5e3e..902078627e3a0fb140ef334840e710d21edb71c6 100755 (executable)
@@ -1,4 +1,6 @@
+body: //div[@class="highlight"]/pre
 
-title: //div[contains(@class,'gist-description')]
-body: //div[contains(@class,'blob-wrapper')]
-test_url: https://gist.github.com/staltz/868e7e9bc2a7b8c1f754
+prune: no
+tidy: no
+
+test_url: https://gist.github.com/1258908
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/gizmodo.com.au.txt b/inc/3rdparty/site_config/standard/gizmodo.com.au.txt
new file mode 100755 (executable)
index 0000000..9dbfc15
--- /dev/null
@@ -0,0 +1,8 @@
+body: //div[@id='content_post' or @class="post-body" or contains(@class, 'illustration top')]
+author: (//cite//span[@class="plus-icon"])[1]
+date: //span[@class="date"]
+date: //time
+
+prune: no
+
+test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science
index e73ec9d2a2c89ae4d284efeee44fff4455119631..535041cddda4ae4db4579a6679dafcd1b5772915 100755 (executable)
@@ -6,6 +6,8 @@ date: //span[@class="date"]
 
 prune: no
 
+http_header(user-agent): PHP/5.3
+
 test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science
 test_url: http://gizmodo.com/what-van-goghs-paintings-would-look-like-if-they-came-874035680
 test_url: http://gizmodo.com/vip.xml
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/globalgrind.com.txt b/inc/3rdparty/site_config/standard/globalgrind.com.txt
new file mode 100755 (executable)
index 0000000..e2f4e23
--- /dev/null
@@ -0,0 +1,6 @@
+body: //div[contains(@class, 'content-body')]
+
+prune: no
+
+test_url: http://globalgrind.com/2015/04/26/listen-jeremih-featuring-chance-the-rapper-the-social-experiment-planes-remix-new-music/
+test_contains: The Chicago rapper has made a name for himself
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/gocomics.com.txt b/inc/3rdparty/site_config/standard/gocomics.com.txt
new file mode 100755 (executable)
index 0000000..212c02d
--- /dev/null
@@ -0,0 +1,5 @@
+body: //a[@class="photo"]/img[@class="strip"]
+author: //meta[@name="author"]/@content
+date: //meta[@property="gocomics:publish_date"]/@content
+
+test_url: http://www.gocomics.com/garfield/2015/06/13
diff --git a/inc/3rdparty/site_config/standard/help.fivefilters.org.txt b/inc/3rdparty/site_config/standard/help.fivefilters.org.txt
new file mode 100755 (executable)
index 0000000..70a7d15
--- /dev/null
@@ -0,0 +1,2 @@
+title: //div[@class="title"]/h3
+date: substring-after(//div[@class="meta"], ": ")
diff --git a/inc/3rdparty/site_config/standard/heraldsun.com.au.txt b/inc/3rdparty/site_config/standard/heraldsun.com.au.txt
new file mode 100755 (executable)
index 0000000..b0ce56c
--- /dev/null
@@ -0,0 +1,12 @@
+#body: //div[@class='story-body']
+body: //div[contains(@class, 'story-body')]
+title: //div[@class='story-headline']//h1
+author: //cite[contains(@class, 'author')]
+date: //span[@class='datestamp']
+
+strip_id_or_class: story-info
+strip: //div[contains(@class, 'story-promo')]
+strip: //div[contains(@class, 'story-related')]
+
+prune: no
+tidy: no
diff --git a/inc/3rdparty/site_config/standard/hiiraan.com.txt b/inc/3rdparty/site_config/standard/hiiraan.com.txt
new file mode 100755 (executable)
index 0000000..cf1f794
--- /dev/null
@@ -0,0 +1,10 @@
+# Generated by FiveFilters.org's web-based selection tool
+# Place this file inside your site_config/custom/ folder
+# Source: http://siteconfig.fivefilters.org/grab.php?url=http%3A%2F%2Fwww.hiiraan.com%2Fnews%2F2014%2FDec%2Fwararka_maanta20-89428.htm
+
+body: //div[contains(concat(' ',normalize-space(@class),' '),' single ')]//div[contains(concat(' ',normalize-space(@class),' '),' description ')]
+
+prune: no
+
+test_url: http://www.hiiraan.com/news/2014/Dec/wararka_maanta20-89428.htm
+test_url: http://rss.hiiraan.com/wararka_maanta_rss.xml
\ No newline at end of file
index af7422097489ddc08e9814eda427355d8c81ed00..6711a0a2796e2aa4913e35169c25a3100bac2ca2 100755 (executable)
@@ -1,9 +1,16 @@
 title: //meta[@property='og:title']/@content
-body: //div[contains(@class, 'articleContent')]
+body: //img[contains(@class, 'FirstImage')] | //div[contains(@class, 'articleContent')]
 date: //meta[@property='article:published_time']/@content
 author: //div[@id='main']//div[@class='byline']//span[@class='authorName']
 
 strip_id_or_class: RelatedArtTag
 
+strip: //h5[contains(., 'READ MORE:')]
+strip: //h5[contains(., 'Read more:')]
+
 tidy: no
-test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html
\ No newline at end of file
+test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html
+test_url: http://www.independent.co.uk/voices/comment/robert-fisk-on-the-cia-torture-report-once-again-language-is-distorted-in-order-to-hide-us-state-wrongdoing-9924501.html
+test_contains: Thank God for Noam Chomsky.
+
+test_url: http://www.independent.co.uk/news/uk/rss
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/io9.com.txt b/inc/3rdparty/site_config/standard/io9.com.txt
new file mode 100755 (executable)
index 0000000..e6ca16a
--- /dev/null
@@ -0,0 +1 @@
+http_header(user-agent): PHP/5.3
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/ippmedia.com.txt b/inc/3rdparty/site_config/standard/ippmedia.com.txt
new file mode 100755 (executable)
index 0000000..99f25dc
--- /dev/null
@@ -0,0 +1,4 @@
+title: //div[@class="content_title"]//h2
+author: substring-after(//div[@class="byline"], "By ")
+date: //div[@class="publish_date"]
+strip: //div[@class="read_image_box"]
diff --git a/inc/3rdparty/site_config/standard/itnews.com.au.txt b/inc/3rdparty/site_config/standard/itnews.com.au.txt
new file mode 100755 (executable)
index 0000000..47cbb0f
--- /dev/null
@@ -0,0 +1,5 @@
+title: //h1[@class='article-header']
+body: //div[@class='body-content']
+author: //span[@class='author-byline']/a[contains(@id, 'Author')]
+
+strip: //span[contains(@id, 'Article_SourceLabel')]
index fc2eef8edcef28da3cf8c2b535512cd5a08946c9..7823dbd75a6c6fe459f08f8a919c05e1326250cf 100755 (executable)
@@ -1,2 +1,5 @@
 author: //span[@class='plus-icon']
+
+http_header(user-agent): PHP/5.3
+
 test_url: http://jalopnik.com/5892124/1955-porsche-550-spyder-sells-for-record-3685-million/
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/jezebel.com.txt b/inc/3rdparty/site_config/standard/jezebel.com.txt
new file mode 100755 (executable)
index 0000000..e6ca16a
--- /dev/null
@@ -0,0 +1 @@
+http_header(user-agent): PHP/5.3
\ No newline at end of file
index be439d75f9b41ccb23d9bcc9c14324d9687ae0ce..61ccbc46ab22aef983abf1e4906c6d798d5f26e1 100755 (executable)
@@ -1,2 +1,5 @@
 author: //span[@class="plus-icon"]
+
+http_header(user-agent): PHP/5.3
+
 test_url: http://kotaku.com/5920211/save-the-furries-on-your-wii-in-this-weeks-nintendo-download
\ No newline at end of file
index e720e377f13bd97630b3410f374a5ab23c53f4c5..7e1d12d72c911f77eaa8077d7e19a41239fa3617 100755 (executable)
@@ -2,7 +2,8 @@ title: //meta[@name='title']/@content
 author: //span[@class='sign']//a[@class='journaliste']
 author: //meta[@name='author']/@content
 body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte']
-date: //time[@pubdate]/@datetime
+date: //li[contains(concat(' ',normalize-space(@class),' '),' fig-date-pub ')]//time
 prune: no
 test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php
-test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php
\ No newline at end of file
+test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php
+test_url: http://www.lefigaro.fr/social/2015/03/10/09010-20150310ARTFIG00312-encore-une-annee-noire-pour-l-emploi-salarie.php
index ec97f06cb16946050ff097a6b31b2fe312c0d72a..330c4e785d79c560b6c4b3275721b147d8fdd9a1 100755 (executable)
@@ -42,6 +42,12 @@ strip: //p[@class="arrow"]
 
 # Remove "track" image from article body
 strip: //img[@alt="track"]
+
+# Remove hidden URLs
+strip: //a[@x-inset="hidden"]
+
+http_header(user-agent): PHP/5.3
+
 test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos
 test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
-test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314
\ No newline at end of file
+test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314
diff --git a/inc/3rdparty/site_config/standard/linuxjournal.com.txt b/inc/3rdparty/site_config/standard/linuxjournal.com.txt
new file mode 100755 (executable)
index 0000000..c5e6446
--- /dev/null
@@ -0,0 +1,6 @@
+body: //div[@class='content-area']
+next_page_link: //a[@title='Go to next page']
+author: //a[@title='View user profile.']
+strip_id_or_class: comments
+
+test_url: http://www.linuxjournal.com/content/be-mechanicwith-android-and-linux
index d1ff0b43789438e84fc3382f25379bab03c07b7e..7037c64b06d5cd70c3f9fdd3c3eda5ec2c744aea 100755 (executable)
@@ -3,6 +3,12 @@ body: //div[@class="story-body"]
 date: //p[@class='date']/strong
 author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By')
 
+find_string: http://ichef.bbci.co.uk/news/200/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
+find_string: http://ichef.bbci.co.uk/news/304/
+replace_string: http://ichef.bbci.co.uk/news/624/
+
 strip: //div[@class="story-inner"]/div[@class="byline"]
 
 test_url: http://m.bbc.co.uk/news/science-environment-19144464
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/m.facebook.com.txt b/inc/3rdparty/site_config/standard/m.facebook.com.txt
new file mode 100755 (executable)
index 0000000..1b9c1b3
--- /dev/null
@@ -0,0 +1,12 @@
+body: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')]
+
+title: //div[@id='m_story_permalink_view' or contains(@data-sigil, 'm-story-view')]//h3
+
+strip_id_or_class: commentable
+strip: //*[contains(@data-sigil, 'm-mentions-expand') or contains(@data-sigil, 'story-popup-context') or contains(@data-sigil, 'share') or contains(@data-sigil, 'translate')]
+
+prune: no
+tidy: no
+
+test_url: https://www.facebook.com/permalink.php?story_fbid=10154584776550183&id=294468630182
+test_contains: holding an extraordinary session in Brussels this month
diff --git a/inc/3rdparty/site_config/standard/m.theregister.co.uk.txt b/inc/3rdparty/site_config/standard/m.theregister.co.uk.txt
new file mode 100755 (executable)
index 0000000..64cb1c3
--- /dev/null
@@ -0,0 +1,4 @@
+strip: //div[@class='wptl btm']
+body: //div[@id='article']//h2 | //div[@id='body']
+
+test_url: http://m.theregister.co.uk/2015/07/06/geeks_guide_spaceguard_center/
diff --git a/inc/3rdparty/site_config/standard/marketingmag.com.au.txt b/inc/3rdparty/site_config/standard/marketingmag.com.au.txt
new file mode 100755 (executable)
index 0000000..910741f
--- /dev/null
@@ -0,0 +1 @@
+strip: //h3[@class="related-posts"]
index 9e9c6895167d59d35f4457516d48353e099a9f91..5ab3ac5ed29c76087219191eaf2cf4d1471633b5 100755 (executable)
@@ -1,4 +1,4 @@
-body: //div[contains(@class, 'postContent-inner')]
+body: //div[contains(@class, 'postArticle-content')]
 strip_id_or_class: supplementalPostContent
 
 prune: no
index 6a669253cccb5283bc478636192120386b7f3df6..af450b5e91058fbcbd9db4d50cd7b9bfc38a748e 100755 (executable)
@@ -3,10 +3,5 @@ body: //div[@style="float:left;width:740px;"]
 
 tidy: no
 
-test_url: http://www.menshealth.com.sg/fitness/mh-picks-under-armour-clutchfit-nitro-mid-cleats
-test_contains: These cleats are made for one thing
-
-test_url: http://www.menshealth.com.sg/fitness/top-10-fat-burning-bodyweight-moves-you-can-do-10-minutes
-test_contains: let this workout fool you
-
-test_url: http://www.menshealth.com.sg/fitness/feed
\ No newline at end of file
+# broken feed?
+test_url: http://www.menshealth.com.sg/fitness/feed
diff --git a/inc/3rdparty/site_config/standard/mitchellrepublic.com.txt b/inc/3rdparty/site_config/standard/mitchellrepublic.com.txt
new file mode 100755 (executable)
index 0000000..fae858a
--- /dev/null
@@ -0,0 +1,3 @@
+body: //div[@class='section']
+strip_id_or_class: mediumtxt
+strip: //strong[contains
index c60252ef52cb68afbed80b6d25c691a4bc4d83e4..ef1ce98d67a5c64b9f1dc2279b45540b3d7ac839 100755 (executable)
@@ -1,4 +1,7 @@
 title: //h1[contains(@class, 'headline')]
 body: //article[contains(@class, 'full-art')]
+date: //meta[@name="pdate"]/@content
+author: //meta[@name="byl"]/@content
+
 strip_id_or_class: image-credit
 test_url: http://mobile.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/moneymanagement.com.au.txt b/inc/3rdparty/site_config/standard/moneymanagement.com.au.txt
new file mode 100755 (executable)
index 0000000..9892f66
--- /dev/null
@@ -0,0 +1,2 @@
+date: //span[@class="publishdate"]//time
+author: //span[@class="byline"]
diff --git a/inc/3rdparty/site_config/standard/nbnnews.com.au.txt b/inc/3rdparty/site_config/standard/nbnnews.com.au.txt
new file mode 100755 (executable)
index 0000000..a240987
--- /dev/null
@@ -0,0 +1,3 @@
+body: //div[contains(concat(' ',normalize-space(@class),' '),' entry ') or contains(@class, 'single-post-thumb')]
+test_url: http://www.nbnnews.com.au/2015/03/24/lismore-man-will-attempt-to-run-around-australia/
+test_url: http://www.nbnnews.com.au/category/nthn-rivers-sport/feed/
diff --git a/inc/3rdparty/site_config/standard/news.com.au.txt b/inc/3rdparty/site_config/standard/news.com.au.txt
new file mode 100755 (executable)
index 0000000..57b89a5
--- /dev/null
@@ -0,0 +1,3 @@
+body: //div[@class='story-body']
+prune: no
+tidy: no
diff --git a/inc/3rdparty/site_config/standard/news.menshealth.com.txt b/inc/3rdparty/site_config/standard/news.menshealth.com.txt
new file mode 100755 (executable)
index 0000000..a07fdac
--- /dev/null
@@ -0,0 +1 @@
+strip: //span[@style="color: #cf1206;"]
diff --git a/inc/3rdparty/site_config/standard/news.ninemsn.com.au.txt b/inc/3rdparty/site_config/standard/news.ninemsn.com.au.txt
new file mode 100755 (executable)
index 0000000..ddd6406
--- /dev/null
@@ -0,0 +1,3 @@
+strip: //a[@class="contact"]
+strip: //div[@class="article-media video-item"]
+date: //div[@class='display-date']
diff --git a/inc/3rdparty/site_config/standard/parool.nl.txt b/inc/3rdparty/site_config/standard/parool.nl.txt
new file mode 100755 (executable)
index 0000000..2ceef94
--- /dev/null
@@ -0,0 +1,7 @@
+#bypass cookie check
+single_page_link: //a[contains(@href, '/acceptCookieCheck.do?url=')]
+
+test_url: http://www.parool.nl/parool/nl/4/AMSTERDAM/article/detail/4042734/2015/05/29/MRSA-bacterie-niet-verder-verspreid-in-Bijlmerbajes.dhtml
+test_contains: De twee gevangenen die
+
+test_url: http://www.parool.nl/amsterdam/rss.xml
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/perthnow.com.au.txt b/inc/3rdparty/site_config/standard/perthnow.com.au.txt
new file mode 100755 (executable)
index 0000000..b0ce56c
--- /dev/null
@@ -0,0 +1,12 @@
+#body: //div[@class='story-body']
+body: //div[contains(@class, 'story-body')]
+title: //div[@class='story-headline']//h1
+author: //cite[contains(@class, 'author')]
+date: //span[@class='datestamp']
+
+strip_id_or_class: story-info
+strip: //div[contains(@class, 'story-promo')]
+strip: //div[contains(@class, 'story-related')]
+
+prune: no
+tidy: no
diff --git a/inc/3rdparty/site_config/standard/planetsave.com.txt b/inc/3rdparty/site_config/standard/planetsave.com.txt
new file mode 100755 (executable)
index 0000000..d6f34e2
--- /dev/null
@@ -0,0 +1 @@
+strip_id_or_class: author-bio-box
diff --git a/inc/3rdparty/site_config/standard/presseportal.de.txt b/inc/3rdparty/site_config/standard/presseportal.de.txt
new file mode 100755 (executable)
index 0000000..703806d
--- /dev/null
@@ -0,0 +1,11 @@
+body: //div[contains(concat(' ',normalize-space(@class),' '),' story-text ')]
+
+strip_id_or_class: news-bodycopy
+
+parser: html5php
+tidy: no
+
+test_url: http://www.presseportal.de/pm/103258/2930232/felix-neureuther-vor-der-ski-wm-ich-denke-von-rennen-zu-rennen
+test_url: http://www.presseportal.de/pm/66749/2933779/koelner-stadt-anzeiger-bahnmitarbeiter-werden-in-nrw-immer-haeufiger-angegriffen-zahl-der/rss
+test_contains: kleineren Bahnhöfen installieren und erhofft
+test_url: http://www.presseportal.de/rss/presseportal.rss2
index 732d12d7cefda1b0683e6eb5322ac2d41b2c5457..f2b75a998c9c93d29ce335dcb5682b806cd0abd0 100755 (executable)
@@ -1,8 +1,10 @@
 tidy: no
 prune: no
-body: //div[contains(@class, 'main_col')]
-title: //h1
+body: //div[contains(concat(' ',normalize-space(@class),' '),' Answer ')] | //div[contains(concat(' ',normalize-space(@class),' '),' header ')] | //div[contains(concat(' ',normalize-space(@class),' '),' AnswerWikiArea ')] | //hr
+#body: //div[contains(@class, 'main_col')]
 
+strip_id_or_class: AnswerFooter
+strip_id_or_class: ActionBar
 strip_id_or_class: hidden
 strip_id_or_class: item_action_bar
 strip_id_or_class: answer_voters
@@ -13,5 +15,15 @@ strip_id_or_class: view_tag
 strip_id_or_class: include_details
 strip_id_or_class: sig_edit
 strip_id_or_class: profile_photo_img
+strip_id_or_class: question_text_icons
 
-test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life
\ No newline at end of file
+# insert hr between answers
+find_string: <div class="Answer" 
+replace_string: <hr /><div class="Answer" 
+
+test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life
+test_contains: Please provide a specific practical/measurable action-based everyday
+test_contains: Exercise every day
+
+test_url: http://www.quora.com/What-is-the-greatest-illusion-in-life
+test_contains: What is the greatest illusion in life?
index c3f2d3e5c510fb811e65e90e445fc19506a73262..ba342c7cd2aa0fe1f6f85cb28b80d1e29719cc5b 100755 (executable)
@@ -7,9 +7,7 @@ author: //p[@class="tagline"]/a
 # this doesn't work for some reason...?
 date: //p[@class="tagline"]//@datetime
 
-#body: (//div[contains(@class, 'noncollapsed')]//div[contains(@class, 'usertext-body')])[1]
-
-body: //div[contains(concat(' ',normalize-space(@class),' '),' usertext-body ') and (contains(concat(' ',normalize-space(@class),' '),' may-blank-within ')) and (contains(concat(' ',normalize-space(@class),' '),' md-container '))]//div[contains(concat(' ',normalize-space(@class),' '),' md ')]
+body: (//div[contains(@class, 'noncollapsed')]//div[contains(@class, 'usertext-body')])[1]
 
 strip_id_or_class: tagline
 strip_id_or_class: unvotable-message
@@ -20,4 +18,4 @@ single_page_link: //p[@class="title"]/a[contains(@href, 'http://')]
 
 test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/
 test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/
-test_url: http://www.reddit.com/r/WritingPrompts/comments/2786lw/wp_in_a_world_where_puns_are_illegal_one_man/chybk8e
+test_url: http://www.reddit.com/r/WritingPrompts/comments/2786lw/wp_in_a_world_where_puns_are_illegal_one_man/chybk8e
\ No newline at end of file
index f8c9541f3eca675542d7ae3b1cb7787aa4411a02..83342cb787929ef25acd43ec3f47b14fe3a890b3 100755 (executable)
@@ -1,8 +1,10 @@
 title: //h2
 
 strip: //div[ contains(@class, 'respond') ]  |  //h2  |  //h1
+strip_id_or_class: social
+strip_id_or_class: dd_post_share
 
 date: substring-after(//p[@class='info'], ' on ')
 
 author: //p[@class='info']//a
-test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/
\ No newline at end of file
+test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/
diff --git a/inc/3rdparty/site_config/standard/saadaalnews.net.txt b/inc/3rdparty/site_config/standard/saadaalnews.net.txt
new file mode 100755 (executable)
index 0000000..b9ce04e
--- /dev/null
@@ -0,0 +1,11 @@
+body: //div[contains(@class, 'section-content-left')]
+
+strip_id_or_class: related
+strip_id_or_class: nocontent
+strip_id_or_class: comment
+strip_id_or_class: widget
+strip_id_or_class: respond
+strip: //h3[.='Comments']
+strip: //p[.='comments']
+
+test_url: http://saadaalnews.net/?p=42624
diff --git a/inc/3rdparty/site_config/standard/smh.com.au.txt b/inc/3rdparty/site_config/standard/smh.com.au.txt
new file mode 100755 (executable)
index 0000000..f647f81
--- /dev/null
@@ -0,0 +1,14 @@
+body: //div[@id='content']
+title: //h1[@class='cN-headingPage']
+author: //h3[@class='authorName']
+date: //dd[@class='updated dtstamp']
+
+strip: //ul[@class='social sponsored cfix']
+strip: //div[contains(@class, 'hiddenVisually')]
+strip: //dd[@class='updated dtstamp']
+strip: //h3[@class='authorName']
+strip: //ul[@class='social  cfix']
+strip: //div[contains(@id, 'adspot')]
+
+strip: //div[contains(@class, 'overlayPlayCountdown')]
+strip: //div[@class='fdVideoWof']//span[@class='gone']
diff --git a/inc/3rdparty/site_config/standard/smh.drive.com.au.txt b/inc/3rdparty/site_config/standard/smh.drive.com.au.txt
new file mode 100755 (executable)
index 0000000..463fd88
--- /dev/null
@@ -0,0 +1,13 @@
+body: //div[@id='content']
+title: //h1[@class='cN-headingPage']
+author: //h3[@class='authorName']
+date: //dd[@class='updated dtstamp']
+
+strip: //ul[@class='social sponsored cfix']
+strip: //div[contains(@class, 'hiddenVisually')]
+strip: //dd[@class='updated dtstamp']
+strip: //h3[@class='authorName']
+strip: //ul[@class='social  cfix']
+strip: //div[contains(@id, 'adspot')]
+
+test_url: http://smh.drive.com.au/roads-and-traffic/driver-distraction-responsible-for-more-car-crashes-than-alcohol-20130503-2iyg0.html
index 3e8fee9578a22dc79e16093274ec581b305ce20c..fc479c2a81fcdbffc0d275e2cee59245f7ef52a6 100755 (executable)
@@ -7,6 +7,9 @@ body://div[@id = 'article-body']
 # full content
 single_page_link://td/li[@class = 'article-singlepage']/a
 
+# continue link
+single_page_link: //a[@id='continue-btn']
+
 # caption clean up
 wrap_in(i)://span[@class='articleImageCaptionwide']
 move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p
@@ -17,4 +20,4 @@ strip://p[@id = 'articlePaginationWrapper']
 strip://ul[contains(@class, 'cat-breadcrumb')]
 strip://div [@class= 'viewMorePhotos']
 
-test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html
\ No newline at end of file
+test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html
diff --git a/inc/3rdparty/site_config/standard/snip.ly.txt b/inc/3rdparty/site_config/standard/snip.ly.txt
new file mode 100755 (executable)
index 0000000..4e80fca
--- /dev/null
@@ -0,0 +1,3 @@
+single_page_link: //meta[@property="og:url"]/@content
+
+test_url: http://snip.ly/qa1R
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/soundcity.tv.txt b/inc/3rdparty/site_config/standard/soundcity.tv.txt
new file mode 100755 (executable)
index 0000000..c26b9f9
--- /dev/null
@@ -0,0 +1,3 @@
+strip_id_or_class: sharing
+
+test_url: http://soundcity.tv/feed/
index 413e0155a6671d162d450a9af56dd39572bdb18d..7b7b1752423ca5807e93a3b76e95d14330494153 100755 (executable)
@@ -8,6 +8,9 @@
 # - Fixed single_page_link
 # - Included intro text in single page view
 # - Added body in default view
+# stesie@
+# - removed copyright box
+# - removed "print more" box
 
 # set body
 tidy: no
@@ -15,6 +18,7 @@ tidy: no
 body: //div[@id="spArticleContent"]
 # body in default view
 body: //div[@id="spArticleSection"]
+body: //div[contains(@class, 'article-section')] | //div[@id='js-article-top-wide-asset'] | //p[contains(@class, 'article-intro')] | //div[contains(@class, 'js-module-box-image')]
 # body in "Fotostrecke"
 body: //div[@id="spBigaContent"]
 
@@ -25,6 +29,8 @@ strip: //div[@id="spArticleContent"]/h3
 # set date in "Fotostrecke"
 date: //div[@id="spBigaDatum"]
 
+# title in default view
+title: //h2[contains(@class, 'article-title')]
 #set title in single page view
 title: //div[@id='spArticleContent']/h2
 # strip title
@@ -49,7 +55,7 @@ strip: //*/div[@class='spCredit']/following-sibling::p
 strip: //div[@class='spMInline']
 
 # remove photogalleries and extras
-strip: //div[@class='spPhotoGallery']
+strip: //div[contains(@class, 'spPhotoGallery')]
 strip: //div[@class='spPhotoGallery']/following-sibling::br
 strip: //div[@class='spAssetAlignleft']
 strip: //div[contains(@class,'spAsset')]
@@ -67,9 +73,24 @@ strip: //div[@id='spBigaLatestEntries']
 strip: //div[contains(@class, 'spBigaNavi')]
 strip: //div[@class='spDottedLine']
 
+strip: //div[@class='asset-box article-print-more']
+strip: //div[@class='article-copyright']
+strip: //span[@class='image-buttons']
+
 # Use link to print article for single page view
 single_page_link: //a[contains(@href, '-druck')]
+if_page_contains: //div[contains(@class, 'multi-pager-control')]
+
+# Clean up title in print view
+find_string: <title>Druckversion - 
+replace_string: <title>
 
 # use next link in "Fotostrecke"
 next_page_link: //a[@class='spBigaControlForw']
-test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html
\ No newline at end of file
+test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html
+
+# regular article
+test_url: http://www.spiegel.de/wirtschaft/soziales/griechenland-was-den-griechischen-buergern-nun-droht-a-1042682.html
+
+# multipage article
+test_url: http://www.spiegel.de/spiegel/a-710880.html
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/srf.ch.txt b/inc/3rdparty/site_config/standard/srf.ch.txt
new file mode 100755 (executable)
index 0000000..d07a905
--- /dev/null
@@ -0,0 +1,24 @@
+# Author: cirnod@gmail.com
+
+tidy: no
+prune: no
+
+body: //div[@id="article-content"]/p | //div[@class="main-article-content clearfix"]
+
+# General Cleanup
+strip_id_or_class: offscreen
+strip_id_or_class: video-description
+strip_id_or_class: v2 big-video
+strip_id_or_class: module smb freetext
+strip_id_or_class: asset span3
+strip_id_or_class: module smb related-links
+
+# fix image-galleries
+strip_id_or_class: module lightbox-gallery image hide
+replace_string(width="624"): width="100%"
+replace_string(height="468"): height="%"
+
+# Try it yourself
+test_url: http://www.srf.ch/news/wirtschaft/weltbank-korrigiert-konjunktur-erwartungen-nach-unten
+test_url: http://www.srf.ch/news/wirtschaft/ural-statt-alpen-russische-touristen-bleiben-zuhause
+test_url: http://www.srf.ch/news/international/zwei-schweizer-bei-blutigem-attentat-in-mali-verletzt
\ No newline at end of file
index 74b8d4511abca1d29d3aa6276261178f3291df69..26e056057df7eeb58761f558636918b0a14fd48d 100755 (executable)
@@ -1,6 +1,6 @@
 # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...
 
-single_page_link: //a[ contains( @href, "/2.220/" ) ]
+single_page_link: //li[@id="article-sidebar-action-print"]/@data-clickurl
 
 body: //article[@id="sitecontent"]/section[@class="body"]
 author: //address[@class="author"]
diff --git a/inc/3rdparty/site_config/standard/sunshinecoastdaily.com.au.txt b/inc/3rdparty/site_config/standard/sunshinecoastdaily.com.au.txt
new file mode 100755 (executable)
index 0000000..bf5e918
--- /dev/null
@@ -0,0 +1,10 @@
+body: //section//article//p
+
+strip: //aside
+strip: //div[@class='margin-top-15']
+strip: //p[@class='tags']
+
+author: //span[@class='byline']//ul[@class='piped']//li[1]
+date: //span[@class='byline']//ul[@class='piped']//li[2]
+
+parser: html5lib
index f67637d28f95f16e87e20256d92e078660593d00..f194271f5467a606377b7030f753a2fc87d40dfa 100755 (executable)
@@ -1,6 +1,6 @@
 # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...
 
-single_page_link: //a[ contains( @href, "/2.220/" ) ]
+single_page_link: //li[@id="article-sidebar-action-print"]/@data-clickurl
 
 body: //article[@id="sitecontent"]/section[@class="body"]
 author: //address[@class="author"]
diff --git a/inc/3rdparty/site_config/standard/tagesanzeiger.ch.txt b/inc/3rdparty/site_config/standard/tagesanzeiger.ch.txt
new file mode 100755 (executable)
index 0000000..45c5cd0
--- /dev/null
@@ -0,0 +1,14 @@
+# Author: cirnod@gmail.com
+
+tidy: no
+prune: no
+
+body: //div[@id="article"]/h3 | //*[@id="mainContent"]
+
+# General Cleanup
+#strip_id_or_class: info_panel 
+
+
+# Try it yourself
+test_url: http://www.tagesanzeiger.ch/zuerich/stadt/Nach-spektakulaerer-Abseilaktion-verhaftet/story/18039895
+test_url: http://www.tagesanzeiger.ch/ausland/naher-osten-und-afrika/IS-zerstoert-auch-das-antike-Hatra/story/19865699
index be76cd0544c5904cdd2705bd407b0f4940c30595..ba3b1d3b425c95a88b4b9e6a3904ba97f1224d39 100755 (executable)
@@ -1,23 +1,14 @@
-title://h1[1]
+body: //div[contains(@class, 'sectionArticle')]//div[contains(concat(' ',normalize-space(@class),' '),' box ')]
 
-author: substring-after(//em, 'Von ')
-author:string('tagesschau.de')
+strip_id_or_class: infokasten
+strip_id_or_class: teaserImTeaser
+strip_id_or_class: Comments
+strip_id_or_class: mediaInfo
+strip: //div[contains(@class, 'mediaCon')]//iframe
 
-date:substring-after(//div[@class='standDatum'], 'Stand: ')
+prune: no
 
-body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')]
+test_url: http://www.tagesschau.de/ausland/snowden-dateien-entschluesselung-101.html
+test_contains: Snowden hatte zunächst für
 
-strip://h1[1]
-strip: //div[contains(@class, 'directLinks')]
-strip: //div[contains(@class, 'zitatBox')]
-strip: //div[contains(@class, 'teaserBox metaBlock')]
-strip: //*[contains(@class, 'inv')]
-strip: //span[@class='imgSubline']
-strip: //*[contains(@class, 'topline')][1]
-strip: //div[@id='rightCol'][1]
-strip: //div[@id="footer"][1]
-strip: //div[@class="fPlayer"] 
-strip: //div[@id='seitenanfang']
-strip: //div[@class='standDatum']
-strip: //em
-test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html
\ No newline at end of file
+test_url: http://www.tagesschau.de/xml/rss2
index cf853662371873c9f5cdd89814d8f1c79bd8ce52..a33685687945a1118774954e21034929c783187d 100755 (executable)
@@ -1,8 +1,9 @@
 date: //div[@class='secthead']
-body: //div[@class='sectbody']
+body: (//div[@class='sectbody'])[1]
 title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1)
 author: //span[@class='author']
 strip: //p[@class='caption']
+strip_id_or_class: ad_bin
 strip_id_or_class: rack
 
 test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/
\ No newline at end of file
index 3fc5611b2c9964666b671da5cd293fb56f8600a4..36864197bd88d0c57c39b89b325d11c3a6485123 100755 (executable)
@@ -1,5 +1,6 @@
 title: //div[contains(@class, 'articleHead')]//h1
 
+body: //div[@itemprop='articleBody']
 body: //div[@class='articleText']
 body: //div[@class='articleContent']
 body: //div[@id='article']
@@ -13,10 +14,14 @@ strip: //p[contains(., 'This article available online at:')]
 strip: //p[contains(., 'This article available online at:')]/following::*
 strip: //div[@class='earthbox']
 
-single_page_link: //article//a[contains(@class, 'print')]
+single_page_link: //div[contains(@class, 'article-tools')]//a[contains(@class, 'print')]
 
 native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')]
 
+#multi-page article
+test_url: http://www.theatlantic.com/magazine/archive/2014/12/the-real-roots-of-midlife-crisis/382235/
+test_contains: The curve tends to evince itself
+
 test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/
 test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/
-test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/
\ No newline at end of file
+test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/
diff --git a/inc/3rdparty/site_config/standard/theaustralian.com.au.txt b/inc/3rdparty/site_config/standard/theaustralian.com.au.txt
new file mode 100755 (executable)
index 0000000..1245efc
--- /dev/null
@@ -0,0 +1,6 @@
+body: //div[contains(@class, 'story-body')]
+author: //cite[contains(@class, 'author')]
+date: //span[@class='datestamp']
+
+strip: //div[@class='story-info']
+
index b74442de535b9ccedf2d2b9a26ff1a72312e8bd8..808876daca9490e33d47ebefe112d35d71e75a2c 100755 (executable)
@@ -1,7 +1,7 @@
 title: //meta[@name='og:title']/@content
 date: //meta[@name='created']/@content
-body: //div[@class="StoryBody" or @class="storyTeaser"]
+body: //div[contains(@class, "article-body")]
 
 replace_string(<p></p>): <br /><br />
 
-test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html
\ No newline at end of file
+test_url: http://www.wcvb.com/news/2-teens-arrested-in-fatal-dorchester-shooting-of-16yearold-boy/33564886
index 88e2ecf4e29f74874621923e34bdeca5b9dc184c..c8b70e6fdfe8ffec6345759b956842f149e185e8 100755 (executable)
@@ -1,5 +1,5 @@
 title: //div[@id='main-article-info']//h1
-body: //div[@id='article-wrapper']
+body: //figure[contains(@itemprop, "associatedMedia")] | //div[contains(@itemprop, "articleBody")]
 date: //li[@class='publication']//time[@pubdate] | //li[@class='publication']//data[@pubdate]
 strip: //div[contains(@class, 'email-subscription')]
 strip: //div[contains(@class, 'kindleWidget')]
@@ -11,6 +11,8 @@ native_ad_clue: //meta[@property="video:tag" and contains(@content, "Partner zon
 prune: no
 tidy: no
 
+strip_id_or_class: -expand-
+
 test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption
 test_contains: The National Security Agency has made repeated attempts to develop
 test_contains: The agency did not directly address those questions, instead providing a statement.
index 5d30230d6bdec75d07915ef76f71d4c3ecd65d87..70d3d4373e02e3773ffcb4bcdb55443d1e860cf2 100755 (executable)
@@ -1,8 +1,9 @@
-# Updated 25-Jan-2014
-single_page_link: //a[contains(@href, '/Print/')]
+single_page_link: //link[contains(@href, 'm.theregister')]
+if_page_contains: //div[@id='nextpage']
+strip: //div[@class='wptl btm']
+body: //div[contains(@class,'article_head')]//h2 | //div[@id='body']
 
-title: //div[@id="article"]/h2
-author: //p[@class="byline"]/a
-date: //p[@class="dateline"]/a[last()]
-
-test_url: http://www.theregister.co.uk/2014/01/24/thirty_years_of_the_apple_macintosh_part_2/
+#multipage
+test_url: http://www.theregister.co.uk/2015/07/06/geeks_guide_spaceguard_center/
+#singlepage
+test_url: http://www.theregister.co.uk/2015/07/06/us_japan_massive_robots_in_the_ring/
index 78f8654a00fd7a8fd71d803278d2c86684fdd992..cee50c9be4e9c4bf6fde4e6ad4ba701965f95204 100755 (executable)
@@ -33,6 +33,8 @@ strip_id_or_class: m-ad
 strip_id_or_class: social-sharing
 strip_id_or_class: m-video-entry__excerpt
 strip_id_or_class: hidden
+strip_id_or_class: m-article__follow-bar
+strip_id_or_class: m-article__share-buttons
 
 replace_string(<noscript>): <div>
 replace_string(</noscript>): </div>
diff --git a/inc/3rdparty/site_config/standard/titanic-magazin.de.txt b/inc/3rdparty/site_config/standard/titanic-magazin.de.txt
new file mode 100755 (executable)
index 0000000..70108e3
--- /dev/null
@@ -0,0 +1,8 @@
+body: //div[contains(@class, 'tt_news-bodytext')]
+
+# cut html short
+find_string: <!--TYPO3SEARCH_end-->
+replace_string: </div></body></html>
+
+test_url: http://www.titanic-magazin.de/ich.war.bei.der.waffen.rss
+test_url: http://www.titanic-magazin.de/news/wenig-bekannte-fakten-ueber-2014-6986/
\ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/truongtx.me.txt b/inc/3rdparty/site_config/standard/truongtx.me.txt
new file mode 100755 (executable)
index 0000000..6b10adc
--- /dev/null
@@ -0,0 +1,9 @@
+prune: false
+tidy: false
+
+body: //div[@class='col-md-9']
+author: //meta[@name='author']/@content
+date: //i[@class='fa fa-calendar']/../span
+title: //div[@class='page-header']/h1
+
+test_url: https://truongtx.me/2014/04/20/emacs-javascript-completion-and-refactoring/
index d37911bc9d2ab33065a2c9cc7b79b5e5c487fa56..c4593d55644dc0ab36f5608e3ee4109f39ec9d0c 100755 (executable)
@@ -1,5 +1,2 @@
-title: //h1
-author: //*[@class='byline']
-date: substring-after(//*[@class='pubdatetime'], 'Published: ')
-body: //*[@class='body-block']
-test_url: http://utdailybeacon.com/news/2012/oct/8/energy-forum-continues/
\ No newline at end of file
+body: //div[@id='blox-story-text']
+test_url: http://www.utdailybeacon.com/news/article_ccf6d024-0f15-11e5-ae29-9f63598deb81.html
diff --git a/inc/3rdparty/site_config/standard/volkskrant.nl.txt b/inc/3rdparty/site_config/standard/volkskrant.nl.txt
new file mode 100755 (executable)
index 0000000..c277034
--- /dev/null
@@ -0,0 +1,15 @@
+#bypass cookie check
+single_page_link: //a[contains(@href, '/cookiewall/accept.do?')]
+
+title: //h1[@itemprop="headline"]
+body: //figure[contains(@class, 'article__top-image')] | //div[@itemprop="articleBody"]
+
+strip: //div[contains(@class, 'media-container') and contains(@class, 'pull-right')]
+
+tidy: no
+prune: no
+
+test_url: http://www.volkskrant.nl/sport/dossier-wereldvoetbalbond-fifa-wankelt~a4042695/
+test_contains: De belangrijkste spil in het
+
+test_url: http://www.volkskrant.nl/nieuws/rss.xml
\ No newline at end of file
index 21f56352f0d95e49d80dc3f1974df0bec91f964d..6215b7277828a5a24bd44f2ca98dde584cf7cc8b 100755 (executable)
@@ -1,3 +1,2 @@
-title: //h3
-body: //div[@class="content_wysiwyg"]
-test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html
\ No newline at end of file
+body: //div[@class="article-body"]
+test_url: https://www.warnerbros.fr/articles/magic-mike-xxl-adam-rodriguez-portrait
index 0aa9f1d821741746b4237c7373a3ec09d70d294d..c29af00f517265e72b77c3e35ed9a034f0d96de2 100755 (executable)
@@ -5,9 +5,14 @@ body: //div[contains(@class, "article_body")]
 body: //div[@id='print_facet']//div[@id='body']
 
 author://meta[@name='DC.creator']/@content
+author://span[@class="pb-byline"]
+author://h3[@property='dc.creator']//a[@rel='author']
 title://meta[@name='title']/@content
 date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title
 date://meta[@name="DC.date.issued"]/@content
+date://span[contains(@class,"pb-timestamp")]
+date://meta[@name="eomportal-lastUpdate"]/@content
+
 strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"]
 strip://div[@id="wp-column six end"]
 strip://div[contains(@class,'hidden')]
@@ -23,6 +28,7 @@ strip_id_or_class: module
 
 # Change gJQAwdJG4U_story.html to gJQAwdJG4U_print.html
 single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html")
+if_page_contains: //link[@rel="canonical" and contains(@href, '_story.html')]
 
 # [OLD] Change gJQAwdJG4U_story.html to gJQAwdJG4U_story_print.html
 #single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_story_print.html")
diff --git a/inc/3rdparty/site_config/standard/watoday.com.au.txt b/inc/3rdparty/site_config/standard/watoday.com.au.txt
new file mode 100755 (executable)
index 0000000..4302ac5
--- /dev/null
@@ -0,0 +1,7 @@
+author: //h3[@class="authorName"]
+date: //dd[@class='updated dtstamp']//time
+
+strip: //div[contains(@class, "adspot")]
+strip: //noscript
+strip: //p//small
+
diff --git a/inc/3rdparty/site_config/standard/weeklytimesnow.com.au.txt b/inc/3rdparty/site_config/standard/weeklytimesnow.com.au.txt
new file mode 100755 (executable)
index 0000000..a79871f
--- /dev/null
@@ -0,0 +1,4 @@
+body: //div[@class='main-col' or @class='article-image-wide']
+title: //h1[@class='article-title']
+author: substring-before(//span[@class='author'], "|")
+date: //span[@class='date']
diff --git a/inc/3rdparty/site_config/standard/westernadvocate.com.au.txt b/inc/3rdparty/site_config/standard/westernadvocate.com.au.txt
new file mode 100755 (executable)
index 0000000..eb00f77
--- /dev/null
@@ -0,0 +1,11 @@
+title: //header[contains(@class, "news-article-title")]//h1
+date: //div[@class="news-article-byline"]//time
+author: //h2[@class="news-article-author"]//cite
+
+# Turns out that westernadvocate is doing funky things with the slide show images. :<
+# body: //ul[@class="slides"]//img | //div[contains(@class, "news-article-body")]
+body: //div[contains(@class, "news-article-body")]
+
+strip: //div[contains(@class, "flexslider")]
+
+test_url: http://www.westernadvocate.com.au/story/1532050/roos-accept-ziebell-ban-commentators-do-not/
diff --git a/inc/3rdparty/site_config/standard/wsj.com.txt b/inc/3rdparty/site_config/standard/wsj.com.txt
new file mode 100755 (executable)
index 0000000..467c39c
--- /dev/null
@@ -0,0 +1,29 @@
+title: //meta[@property="og:title"]/@content
+body: //div[@id='wsj-article-wrap']
+# is this still used?
+body: //div[@id='article_story_body']
+
+author: //h3[@class='byline']/a
+# for slide show content
+body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]
+date: //li[@class='dateStamp']/small
+
+strip_id_or_class: insetFullBracket
+strip_id_or_class: insettipBox
+#strip_id_or_class: legacyInset
+strip_id_or_class: recipeACShopAndBuyText
+
+strip: //div[contains(@class, 'insetContent')]//cite
+strip: //*[contains(@style, 'visibility: hidden;')]
+strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]
+strip: //div[contains(@class, 'carousel')]
+
+prune: no
+tidy: no
+
+test_url: http://www.wsj.com/articles/airasia-flight-8501-tail-recovered-1420878809
+test_contains: Saturday evening that the black boxes
+test_url: http://www.wsj.com/news/articles/SB10001424052702304626304579509100018004342
+test_url: http://www.wsj.com/article/SB10001424052970203363504577185322849515102.html
+# slide show
+test_url: http://www.wsj.com/article/SB10001424052970204791104577110550376458164.html
index b46b09e885576550cce4e9e836763e822eac143b..46ee5ba10812f7c3dee69e62bf006b643eb85af1 100755 (executable)
@@ -1,2 +1,2 @@
-body: //div[@class="nxFullTextData"]
-test_url: http://yourerie.com/fulltext?nxd_id=306552
+body: //div[@itemprop="articleBody"]
+test_url: http://www.yourerie.com/news/news-article/d/story/cd-release-party-at-pi-downs/22898/G_gFL3mSQkWH_DW2wLuMOA
index 9815d478f06e3cd269e409ba8ac7f072d7404c28..4e008946c8146cf3d5ac186e68f89a3038cbf302 100755 (executable)
@@ -1,3 +1,4 @@
+# 2015.07.08 [Marvin Dickhaus] fixed single_page_link
 # 2013.10.30 [rezor92] fixed single_page_link
 # 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions
 # 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section)
@@ -5,8 +6,7 @@
 # 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications.
 # 2011-08-20 [carlo@...] added author, fixed date
 
-
-single_page_link: //a[@title='Auf einer Seite']
+single_page_link: //a[contains(@href, 'komplettansicht')]
 tidy: no
 
 title: //title
@@ -24,6 +24,8 @@ strip: //p[@class="copyright"]
 strip: //div[@class="copyright"]
 #Removes pagination links at the end
 strip: //div[@class="pagination"]
+#Removes link to main page at the bottom of some articles (Zur Startseite)
+strip: //a[@href='http://www.zeit.de']
 
 # Fix picture captions
 wrap_in(small): //p[@class="caption"]/text()
@@ -43,3 +45,4 @@ strip_id_or_class:"pagination"
 
 footnotes: no
 test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag
+test_url: http://www.zeit.de/kultur/2015-07/kapitalismuskritik-selbstberuhigung-armin-nassehi