aboutsummaryrefslogtreecommitdiffhomepage
path: root/inc/3rdparty/site_config/standard
diff options
context:
space:
mode:
authorThomas Citharel <tcit@tcit.fr>2014-10-27 09:28:30 +0100
committerThomas Citharel <tcit@tcit.fr>2014-10-27 09:28:30 +0100
commit24479b479d6a9fc406c92def1f7609fbfa142bcd (patch)
tree8cf17d3e8a13bf21d9709c1df7c790107ab5c1f9 /inc/3rdparty/site_config/standard
parent4a50075784bb13ed0764a8a175779d9683782846 (diff)
parent90a1a78b1e2f4d40e1d9b8e6f46aca129a9d7bcf (diff)
downloadwallabag-24479b479d6a9fc406c92def1f7609fbfa142bcd.tar.gz
wallabag-24479b479d6a9fc406c92def1f7609fbfa142bcd.tar.zst
wallabag-24479b479d6a9fc406c92def1f7609fbfa142bcd.zip
Merge pull request #888 from wallabag/updated-site-config
updated site_config
Diffstat (limited to 'inc/3rdparty/site_config/standard')
-rwxr-xr-xinc/3rdparty/site_config/standard/512pixels.net.txt4
-rwxr-xr-xinc/3rdparty/site_config/standard/README.md8
-rwxr-xr-xinc/3rdparty/site_config/standard/alexduner.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/anandtech.com.txt6
-rwxr-xr-xinc/3rdparty/site_config/standard/apotheke-adhoc.de.txt23
-rwxr-xr-xinc/3rdparty/site_config/standard/arstechnica.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/autocar.co.uk.txt13
-rwxr-xr-xinc/3rdparty/site_config/standard/bbc.co.uk.txt17
-rwxr-xr-xinc/3rdparty/site_config/standard/bbc.com.txt60
-rwxr-xr-xinc/3rdparty/site_config/standard/bit-tech.net.txt19
-rwxr-xr-xinc/3rdparty/site_config/standard/bleacherreport.com.txt16
-rwxr-xr-xinc/3rdparty/site_config/standard/blogs.faz.net.txt45
-rwxr-xr-xinc/3rdparty/site_config/standard/brasil.elpais.com.txt7
-rwxr-xr-xinc/3rdparty/site_config/standard/businessweek.com.txt41
-rwxr-xr-xinc/3rdparty/site_config/standard/buzzfeed.com.txt11
-rwxr-xr-xinc/3rdparty/site_config/standard/canonrumors.com.txt28
-rwxr-xr-xinc/3rdparty/site_config/standard/chomsky.info.txt3
-rwxr-xr-xinc/3rdparty/site_config/standard/cn.reuters.com.txt6
-rwxr-xr-xinc/3rdparty/site_config/standard/code.fivefilters.org.txt4
-rwxr-xr-xinc/3rdparty/site_config/standard/csmonitor.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/da.feedsportal.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/designsponge.com.txt31
-rwxr-xr-xinc/3rdparty/site_config/standard/desitvforum.net.txt4
-rwxr-xr-xinc/3rdparty/site_config/standard/deutsche-apotheker-zeitung.de.txt29
-rwxr-xr-xinc/3rdparty/site_config/standard/dictionary.reference.com.txt8
-rwxr-xr-xinc/3rdparty/site_config/standard/dropbox.com.txt4
-rwxr-xr-xinc/3rdparty/site_config/standard/echo-online.de.txt24
-rwxr-xr-xinc/3rdparty/site_config/standard/economist.com.txt7
-rwxr-xr-xinc/3rdparty/site_config/standard/eurogamer.net.txt13
-rwxr-xr-xinc/3rdparty/site_config/standard/facebook.com.txt9
-rwxr-xr-x[-rw-r--r--]inc/3rdparty/site_config/standard/faz.net.txt0
-rwxr-xr-xinc/3rdparty/site_config/standard/finance.yahoo.com.txt4
-rwxr-xr-xinc/3rdparty/site_config/standard/fivechapters.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/fivefilters.org.txt5
-rwxr-xr-xinc/3rdparty/site_config/standard/foreignpolicy.com.txt8
-rwxr-xr-xinc/3rdparty/site_config/standard/golem.de.txt53
-rwxr-xr-xinc/3rdparty/site_config/standard/heise.de.txt45
-rwxr-xr-xinc/3rdparty/site_config/standard/hosted.ap.org.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/itunes.apple.com.txt14
-rwxr-xr-xinc/3rdparty/site_config/standard/kachiblog.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/lifehacker.co.uk.txt7
-rwxr-xr-xinc/3rdparty/site_config/standard/mainpost.de.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/medialens.org.txt3
-rwxr-xr-xinc/3rdparty/site_config/standard/medium.com.txt13
-rwxr-xr-xinc/3rdparty/site_config/standard/menshealth.com.sg.txt12
-rwxr-xr-xinc/3rdparty/site_config/standard/northumberlandview.ca.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/nytimes.com.txt6
-rwxr-xr-xinc/3rdparty/site_config/standard/real.gr.txt6
-rwxr-xr-xinc/3rdparty/site_config/standard/reddit.com.txt5
-rwxr-xr-xinc/3rdparty/site_config/standard/searchengineland.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/sourcebooks.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/tabletmag.com.txt5
-rwxr-xr-xinc/3rdparty/site_config/standard/tagesspiegel.de.txt60
-rwxr-xr-xinc/3rdparty/site_config/standard/techmeme.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/theatlantic.com.txt2
-rwxr-xr-xinc/3rdparty/site_config/standard/theglobeandmail.com.txt7
-rwxr-xr-xinc/3rdparty/site_config/standard/theguardian.com.txt13
-rwxr-xr-xinc/3rdparty/site_config/standard/theverge.com.txt7
-rwxr-xr-xinc/3rdparty/site_config/standard/thisiscolossal.com.txt25
-rwxr-xr-xinc/3rdparty/site_config/standard/towerofthehand.com.txt10
-rwxr-xr-xinc/3rdparty/site_config/standard/twitter.com.txt3
-rwxr-xr-xinc/3rdparty/site_config/standard/vanityfair.com.txt5
-rwxr-xr-xinc/3rdparty/site_config/standard/wn.de.txt18
-rwxr-xr-xinc/3rdparty/site_config/standard/zeit.de.txt4
64 files changed, 685 insertions, 119 deletions
diff --git a/inc/3rdparty/site_config/standard/512pixels.net.txt b/inc/3rdparty/site_config/standard/512pixels.net.txt
index e458980f..02a996f7 100755
--- a/inc/3rdparty/site_config/standard/512pixels.net.txt
+++ b/inc/3rdparty/site_config/standard/512pixels.net.txt
@@ -1,2 +1,2 @@
1title: substring-before(//title, '&mdash;') 1title: //meta[@property='og:title']/@content
2test_url: http://512pixels.net/more-on-linked-lists/ \ No newline at end of file 2test_url: http://www.512pixels.net/blog/2014/10/the-move
diff --git a/inc/3rdparty/site_config/standard/README.md b/inc/3rdparty/site_config/standard/README.md
index 9040ba85..ab5b12d9 100755
--- a/inc/3rdparty/site_config/standard/README.md
+++ b/inc/3rdparty/site_config/standard/README.md
@@ -1,12 +1,14 @@
1Full-Text RSS site config files 1Full-Text RSS site config files
2================ 2================
3 3
4[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically. 4[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no rules are found, it tries to detect the content block automatically.
5 5
6This repository contains the site config files we use in Full-Text RSS. 6This repository contains the site-specific extraction rules we rely on in Full-Text RSS.
7 7
8### Contributing changes 8### Contributing changes
9 9
10We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for.
11
10We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface. 12We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
11 13
12You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model: 14You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
@@ -31,7 +33,7 @@ Marco, Instapaper's creator, graciously opened up the database of contributions
31 33
32> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached. 34> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
33 35
34Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required). 36Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold).
35 37
36### Testing site config files 38### Testing site config files
37 39
diff --git a/inc/3rdparty/site_config/standard/alexduner.com.txt b/inc/3rdparty/site_config/standard/alexduner.com.txt
index bd9de9d7..3897f9ec 100755
--- a/inc/3rdparty/site_config/standard/alexduner.com.txt
+++ b/inc/3rdparty/site_config/standard/alexduner.com.txt
@@ -1,4 +1,4 @@
1body: //section[@class='content'] 1body: //section[@class='content']
2date: //span[1] 2date: //span[1]
3author: //h1[@id='sitetitle'] 3author: //h1[@id='sitetitle']
4test_url: https://alexduner.com/blog/2013/1/something-i-learned-today \ No newline at end of file 4test_url: http://alexduner.com/blog/something-i-learned-today
diff --git a/inc/3rdparty/site_config/standard/anandtech.com.txt b/inc/3rdparty/site_config/standard/anandtech.com.txt
index 7d804918..fc95c5d8 100755
--- a/inc/3rdparty/site_config/standard/anandtech.com.txt
+++ b/inc/3rdparty/site_config/standard/anandtech.com.txt
@@ -1,3 +1,5 @@
1body: //section[@class='main_cont']/img | //div[@class='articleContent']
2title: //div[@class='blog_top_left']//h2
1author: //a[@class='b'][1] 3author: //a[@class='b'][1]
2date: substring-after(substring-before(//div, 'Posted in'), ' on ') 4date: substring-after(substring-before(//div, 'Posted in'), ' on ')
3strip_image_src: /content/images/globals/ 5strip_image_src: /content/images/globals/
@@ -8,4 +10,6 @@ prune: no
8 10
9single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) 11single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
10 12
11test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ \ No newline at end of file 13test_url: http://www.anandtech.com/show/8370/gigabyte-am1m-s2h-review
14test_url: http://www.anandtech.com/show/8402/sandisk-releases-ultra-ii-ssd-the-second-tlc-nand-ssd-in-the-market
15test_url: http://www.anandtech.com/show/8400/arms-cortex-m-even-smaller-and-lower-power-cpu-cores
diff --git a/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt b/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt
new file mode 100755
index 00000000..3a702e7b
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt
@@ -0,0 +1,23 @@
1# Author: zinnober
2
3prune: no
4
5title: substring-before(//div[@id='content']/h1, ',')
6
7single_page_link: //a[@title='Seite drucken']
8
9body: //div[@id='detail-body']
10
11replace_string(<span class="description">): <em>
12replace_string(<p class="leadtext"><small>): <p class="leadtext">
13
14# Fix headlines
15replace_string(Patrick Hollstein): &nbsp;
16replace_string(APOTHEKE ADHOC): &nbsp;
17replace_string(dpa): &nbsp;
18replace_string(Katharina Lübke): &nbsp;
19replace_string(Julia Pradel): &nbsp;
20replace_string(Franziska Gerhardt): &nbsp;
21
22test_url: http://www.apotheke-adhoc.de/nachrichten/politik/nachricht-detail-politik/deutscher-apothekertag-antraege-gegen-lieferengpaesse-2/
23
diff --git a/inc/3rdparty/site_config/standard/arstechnica.com.txt b/inc/3rdparty/site_config/standard/arstechnica.com.txt
index 767f6800..eb92aa2c 100755
--- a/inc/3rdparty/site_config/standard/arstechnica.com.txt
+++ b/inc/3rdparty/site_config/standard/arstechnica.com.txt
@@ -13,5 +13,7 @@ title: //div[@id='story']//h2[@class='title']
13strip: //div[@class='pager'] 13strip: //div[@class='pager']
14next_page_link: //nav//a[span/@class='next']/@href 14next_page_link: //nav//a[span/@class='next']/@href
15 15
16native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')]
17
16test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars 18test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
17test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ 19test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
diff --git a/inc/3rdparty/site_config/standard/autocar.co.uk.txt b/inc/3rdparty/site_config/standard/autocar.co.uk.txt
new file mode 100755
index 00000000..9f4fe18b
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/autocar.co.uk.txt
@@ -0,0 +1,13 @@
1title: //div[@class='col-center']/h1
2author: //div[@class='personality']/a
3date: //div[@class='personality-date']
4body: //div[@class='content-top ']//div[@class='content'][1] | //div[contains(@class,'article-body')] | //div[contains(@class,'main-article')]
5
6next_page_link: //div[@id='review-link']/a
7
8strip: //div[@class='author-block']
9strip: //p//iframe[contains(@src,'signup')]/preceding::p[1]
10
11test_url: http://www.autocar.co.uk/car-review/volkswagen/golf
12test_url: http://www.autocar.co.uk/car-news/pebble-beach/saleen-unveils-performance-electric-vehicle-based-tesla-model-s
13test_url: http://www.autocar.co.uk/car-review/rolls-royce/first-drives/rolls-royce-ghost-series-ii-first-drive-review
diff --git a/inc/3rdparty/site_config/standard/bbc.co.uk.txt b/inc/3rdparty/site_config/standard/bbc.co.uk.txt
index ef1f491a..bad77654 100755
--- a/inc/3rdparty/site_config/standard/bbc.co.uk.txt
+++ b/inc/3rdparty/site_config/standard/bbc.co.uk.txt
@@ -13,7 +13,7 @@ body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
13#strip: //div[@class="story-feature narrow"] 13#strip: //div[@class="story-feature narrow"]
14#strip: //div[@class="story-feature wide"] 14#strip: //div[@class="story-feature wide"]
15#strip: //div[@class="story-feature dslideshow-enclosure"] 15#strip: //div[@class="story-feature dslideshow-enclosure"]
16strip: //div[contains(@class, "story-feature")] 16strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))]
17strip: //span[@class="story-date"] 17strip: //span[@class="story-date"]
18#strip: //div[@class="caption body-narrow-width"] 18#strip: //div[@class="caption body-narrow-width"]
19strip: //div[@class="warning"]//p 19strip: //div[@class="warning"]//p
@@ -30,13 +30,26 @@ strip: //div[contains(@class, 'comment-introduction')]
30strip: //div[contains(@class, 'share-tools')] 30strip: //div[contains(@class, 'share-tools')]
31strip: //div[@id='also-related-links'] 31strip: //div[@id='also-related-links']
32 32
33strip_id_or_class: share-help
34strip_id_or_class: comments_module
35
33replace_string(<noscript>): <div> 36replace_string(<noscript>): <div>
34replace_string(</noscript>): </div> 37replace_string(</noscript>): </div>
35 38
39tidy: no
36prune: no 40prune: no
37 41
38dissolve: //h2 42dissolve: //h2
43
39test_url: http://www.bbc.co.uk/sport/0/football/23224017 44test_url: http://www.bbc.co.uk/sport/0/football/23224017
45test_contains: Swansea City have completed the club-record signing
46
40test_url: http://www.bbc.co.uk/news/business-15060862 47test_url: http://www.bbc.co.uk/news/business-15060862
48test_contains: Europe's leaders are meeting again to try to solve
49
50# news feed
51test_url: http://feeds.bbci.co.uk/news/rss.xml
52# sports feed
53test_url: http://feeds.bbci.co.uk/sport/0/football/rss.xml?edition=int
41# video entry 54# video entry
42test_url: http://www.bbc.co.uk/news/world-asia-22056933 \ No newline at end of file 55test_url: http://www.bbc.co.uk/news/world-asia-22056933
diff --git a/inc/3rdparty/site_config/standard/bbc.com.txt b/inc/3rdparty/site_config/standard/bbc.com.txt
new file mode 100755
index 00000000..c04a683e
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/bbc.com.txt
@@ -0,0 +1,60 @@
1body: //div[@class="story-body"]
2# for video entries
3body: //div[contains(@class, "videoInStory") or @id="meta-information"]
4title: //h1[@class="story-header"]
5date: //span[@class="story-date"]/span[@class='date']
6# for sport site
7date: //meta[@name='DCTERMS.created']/@content
8author: //div[@id='headline']//span[@class='byline-name']
9
10# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
11body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
12
13#strip: //div[@class="story-feature narrow"]
14#strip: //div[@class="story-feature wide"]
15#strip: //div[@class="story-feature dslideshow-enclosure"]
16strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))]
17strip: //span[@class="story-date"]
18#strip: //div[@class="caption body-narrow-width"]
19strip: //div[@class="warning"]//p
20strip: //div[@id='page-bookmark-links-head']
21strip: //object
22strip: //div[contains(@class, "bbccom_advert_placeholder")]
23strip: //div[contains(@class, "embedded-hyper")]
24strip: //div[contains(@class, 'market-data')]
25strip: //a[contains(@class, 'hidden')]
26strip: //div[contains(@class, 'hypertabs')]
27strip: //div[contains(@class, 'related')]
28strip: //form[@id='comment-form']
29strip: //div[contains(@class, 'comment-introduction')]
30strip: //div[contains(@class, 'share-tools')]
31strip: //div[@id='also-related-links']
32
33strip_id_or_class: share-help
34strip_id_or_class: comments_module
35
36replace_string(<noscript>): <div>
37replace_string(</noscript>): </div>
38
39native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')]
40
41tidy: no
42prune: no
43
44dissolve: //h2
45
46test_url: http://www.bbc.com/sport/0/football/28918021
47test_contains: Cameroonian footballer Albert Ebosse has died
48
49test_url: http://www.bbc.com/sport/0/football/23224017
50
51test_url: http://www.bbc.com/news/business-15060862
52test_contains: Europe's leaders are meeting again to try
53
54
55# news feed
56test_url: http://feeds.bbci.co.uk/news/rss.xml
57# sports feed
58test_url: http://feeds.bbci.co.uk/sport/0/football/rss.xml?edition=int
59# video entry
60test_url: http://www.bbc.com/news/world-asia-22056933
diff --git a/inc/3rdparty/site_config/standard/bit-tech.net.txt b/inc/3rdparty/site_config/standard/bit-tech.net.txt
new file mode 100755
index 00000000..c6f5b204
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/bit-tech.net.txt
@@ -0,0 +1,19 @@
1body: //div[@id='column_1']
2next_page_link: //div[@class='next']/a[not(contains(@href, '/comments') or contains(@href, '/news/'))]
3prune: no
4
5author: substring-after(//p[@class='byline'], 'by ')
6date: substring-before(substring-after(//p[@class='byline'], 'on '), ' by')
7
8strip: //h1
9strip_id_or_class: socialLinks
10strip_id_or_class: byline
11strip_id_or_class: pageSelector
12strip_id_or_class: articleTabs
13strip_id_or_class: pageNav
14strip_id_or_class: share
15strip_id_or_class: commentsContainer
16strip_id_or_class: below_article_related
17
18test_url: http://www.bit-tech.net/hardware/storage/2014/08/13/ocz-arc-100-240gb-review/1
19test_url: http://www.bit-tech.net/news/bits/2014/08/15/google-trojan/1
diff --git a/inc/3rdparty/site_config/standard/bleacherreport.com.txt b/inc/3rdparty/site_config/standard/bleacherreport.com.txt
new file mode 100755
index 00000000..9205e44e
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/bleacherreport.com.txt
@@ -0,0 +1,16 @@
1body: //div[contains(@class, 'article_pages')]
2
3strip_id_or_class: article_page-header
4strip_id_or_class: paginator
5strip_id_or_class: article_info
6
7find_string: src="data:image
8replace_string: ignore-src="data:image
9find_string: data-defer-src="
10replace_string: src="
11
12prune: no
13
14test_url: http://bleacherreport.com/articles/feed
15test_url: http://bleacherreport.com/articles/2137787-christian-ponders-newborn-daughter-was-named-after-fsu-legend-bobby-bowden
16test_url: http://bleacherreport.com/articles/2137596-college-football-week-1-picks-unlv-runnin-rebels-vs-arizona-wildcats/ \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/blogs.faz.net.txt b/inc/3rdparty/site_config/standard/blogs.faz.net.txt
new file mode 100755
index 00000000..4f2626f1
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/blogs.faz.net.txt
@@ -0,0 +1,45 @@
1# Author: zinnober
2
3tidy: no
4prune: no
5
6# Set author
7author: //a[@rel='author']
8
9# Set date
10date: //span[@class='Datum']
11
12# Content is here
13body: //div[@class='Artikel']
14
15# Tidy up before article
16strip: //div[@id='FAZHeaderNeu']
17strip: //h2[@itemprop='headline']
18strip: //span[@class='Datum']
19strip: //span[@class='Autor']
20strip_id_or_class: ArticlePagerTop
21strip: //div[@class='FAZArtikelEinleitung']/h2
22
23# General cleanup
24strip: //div[@class='clear']
25strip: //span[@class='Bildnachweis']
26strip: //iframe
27strip_id_or_class: Community
28strip: ' · '
29
30# Remove tracking and ads
31strip_image_src: /l.gif?
32strip: //img[@width='1']
33strip_id_or_class: invisible
34strip_id_or_class: Anzeige
35strip_id_or_class: billboard
36
37# Remove clutter after article
38strip_id_or_class: Tagline
39strip_id_or_class: ArtikelAbbinder
40strip_id_or_class: FAZArtikelKommentare
41strip_id_or_class: ArtikelKommentieren
42strip_id_or_class: FAZContentRight
43
44# Try it yourself
45test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/
diff --git a/inc/3rdparty/site_config/standard/brasil.elpais.com.txt b/inc/3rdparty/site_config/standard/brasil.elpais.com.txt
index 0b8feb6a..6a22dcb7 100755
--- a/inc/3rdparty/site_config/standard/brasil.elpais.com.txt
+++ b/inc/3rdparty/site_config/standard/brasil.elpais.com.txt
@@ -19,5 +19,8 @@ strip: //p[@class='nota_pie']
19strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] 19strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')]
20strip: //div[@id='coment' or @id='foros_not'] 20strip: //div[@id='coment' or @id='foros_not']
21 21
22test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html 22test_url: http://brasil.elpais.com/brasil/2014/10/15/politica/1413334841_878730.html
23test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes 23test_contains: O PT quer intensificar a presença do ex-presidente
24
25test_url: http://brasil.elpais.com/brasil/2014/10/13/internacional/1413225730_450761.html
26test_contains: Todos na localidade onde ele nasceu ainda falavam da façanha
diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt
index 03085593..f546b708 100755
--- a/inc/3rdparty/site_config/standard/businessweek.com.txt
+++ b/inc/3rdparty/site_config/standard/businessweek.com.txt
@@ -1,30 +1,17 @@
1# story has several pages, should be detected 1# include the lead graphic in the body, if available
2body: //div[@id='storyBody'] 2body: //div[contains(concat(' ', normalize-space(@id), ' '), ' lead_graphic ')] | //div[contains(concat(' ', normalize-space(@itemprop), ' '), ' articleBody ')]
3body: //div[@id='article_body'] 3title: //h1[contains(concat(' ', normalize-space(@itemprop), ' '), ' headline ')]
4body: //div[@id='story_body'] 4date: //time[contains(concat(' ', normalize-space(@itemprop), ' '), ' datePublished ')]
5 5
6title://h1[@id='article_headline'] 6strip_id_or_class: photo_credit
7 7strip_id_or_class: photo_caption
8# article author 8strip_id_or_class: inline_gallery
9author: //p[@class='author']/a 9# pull quote, often inside a blockquote element
10# story author(s) 10strip_id_or_class: pq
11author: substring-after(//p[@class='byline'], 'By ') 11strip_id_or_class: credit
12 12strip_id_or_class: figcaption
13# article date 13strip_id_or_class: related_item
14date: //span[@class='published_date']
15# story date
16date: //span[@class='date']
17
18date: substring-after(//div[contains(@class,'attributor')],'on')
19strip_id_or_class: inset
20strip: //p/span[@class='photoCredit']
21strip: //h1
22
23strip_id_or_class: page_count
24strip_id_or_class: tools
25strip_id_or_class: pagination
26
27single_page_link: //li[@id='stPrint']/a
28 14
29test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html 15test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html
30test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall \ No newline at end of file 16test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
17test_url: http://www.businessweek.com/articles/2014-07-09/american-apparel-dov-charneys-sleazy-struggle-for-control
diff --git a/inc/3rdparty/site_config/standard/buzzfeed.com.txt b/inc/3rdparty/site_config/standard/buzzfeed.com.txt
index 97dddaee..ea88ea47 100755
--- a/inc/3rdparty/site_config/standard/buzzfeed.com.txt
+++ b/inc/3rdparty/site_config/standard/buzzfeed.com.txt
@@ -10,6 +10,15 @@ date: //time[@data-print='date']
10body: //div[@data-print='body'] 10body: //div[@data-print='body']
11body: //section[@data-print='body'] 11body: //section[@data-print='body']
12 12
13find_string: rel:bf_image_src=
14replace_string: src=
15find_string: src="data:
16replace_string: disabled_src="data:
17
18native_ad_clue: //meta[@property="article:section" and @content="Advertiser"]
19
13# For various things... 20# For various things...
14strip: *[@data-print="ignore"] 21strip: *[@data-print="ignore"]
15test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays \ No newline at end of file 22test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays
23# Native ad
24test_url: http://www.buzzfeed.com/bravo/ways-to-up-your-online-dating-game \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/canonrumors.com.txt b/inc/3rdparty/site_config/standard/canonrumors.com.txt
new file mode 100755
index 00000000..c22cf4f1
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/canonrumors.com.txt
@@ -0,0 +1,28 @@
1# Author: zinnober
2
3tidy: no
4prune: no
5
6# Set title
7title: //h2
8
9date: //li[@class='time']
10
11# Set author
12author: //a[contains(@rel, 'author')]
13
14# Content is here
15body: //div[@id='content']
16
17# Tidy up before article
18strip: //div[@class='meta']
19
20# Tidy up after article
21strip_id_or_class: nr_related_placeholder
22strip_id_or_class: twitter-share-button
23strip_id_or_class: afterpost
24strip_id_or_class: tags
25
26# Try it yourself
27test_url: http://www.canonrumors.com/2014/09/chuck-westfall-talks-canon-eos-7d-mark-ii/
28test_url: http://www.canonrumors.com/2014/09/canon-cinema-eos-captures-space-in-4k-for-new-imax-3d-film/
diff --git a/inc/3rdparty/site_config/standard/chomsky.info.txt b/inc/3rdparty/site_config/standard/chomsky.info.txt
index 31440538..2645f119 100755
--- a/inc/3rdparty/site_config/standard/chomsky.info.txt
+++ b/inc/3rdparty/site_config/standard/chomsky.info.txt
@@ -2,4 +2,5 @@ title: //div[@class='title']
2author: //div[@class='author'] 2author: //div[@class='author']
3prune: no 3prune: no
4 4
5test_url: http://www.chomsky.info/onchomsky/2002----.htm \ No newline at end of file 5test_url: http://www.chomsky.info/onchomsky/2002----.htm
6test_contains: The propaganda model argues
diff --git a/inc/3rdparty/site_config/standard/cn.reuters.com.txt b/inc/3rdparty/site_config/standard/cn.reuters.com.txt
index b3878662..28f10472 100755
--- a/inc/3rdparty/site_config/standard/cn.reuters.com.txt
+++ b/inc/3rdparty/site_config/standard/cn.reuters.com.txt
@@ -1,5 +1,9 @@
1title: //div[@id='maincontent']//h1 1title: //div[@id='maincontent']//h1
2body: //div[@id='resizeableText'] 2body: //div[@id='resizeableText']
3 3
4single_page_link: concat(//link[@rel='canonical']/@href, '?sp=true')
5
4test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710 6test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710
5test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews \ No newline at end of file 7test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews
8# multipage link
9test_url: http://cn.reuters.com/article/idCNKBS0FF0UL20140710 \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/code.fivefilters.org.txt b/inc/3rdparty/site_config/standard/code.fivefilters.org.txt
index 269fb547..f8a88cae 100755
--- a/inc/3rdparty/site_config/standard/code.fivefilters.org.txt
+++ b/inc/3rdparty/site_config/standard/code.fivefilters.org.txt
@@ -1 +1,3 @@
1body: //div[@id='content'] 1body: //div[@id='readme']
2
3test_url: http://code.fivefilters.org/full-text-rss
diff --git a/inc/3rdparty/site_config/standard/csmonitor.com.txt b/inc/3rdparty/site_config/standard/csmonitor.com.txt
index b482e34e..70ab9885 100755
--- a/inc/3rdparty/site_config/standard/csmonitor.com.txt
+++ b/inc/3rdparty/site_config/standard/csmonitor.com.txt
@@ -15,4 +15,4 @@ strip_id_or_class: promotion-tag
15tidy: no 15tidy: no
16prune: no 16prune: no
17 17
18test_url: www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84 \ No newline at end of file 18test_url: http://www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84
diff --git a/inc/3rdparty/site_config/standard/da.feedsportal.com.txt b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt
index 381446e5..2bd66be8 100755
--- a/inc/3rdparty/site_config/standard/da.feedsportal.com.txt
+++ b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt
@@ -2,4 +2,4 @@ single_page_link: //a
2tidy: no 2tidy: no
3prune: no 3prune: no
4 4
5test_url: da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm \ No newline at end of file 5test_url: http://da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm
diff --git a/inc/3rdparty/site_config/standard/designsponge.com.txt b/inc/3rdparty/site_config/standard/designsponge.com.txt
new file mode 100755
index 00000000..2cd2f1f6
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/designsponge.com.txt
@@ -0,0 +1,31 @@
1# Author: zinnober
2
3tidy: no
4prune: no
5
6# Set title
7title: //header/h1
8
9# Set author
10author: //a[rel='author']
11
12# Content is here
13body: //article
14
15# Tidy up before article
16strip: //header
17
18# Tidy up article
19strip: //div[contains(@id, 'gallery-')]
20replace_string(<a rel="attachment): <p rel="attachment
21
22
23# Tidy up after article
24strip: //div[@class='sm']
25strip_id_or_class: related
26strip_id_or_class: comments
27strip: //footer
28
29# Try it yourself
30test_url: http://www.designsponge.com/2010/06/seattle-design-guide.html
31test_url: http://www.designsponge.com/2012/04/sneak-peek-liz-cook.html
diff --git a/inc/3rdparty/site_config/standard/desitvforum.net.txt b/inc/3rdparty/site_config/standard/desitvforum.net.txt
index efa85f76..c77007b7 100755
--- a/inc/3rdparty/site_config/standard/desitvforum.net.txt
+++ b/inc/3rdparty/site_config/standard/desitvforum.net.txt
@@ -2,4 +2,6 @@ body: (//blockquote[contains(@class, 'postcontent')])[1]
2body: (//div[starts-with(@id, 'post_message')])[1] 2body: (//div[starts-with(@id, 'post_message')])[1]
3 3
4prune: no 4prune: no
5tidy: no \ No newline at end of file 5tidy: no
6
7test_url: http://www.desitvforum.net/forum/watch-online/431739-creature-3d-2014-watch-online-download-dvd-rip.html
diff --git a/inc/3rdparty/site_config/standard/deutsche-apotheker-zeitung.de.txt b/inc/3rdparty/site_config/standard/deutsche-apotheker-zeitung.de.txt
new file mode 100755
index 00000000..36709cab
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/deutsche-apotheker-zeitung.de.txt
@@ -0,0 +1,29 @@
1# Author: zinnober
2
3prune: yes
4tidy: yes
5
6title: //h1
7date: //p[@class='news_datum']
8author: //span[@class='author']
9
10body: //div[@class='tagesnews-content']
11
12# General clenaup
13strip_id_or_class: dachzeile
14strip: //h3
15strip: //p[@class='bodytext']//a
16strip_id_or_class: autor_datum
17strip_id_or_class: comments
18strip_id_or_class: banner-
19
20strip: //p[contains(., 'Lesen Sie')]
21strip: //p[contains(., '– in DAZ')]
22
23# Fix image captions
24replace_string(<p class="image_caption">): <p><small><em>
25replace_string(</dd>): </em></small></dd>
26
27test_url: http://www.deutsche-apotheker-zeitung.de/pharmazie/news/2014/09/03/weniger-nebenwirkungen-aber-kein-zusatznutzen/13715.html
28test_url: http://www.deutsche-apotheker-zeitung.de/recht/news/2014/09/02/urteile-zum-cannabis-eigenanbau-bfarm-geht-in-berufung/13716.html
29
diff --git a/inc/3rdparty/site_config/standard/dictionary.reference.com.txt b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt
index f8b79c80..b8243d0c 100755
--- a/inc/3rdparty/site_config/standard/dictionary.reference.com.txt
+++ b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt
@@ -1,8 +1,6 @@
1title: //h1[@id='query_h1'] 1body: //div[contains(@class, 'source-data')]
2body: //div[contains(@class, 'lunatext results_content')] 2strip: //button
3strip_id_or_class: spl_unshd
4#replace_string(<div class="dicTl">): <div class="dicTl">------------------<br />
5 3
6prune: no 4prune: no
7 5
8test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ \ No newline at end of file 6test_url: http://dictionary.reference.com/browse/propaganda
diff --git a/inc/3rdparty/site_config/standard/dropbox.com.txt b/inc/3rdparty/site_config/standard/dropbox.com.txt
index 92ae31b2..3b51569f 100755
--- a/inc/3rdparty/site_config/standard/dropbox.com.txt
+++ b/inc/3rdparty/site_config/standard/dropbox.com.txt
@@ -1 +1,3 @@
1single_page_link: //a[@id='download_button_link'] \ No newline at end of file 1single_page_link: //a[@id='download_button_link']
2
3test_url: https://www.dropbox.com/s/qmocfrco2t0d28o/Fluffbeast.docx
diff --git a/inc/3rdparty/site_config/standard/echo-online.de.txt b/inc/3rdparty/site_config/standard/echo-online.de.txt
new file mode 100755
index 00000000..e53de23e
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/echo-online.de.txt
@@ -0,0 +1,24 @@
1# Author: Marvin Dickhaus <github@marvindickhaus.de>
2# 2014-10-08
3
4#Tidy just messes up the DOM
5tidy: no
6
7title: //h1
8body: //h2 | //div[@id='artikelteaser'] | //div[@id='artikeltext']
9
10#Strip
11strip_image_src: artikel_a_merken.gif
12strip: //div[@class='zusatzinfo']
13
14#Author: substring is used to remove the " Von " prefix.
15author: substring(//li[@class='artikelautor'], 5)
16
17date: //li[@class='artikeldatum']
18
19#The first two URLs will at some point no longer show
20#the full article. There is a time-based paywall
21#installed. Using the feed should present valid output
22test_url: http://www.echo-online.de/art1231,5503063
23test_url: http://www.echo-online.de/art1168,5502598
24test_url: http://www.echo-online.de/rss/darmstadt.xml
diff --git a/inc/3rdparty/site_config/standard/economist.com.txt b/inc/3rdparty/site_config/standard/economist.com.txt
index 16c9ed64..8db5fdd6 100755
--- a/inc/3rdparty/site_config/standard/economist.com.txt
+++ b/inc/3rdparty/site_config/standard/economist.com.txt
@@ -1,8 +1,13 @@
1body: //div[@class='main-content'] 1body: //div[@class='main-content']
2body: //article[contains(@class, 'resp-node')]
2date: //time[@class='date-created'] 3date: //time[@class='date-created']
3strip: //aside 4strip: //aside
4prune: no 5prune: no
5 6
6autodetect_next_page: no 7autodetect_next_page: no
7 8
8test_url: http://www.economist.com/node/21528429 \ No newline at end of file 9test_url: http://www.economist.com/node/21528429
10
11test_url: http://www.economist.com/news/essays/21623373-which-something-old-and-powerful-encountered-vault
12test_contains: the calfskin pages are smooth
13test_contains: Books will evolve online and off
diff --git a/inc/3rdparty/site_config/standard/eurogamer.net.txt b/inc/3rdparty/site_config/standard/eurogamer.net.txt
index 8a351667..8931becb 100755
--- a/inc/3rdparty/site_config/standard/eurogamer.net.txt
+++ b/inc/3rdparty/site_config/standard/eurogamer.net.txt
@@ -1,8 +1,9 @@
1body: //div[ @class='content' ] | //div[ @class='blog-entry' ] 1body: //p[@class='strapline'] | //div[@class='cover-image'] | //article[@class='hd']
2strip: //div[@class='social top']
3strip: //p[@class='byline']
2 4
3strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')] 5date: //span[@itemprop='datePublished']
6author: //a[@itemprop='author']/text()
4 7
5date://p[ @class='timestamp' ] 8test_url: http://www.eurogamer.net/articles/2014-08-20-bungie-ordered-to-return-shares-to-composer-marty-odonnell
6 9test_url: http://www.eurogamer.net/articles/2014-08-20-invisible-inc-does-espionage-justice
7author://a[ @class='eurogamer-author' ]
8test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4 \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/facebook.com.txt b/inc/3rdparty/site_config/standard/facebook.com.txt
index 6a492767..26d4f905 100755
--- a/inc/3rdparty/site_config/standard/facebook.com.txt
+++ b/inc/3rdparty/site_config/standard/facebook.com.txt
@@ -1,5 +1,12 @@
1body: //div[@id='imagestage'] 1body: //div[@id='imagestage']
2body: //div[contains(@class, 'userContentWrapper')]
3
4strip_id_or_class: commentable
5
2prune: no 6prune: no
3tidy: no 7tidy: no
4 8
5test_url: https://www.facebook.com/feeds/page.php?id=338077742912613&format=rss20 \ No newline at end of file 9# single_page_link: replace(substring-after(//noscript//meta[@http-equiv="refresh"]/@content, 'URL='), "&amp;", "&")
10
11test_url: https://www.facebook.com/permalink.php?story_fbid=10154584776550183&id=294468630182
12test_contains: holding an extraordinary session in Brussels this month
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt
index 47048a1b..47048a1b 100644..100755
--- a/inc/3rdparty/site_config/standard/faz.net.txt
+++ b/inc/3rdparty/site_config/standard/faz.net.txt
diff --git a/inc/3rdparty/site_config/standard/finance.yahoo.com.txt b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt
index 248522cb..0c967db0 100755
--- a/inc/3rdparty/site_config/standard/finance.yahoo.com.txt
+++ b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt
@@ -5,8 +5,8 @@ strip: //div[contains(@class, 'related-companies')]
5strip: //div[@id='y-article-related'] 5strip: //div[@id='y-article-related']
6strip: //div[@id='ypf-article-related'] 6strip: //div[@id='ypf-article-related']
7prune: no 7prune: no
8tidy: no
8 9
9single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] 10single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')]
10 11
11test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1 12test_url: http://finance.yahoo.com/news/canadian-orebodies-gives-notice-exercise-130000032.html \ No newline at end of file
12test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/fivechapters.com.txt b/inc/3rdparty/site_config/standard/fivechapters.com.txt
index d9c5e42e..9614d2f6 100755
--- a/inc/3rdparty/site_config/standard/fivechapters.com.txt
+++ b/inc/3rdparty/site_config/standard/fivechapters.com.txt
@@ -1,2 +1,2 @@
1body: //div[@class='entry'] 1body: //div[@class='entry']
2test_url: http://www.fivechapters.com/2010/paris-part-one/ \ No newline at end of file 2test_url: http://www.fivechapters.com/2014/the-saddest-writer-in-america-part-two/
diff --git a/inc/3rdparty/site_config/standard/fivefilters.org.txt b/inc/3rdparty/site_config/standard/fivefilters.org.txt
index dc1db432..f37f02b9 100755
--- a/inc/3rdparty/site_config/standard/fivefilters.org.txt
+++ b/inc/3rdparty/site_config/standard/fivefilters.org.txt
@@ -1 +1,4 @@
1prune: no \ No newline at end of file 1body: //section[contains(@class, 'container')]
2prune: no
3
4test_url: http://fivefilters.org/kindle-it/
diff --git a/inc/3rdparty/site_config/standard/foreignpolicy.com.txt b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt
index 4e84b989..853a5b7b 100755
--- a/inc/3rdparty/site_config/standard/foreignpolicy.com.txt
+++ b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt
@@ -1,15 +1,19 @@
1title: //div[@class='translateHead']//h1 | //div[@id='art-mast']//h1 1title: //div[@class='translateHead']//h1 | //div[@id='art-mast']//h1
2author: substring-after(//span[@id='by-line'], 'BY ') 2author: substring-after(//span[@id='by-line'], 'BY ')
3date: //span[@id='pub-date'] 3date: //span[@id='pub-date']
4body: //div[@id='art-mast']/h2 | //div[@class='translateBody'] | //div[@id='art-body'] 4body: (//article//img[contains(@class, 'main_photo')])[1] | (//article//div[contains(@class, 'full_post_content')])[1]
5#body: //div[@id='art-mast']/h2 | //div[@class='translateBody'] | //div[@id='art-body']
5#Strip inside article content 6#Strip inside article content
6strip: //div[@id='share-box'] 7strip: //div[@id='share-box']
7strip: //div[@id='special-box'] 8strip: //div[@id='special-box
9
10strip_id_or_class: side_panel
8 11
9prune: no 12prune: no
10 13
11single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')] 14single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')]
12single_page_link: //a[text()='SINGLE PAGE'] 15single_page_link: //a[text()='SINGLE PAGE']
13 16
17test_url: http://www.foreignpolicy.com/articles/2014/07/22/the_end_game_in_gaza_netanyahu_hamas
14test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me 18test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me
15test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus \ No newline at end of file 19test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/golem.de.txt b/inc/3rdparty/site_config/standard/golem.de.txt
index 6afdebe8..c64860c0 100755
--- a/inc/3rdparty/site_config/standard/golem.de.txt
+++ b/inc/3rdparty/site_config/standard/golem.de.txt
@@ -1,25 +1,34 @@
1# Jens Kohl, jens.kohl@... 1# Author: zinnober
2# - Added publication date 2# Rewrite of original template which fetched the printer-version without pictures
3# - Striped pagination block
4# - Added single page link
5# - Added xpath-querys for the printer friendly version
6 3
7title: //h1 4tidy: no
8body: //div[@class='formatted']
9prune: no 5prune: no
10 6
11date: substring-after(//li[2][@class="text1"], 'Datum:') 7# Set full title
12strip: //ol[@class="list-chapters"] 8title: //h1
13strip_comments: yes 9
14 10date: //time
15# next: commands for printer friendly pages 11
16single_page_link: //a[contains(@href, 'print.php?a=')]/@href 12# Content is here
17title: //body/h3 13body: //article
18strip_image_src: staticrl/images/logo.jpg 14
19strip_image_src: http://cpx.golem.de/cpx.php?class=7 15# Fetch full multipage articles
20strip: //body/h3 16next_page_link: //a[@id='atoc_next']
21strip: //body/b[1] 17
22strip: //body/b[2] 18# Remove tracking and ads
23strip: //body/b[3] 19strip_id_or_class: iqadtile4
24strip: //div[1] 20
25test_url: http://www.golem.de/1112/88696.html \ No newline at end of file 21# General Cleanup
22strip_id_or_class: list-jtoc
23strip_id_or_class: table-jtoc
24strip_id_or_class: implied
25strip_id_or_class: social-
26strip_id_or_class: comments
27strip_id_or_class: footer
28
29# Tidy up galleries (could still be improved, though)
30strip: //img[@src='']
31
32# Try yourself
33test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
34test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html
diff --git a/inc/3rdparty/site_config/standard/heise.de.txt b/inc/3rdparty/site_config/standard/heise.de.txt
index 37a4aaf0..9433104b 100755
--- a/inc/3rdparty/site_config/standard/heise.de.txt
+++ b/inc/3rdparty/site_config/standard/heise.de.txt
@@ -1,9 +1,42 @@
1#second part of single_page_link for telepolis-articles (desktop-version of site) 1# Author: zinnober
2single_page_link: //p[@class='news_option']/a | //a[@id='tp-druckversion'] 2# Template should work well with either desktop or mobile version (m.heise.de)
3 3
4prune: no
5
6title: //article/h1 | //h1
4date: //p[@class='news_datum'] 7date: //p[@class='news_datum']
5title: //h1 8author: //h4[@class='author']
6body: //div[@class='meldung_wrapper'] 9
10body: //article | //div[@class='meldung_wrapper']
11
12# General cleanup
13strip: //time
14strip: //h4[@class='author']
15strip: //p[@class='news_datum']
16strip: //p[@class='artikel_datum']
17strip: //a[contains(@href, 'mailto')]
18strip_id_or_class: comments
19strip_id_or_class: ISI_IGNORE
20strip_id_or_class: clear
21
22strip_id_or_class: linkurl_grossbild
23strip_id_or_class: image-num
24strip_id_or_class: heisebox_right
25strip_id_or_class: dossier
26
27# Strip Ads
28strip_id_or_class: ad_
29
30# Some optimizations
31replace_string(<h5>): <h2>
32replace_string(</h5>): </h2>
33replace_string(<span class="bild_rechts"): <p
34replace_string(<div class="heisebox">): <blockquote>
35
36
37next_page_link: //a[@class='next']
38next_page_link: //a[@title='vor']
7 39
8test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html 40test_url: http://www.heise.de/open/artikel/Die-Neuerungen-von-Linux-3-15-2196231.html
9test_url: http://www.heise.de/tp/artikel/42/42579/1.html 41test_url: http://m.heise.de/open/artikel/Die-Neuerungen-von-Linux-3-15-2196231.html
42test_url: http://www.heise.de/newsticker/meldung/Ueberwachungstechnik-Die-globale-Handy-Standortueberwachung-2301494.html
diff --git a/inc/3rdparty/site_config/standard/hosted.ap.org.txt b/inc/3rdparty/site_config/standard/hosted.ap.org.txt
index dfd81937..a660f23b 100755
--- a/inc/3rdparty/site_config/standard/hosted.ap.org.txt
+++ b/inc/3rdparty/site_config/standard/hosted.ap.org.txt
@@ -2,4 +2,4 @@ body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='en
2tidy: no 2tidy: no
3strip_image_src: analytics.apnewsregistry 3strip_image_src: analytics.apnewsregistry
4 4
5test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50 \ No newline at end of file 5test_url: http://hosted.ap.org/dynamic/stories/E/EU_TURKEY_KURDS?SITE=KSNEW&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2014-10-14-10-50-25
diff --git a/inc/3rdparty/site_config/standard/itunes.apple.com.txt b/inc/3rdparty/site_config/standard/itunes.apple.com.txt
new file mode 100755
index 00000000..ffd95561
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/itunes.apple.com.txt
@@ -0,0 +1,14 @@
1body: //div[@id='left-stack' or contains(@class, 'center-stack')]
2
3find_string: class="artwork" src="
4replace_string: class="artwork" src-disabled="
5find_string: src-swap-high-dpi="
6replace_string: src="
7
8strip_id_or_class: rating
9strip_id_or_class: listeners-also-bought
10
11prune: no
12
13test_url: https://itunes.apple.com/us/rss/topaudiobooks/limit=10/xml
14test_url: https://itunes.apple.com/us/audiobook/the-giver-unabridged/id356345850 \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/kachiblog.com.txt b/inc/3rdparty/site_config/standard/kachiblog.com.txt
index 35baf8df..57ab0de1 100755
--- a/inc/3rdparty/site_config/standard/kachiblog.com.txt
+++ b/inc/3rdparty/site_config/standard/kachiblog.com.txt
@@ -4,4 +4,4 @@ body: //div[@itemprop='articleBody']
4tidy: no 4tidy: no
5 5
6test_url: http://www.kachiblog.com/2013/05/samsung-galaxy-s4-vs-samsung-galaxy.html 6test_url: http://www.kachiblog.com/2013/05/samsung-galaxy-s4-vs-samsung-galaxy.html
7test_url: http://www.kachiblog.com/feeds/posts/default \ No newline at end of file 7test_url: http://www.kachiblog.com/feed
diff --git a/inc/3rdparty/site_config/standard/lifehacker.co.uk.txt b/inc/3rdparty/site_config/standard/lifehacker.co.uk.txt
new file mode 100755
index 00000000..c540f7f3
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/lifehacker.co.uk.txt
@@ -0,0 +1,7 @@
1title: //div[@itemprop='headline']
2body: //noscript/img | //div[@itemprop='text']
3author: //div[@class='meta meta--post']//a[@class='is-author']
4date: //div[@class='meta meta--post']//time/@datetime
5
6test_url: http://www.lifehacker.co.uk/2014/08/22/dealhacker-10-google-chromecast-super-cheap-batteries-much
7test_url: http://www.lifehacker.co.uk/2014/08/18/andrognito-hides-files-youd-like-keep-away-prying-eyes
diff --git a/inc/3rdparty/site_config/standard/mainpost.de.txt b/inc/3rdparty/site_config/standard/mainpost.de.txt
index 2136de3f..2f6382f1 100755
--- a/inc/3rdparty/site_config/standard/mainpost.de.txt
+++ b/inc/3rdparty/site_config/standard/mainpost.de.txt
@@ -25,4 +25,4 @@ strip_id_or_class: 'rightimage'
25#Comments 25#Comments
26strip: //table 26strip: //table
27strip: //p/following-sibling::*[0] 27strip: //p/following-sibling::*[0]
28test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211 \ No newline at end of file 28test_url: http://www.mainpost.de/regional/wuerzburg/Autobahnschuetze-Staatsanwalt-fordert-zwoelf-Jahre;art492151,8386332
diff --git a/inc/3rdparty/site_config/standard/medialens.org.txt b/inc/3rdparty/site_config/standard/medialens.org.txt
index 4c333aa1..c26bac55 100755
--- a/inc/3rdparty/site_config/standard/medialens.org.txt
+++ b/inc/3rdparty/site_config/standard/medialens.org.txt
@@ -1,4 +1,5 @@
1strip_id_or_class: article-tools 1strip_id_or_class: article-tools
2strip_id_or_class: pagenav 2strip_id_or_class: pagenav
3prune: no 3prune: no
4test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html \ No newline at end of file 4test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html
5test_contains: In an era of permanent war, economic meltdown
diff --git a/inc/3rdparty/site_config/standard/medium.com.txt b/inc/3rdparty/site_config/standard/medium.com.txt
index acf7cc90..9e9c6895 100755
--- a/inc/3rdparty/site_config/standard/medium.com.txt
+++ b/inc/3rdparty/site_config/standard/medium.com.txt
@@ -1,7 +1,12 @@
1body: //div[contains(@class, 'post-content-inner')] 1body: //div[contains(@class, 'postContent-inner')]
2strip_id_or_class: follow-ups 2strip_id_or_class: supplementalPostContent
3strip_id_or_class: footer
4 3
5prune: no 4prune: no
6 5
7test_url: https://medium.com/p/6844c0d7893b \ No newline at end of file 6test_url: https://medium.com/@savolai/kaytettavyyden-haasteet-keskustelukulttuurista-2-3-6844c0d7893b
7test_contains: Jos käytettävyysongelmat ovat kerran niin tyypillisiä
8test_contains: Keskustelukulttuuriongelmasta (subjective vs. objective bugs)
9
10test_url: https://medium.com/health-the-future/thirty-things-ive-learned-482765ee3503
11test_contains: Remember you will die
12test_contains: You have to have some faith.
diff --git a/inc/3rdparty/site_config/standard/menshealth.com.sg.txt b/inc/3rdparty/site_config/standard/menshealth.com.sg.txt
new file mode 100755
index 00000000..6a669253
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/menshealth.com.sg.txt
@@ -0,0 +1,12 @@
1strip: //div[contains(@style, 'float:right') and contains(., 'advertisement')]
2body: //div[@style="float:left;width:740px;"]
3
4tidy: no
5
6test_url: http://www.menshealth.com.sg/fitness/mh-picks-under-armour-clutchfit-nitro-mid-cleats
7test_contains: These cleats are made for one thing
8
9test_url: http://www.menshealth.com.sg/fitness/top-10-fat-burning-bodyweight-moves-you-can-do-10-minutes
10test_contains: let this workout fool you
11
12test_url: http://www.menshealth.com.sg/fitness/feed \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/northumberlandview.ca.txt b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt
index 88429a78..f698d98e 100755
--- a/inc/3rdparty/site_config/standard/northumberlandview.ca.txt
+++ b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt
@@ -8,4 +8,4 @@ strip_id_or_class: news_morearticlesincat
8strip_id_or_class: ezc_comments 8strip_id_or_class: ezc_comments
9strip_comments: yes 9strip_comments: yes
10 10
11test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972 \ No newline at end of file 11test_url: http://www.northumberlandview.ca/index.php?module=news&type=user&func=display&sid=31127
diff --git a/inc/3rdparty/site_config/standard/nytimes.com.txt b/inc/3rdparty/site_config/standard/nytimes.com.txt
index 23c9ad11..54735ec7 100755
--- a/inc/3rdparty/site_config/standard/nytimes.com.txt
+++ b/inc/3rdparty/site_config/standard/nytimes.com.txt
@@ -42,8 +42,12 @@ strip://h6[@class = 'kicker']
42author:substring-after(//h6[@class='byline'],'By ') 42author:substring-after(//h6[@class='byline'],'By ')
43 43
44test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html 44test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html
45test_contains: In this column I want to look at a not uncommon way of writing
46
45test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html 47test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html
48test_contains: IF you’ve seen enough of Aaron Sorkin’s theater
49
46test_url: http://www.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html 50test_url: http://www.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html
47test_url: http://www.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html 51test_url: http://www.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html
48test_url: http://www.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html 52test_url: http://www.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html
49test_url: http://www.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html \ No newline at end of file 53test_url: http://www.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html
diff --git a/inc/3rdparty/site_config/standard/real.gr.txt b/inc/3rdparty/site_config/standard/real.gr.txt
index 1a33610d..ce0a3c43 100755
--- a/inc/3rdparty/site_config/standard/real.gr.txt
+++ b/inc/3rdparty/site_config/standard/real.gr.txt
@@ -1,3 +1,5 @@
1body: //div[@id='_ctl12__ctl0_Article'] 1body: //div[contains(@class, 'article-photo-wrapper')]
2prune: no 2prune: no
3autodetect_on_failure: no \ No newline at end of file 3
4test_url: http://www.real.gr/DefaultArthro.aspx?page=arthro&id=360962&catID=1
5test_contains: Επισήμως το αποψινό υπουργικό
diff --git a/inc/3rdparty/site_config/standard/reddit.com.txt b/inc/3rdparty/site_config/standard/reddit.com.txt
index 8871f564..ba342c7c 100755
--- a/inc/3rdparty/site_config/standard/reddit.com.txt
+++ b/inc/3rdparty/site_config/standard/reddit.com.txt
@@ -7,7 +7,7 @@ author: //p[@class="tagline"]/a
7# this doesn't work for some reason...? 7# this doesn't work for some reason...?
8date: //p[@class="tagline"]//@datetime 8date: //p[@class="tagline"]//@datetime
9 9
10body: //div[@class="expando"]//div[@class="usertext-body"] 10body: (//div[contains(@class, 'noncollapsed')]//div[contains(@class, 'usertext-body')])[1]
11 11
12strip_id_or_class: tagline 12strip_id_or_class: tagline
13strip_id_or_class: unvotable-message 13strip_id_or_class: unvotable-message
@@ -17,4 +17,5 @@ strip_id_or_class: buttons
17single_page_link: //p[@class="title"]/a[contains(@href, 'http://')] 17single_page_link: //p[@class="title"]/a[contains(@href, 'http://')]
18 18
19test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/ 19test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/
20test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/ \ No newline at end of file 20test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/
21test_url: http://www.reddit.com/r/WritingPrompts/comments/2786lw/wp_in_a_world_where_puns_are_illegal_one_man/chybk8e \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/searchengineland.com.txt b/inc/3rdparty/site_config/standard/searchengineland.com.txt
index fb6a1074..9ccc5898 100755
--- a/inc/3rdparty/site_config/standard/searchengineland.com.txt
+++ b/inc/3rdparty/site_config/standard/searchengineland.com.txt
@@ -1,4 +1,4 @@
1body: //div[@class="storyBox"] 1body: //div[contains(concat(' ',normalize-space(@class),' '),' article ') and (contains(concat(' ',normalize-space(@class),' '),' clear '))]
2title: //div[@class="storyBox"]/h1 2title: //div[@class="storyBox"]/h1
3author: //a[@rel="author"] 3author: //a[@rel="author"]
4date: substring-before(//span[@class="dateline"], 'by') 4date: substring-before(//span[@class="dateline"], 'by')
diff --git a/inc/3rdparty/site_config/standard/sourcebooks.com.txt b/inc/3rdparty/site_config/standard/sourcebooks.com.txt
index b52169da..86e3df5e 100755
--- a/inc/3rdparty/site_config/standard/sourcebooks.com.txt
+++ b/inc/3rdparty/site_config/standard/sourcebooks.com.txt
@@ -1,4 +1,4 @@
1#grab the actual content div 1#grab the actual content div
2body: //div[@class='rt-article'] 2body: //div[@class='rt-article']
3 3
4test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html \ No newline at end of file 4test_url: http://www.sourcebooks.com/blog/happy-27th-birthday-sourcebooks.html
diff --git a/inc/3rdparty/site_config/standard/tabletmag.com.txt b/inc/3rdparty/site_config/standard/tabletmag.com.txt
new file mode 100755
index 00000000..58b1f5bb
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/tabletmag.com.txt
@@ -0,0 +1,5 @@
1body: //div[contains(@class, 'story-text')]
2
3strip_id_or_class: related
4
5test_url: http://www.tabletmag.com/jewish-news-and-politics/181181/mossberg-parallel-states?all=1 \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/tagesspiegel.de.txt b/inc/3rdparty/site_config/standard/tagesspiegel.de.txt
new file mode 100755
index 00000000..57e7d3df
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/tagesspiegel.de.txt
@@ -0,0 +1,60 @@
1# Author: zinnober
2# Should work with "normal" articles as well as with image galleries
3
4prune: no
5
6# Title
7title: //h1/span[@class='hcf-headline']
8
9# Set author
10author: //a[@rel='author']
11
12# Set date
13date: //span[@class='date hcf-atlas']
14
15# Fetch full multipage articles
16next_page_link: //a[contains(@class, 'hcf-forward')]
17
18# Content is here
19body: //article
20body: //div[contains(@class, 'hcf-screen')]
21
22# Remove tracking and ads
23strip_id_or_class: hcf-ad
24strip_id_or_class: hcf-autoload-ad
25strip_id_or_class: hcf-content-ad
26
27# Tidy up before article
28strip: //article/h1
29strip_id_or_class: hcf-atlas
30strip_id_or_class: hcf-author
31strip_id_or_class: date hcf-atlas
32strip_id_or_class: date hcf-atlas
33
34# General cleanup
35strip: //div[contains(@class, 'hcf-screen')]//h1
36strip: //div[@class='hcf-subpage-titles']//ul
37strip_id_or_class: hcf-doctype-media
38strip_id_or_class: hcf-inline-gallery
39strip_id_or_class: hcf-doctype-video
40strip_id_or_class: hcf-links
41strip_id_or_class: hcf-mini-navi
42strip_id_or_class: hcf-media-control
43strip_id_or_class: hcf-hidden
44replace_string(<span class="hcf-update">Update</span>): <strong>Update: </strong>
45
46# Fix pictures and captions
47replace_string(<a class="hcf-doctype-gallery): <p class="hcf-doctype-gallery
48replace_string(<a class="hcf-doctype-enlarge): <p class="hcf-doctype-enlarge
49replace_string(<figcaption class="hcf-caption">): <br><small><em>
50replace_string(</figcaption>): </em></small>
51
52# Fix image galleries
53replace_string(<a class=" ajaxify): <p class="ajaxify
54replace_string(<div class="hcf-caption"><div><p>): <small><em>
55
56# Try it yourself
57test_url: http://www.tagesspiegel.de/berlin/bezirke/wedding/wedding-jetzt/auf-der-suche-nach-einem-stadtteil-wilder-weiter-wedding/8757156.html
58test_url: http://www.tagesspiegel.de/berlin/olympia-in-berlin-der-flughafen-tegel-soll-das-olympische-dorf-werden/10645036.html
59test_url: http://www.tagesspiegel.de/mediacenter/fotostrecken/berlin/bildergalerie-kreuzberger-der-woche/9305534.html
60
diff --git a/inc/3rdparty/site_config/standard/techmeme.com.txt b/inc/3rdparty/site_config/standard/techmeme.com.txt
index 0b4bfbd6..26eb37b0 100755
--- a/inc/3rdparty/site_config/standard/techmeme.com.txt
+++ b/inc/3rdparty/site_config/standard/techmeme.com.txt
@@ -1,3 +1,3 @@
1single_page_link_in_feed: //b/a 1single_page_link_in_feed: //b/a
2 2
3test_url_feed: http://www.techmeme.com/feed.xml \ No newline at end of file 3test_url: http://www.techmeme.com/feed.xml
diff --git a/inc/3rdparty/site_config/standard/theatlantic.com.txt b/inc/3rdparty/site_config/standard/theatlantic.com.txt
index aa41b153..3fc5611b 100755
--- a/inc/3rdparty/site_config/standard/theatlantic.com.txt
+++ b/inc/3rdparty/site_config/standard/theatlantic.com.txt
@@ -15,6 +15,8 @@ strip: //div[@class='earthbox']
15 15
16single_page_link: //article//a[contains(@class, 'print')] 16single_page_link: //article//a[contains(@class, 'print')]
17 17
18native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')]
19
18test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/ 20test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/
19test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/ 21test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/
20test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ \ No newline at end of file 22test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/theglobeandmail.com.txt b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt
index 750f8473..2473cad2 100755
--- a/inc/3rdparty/site_config/standard/theglobeandmail.com.txt
+++ b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt
@@ -1,5 +1,10 @@
1body: //div[contains(@class, 'entry-content')]//div[contains(@class, 'column-2')]
1single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')] 2single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')]
3strip_id_or_class: entry-related
4strip_id_or_class: entry-sidebar
5strip_id_or_class: entry-pagination
2tidy: no 6tidy: no
3prune: no 7prune: no
4 8
5test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/ \ No newline at end of file 9test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/
10test_url: http://www.theglobeandmail.com/report-on-business/industry-news/energy-and-resources/cliffs-natural-resources-looking-to-exit-ontarios-ring-of-fire/article20651617/ \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/theguardian.com.txt b/inc/3rdparty/site_config/standard/theguardian.com.txt
index c803e4e4..88e2ecf4 100755
--- a/inc/3rdparty/site_config/standard/theguardian.com.txt
+++ b/inc/3rdparty/site_config/standard/theguardian.com.txt
@@ -6,8 +6,19 @@ strip: //div[contains(@class, 'kindleWidget')]
6#strip: //a[not(text())] 6#strip: //a[not(text())]
7strip_id_or_class: pocket-btn 7strip_id_or_class: pocket-btn
8author: //li[@class='byline'] 8author: //li[@class='byline']
9native_ad_clue: //meta[@property="article:tag" and contains(@content, "Partner zone")]
10native_ad_clue: //meta[@property="video:tag" and contains(@content, "Partner zone")]
9prune: no 11prune: no
10tidy: no 12tidy: no
13
11test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption 14test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption
15test_contains: The National Security Agency has made repeated attempts to develop
16test_contains: The agency did not directly address those questions, instead providing a statement.
17
12test_url: http://www.theguardian.com/world/2013/oct/03/edward-snowden-files-john-lanchester 18test_url: http://www.theguardian.com/world/2013/oct/03/edward-snowden-files-john-lanchester
13test_url: http://www.theguardian.com/commentisfree/2014/jun/15/britishness-search-identity-my-part-in-camerons-odyssey \ No newline at end of file 19test_contains: In August, the editor of the Guardian rang me up and asked if I would spend a week in New York
20test_contains: As the second most senior judge in the country, Lord Hoffmann, said in 2004 about a previous version of our anti-terrorism laws
21
22test_url: http://www.theguardian.com/commentisfree/2014/jun/15/britishness-search-identity-my-part-in-camerons-odyssey
23# Native ad
24test_url: http://www.theguardian.com/sustainable-business/2014/jul/18/ben-jerry-turn-ice-cream-into-energy
diff --git a/inc/3rdparty/site_config/standard/theverge.com.txt b/inc/3rdparty/site_config/standard/theverge.com.txt
index 1e1ce58f..78f8654a 100755
--- a/inc/3rdparty/site_config/standard/theverge.com.txt
+++ b/inc/3rdparty/site_config/standard/theverge.com.txt
@@ -15,6 +15,11 @@ strip: //nav
15strip: //img[contains(@class, 'vox-lazy-load')] 15strip: //img[contains(@class, 'vox-lazy-load')]
16# deal with bad parsing 16# deal with bad parsing
17strip: //div[contains(@class, 'story-image')]//div[contains(., 'function(')] 17strip: //div[contains(@class, 'story-image')]//div[contains(., 'function(')]
18strip: //div[contains(@class, 'm-linkset')]
19strip: //div[contains(@class, 'm-entry__sidebar')]
20strip: //ul[contains(@class, 'm-article__sources')]
21strip: //div[contains(@class, 'chorus-emc__content')]
22
18 23
19strip_id_or_class: gallery 24strip_id_or_class: gallery
20strip_id_or_class: article-meta 25strip_id_or_class: article-meta
@@ -45,4 +50,4 @@ test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review
45test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review 50test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review
46test_url: http://www.theverge.com/2013/2/24/4026114/barnes-noble-shifting-focus-away-from-nook-hardware 51test_url: http://www.theverge.com/2013/2/24/4026114/barnes-noble-shifting-focus-away-from-nook-hardware
47test_url: http://www.theverge.com/2014/6/19/5824072/top-shelf-living-the-dream 52test_url: http://www.theverge.com/2014/6/19/5824072/top-shelf-living-the-dream
48test_url: http://www.theverge.com/rss/frontpage \ No newline at end of file 53test_url: http://www.theverge.com/rss/frontpage
diff --git a/inc/3rdparty/site_config/standard/thisiscolossal.com.txt b/inc/3rdparty/site_config/standard/thisiscolossal.com.txt
new file mode 100755
index 00000000..ab16ce18
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/thisiscolossal.com.txt
@@ -0,0 +1,25 @@
1# Author: zinnober
2
3tidy: no
4prune: no
5
6# Set author
7author: //a[contains(@rel, 'author')]
8
9# Content is here
10body: //article
11
12# Tidy up before article
13strip: //header
14
15# Get rid of doubled images
16strip: //img[contains(@class, '-hidden')]
17
18# Tidy up after article
19strip_id_or_class: social-list
20strip_id_or_class: meta-info
21strip: //footer
22
23# Try it yourself
24test_url: http://www.thisiscolossal.com/2014/09/chicago-in-the-fog-by-michael-salisbury/
25test_url: http://www.thisiscolossal.com/2014/09/bird-portraits-ruffling-with-personality-by-leila-jeffreys/
diff --git a/inc/3rdparty/site_config/standard/towerofthehand.com.txt b/inc/3rdparty/site_config/standard/towerofthehand.com.txt
new file mode 100755
index 00000000..a4d87d12
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/towerofthehand.com.txt
@@ -0,0 +1,10 @@
1title: //div[@id='headline']
2body: //div[@class='entry_text']
3author: //div[text() = 'Author:']/following-sibling::div/a
4date: //div[text() = 'Published:']/following-sibling::div
5single_page_link: //a[@href='noscript.html']
6prune: no
7
8test_url: http://towerofthehand.com/blog/2014/08/08-pitch-this-got-spinoff/index.html
9test_url: http://towerofthehand.com/blog/2014/07/31-definitions-and-embodiments/index.html
10test_url: http://towerofthehand.com/blog/2014/07/03-hero-with-thousand-faces/index.html
diff --git a/inc/3rdparty/site_config/standard/twitter.com.txt b/inc/3rdparty/site_config/standard/twitter.com.txt
index 520ebd85..0e5b7487 100755
--- a/inc/3rdparty/site_config/standard/twitter.com.txt
+++ b/inc/3rdparty/site_config/standard/twitter.com.txt
@@ -6,4 +6,5 @@ date: //span[contains(@class, 'js-short-timestamp')]/@data-time
6prune: no 6prune: no
7tidy: no 7tidy: no
8 8
9test_url: https://twitter.com/medialens/status/216883678582804480 \ No newline at end of file 9test_url: https://twitter.com/medialens/status/216883678582804480
10test_contains: is all but alone in challenging the tsunami of UK
diff --git a/inc/3rdparty/site_config/standard/vanityfair.com.txt b/inc/3rdparty/site_config/standard/vanityfair.com.txt
index efa38224..f52339cf 100755
--- a/inc/3rdparty/site_config/standard/vanityfair.com.txt
+++ b/inc/3rdparty/site_config/standard/vanityfair.com.txt
@@ -2,6 +2,7 @@ title: //meta[@property="og:title"]/@content
2author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')] 2author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')]
3date: //div[contains(@class, 'cn_date_time')] 3date: //div[contains(@class, 'cn_date_time')]
4body: //div[contains(@class, 'pageContainers')] 4body: //div[contains(@class, 'pageContainers')]
5body: //div[@id='main']
5body: //article[@id='items-container'] 6body: //article[@id='items-container']
6#body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container'] 7#body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container']
7 8
@@ -26,5 +27,7 @@ strip: //li[@class='blogNavPrev']
26single_page_link: //a[@title='Print this page'] 27single_page_link: //a[@title='Print this page']
27 28
28test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105 29test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105
30test_contains: nothing can take away from the miracle of Tahrir Square
31
29test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808 32test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808
30test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201 \ No newline at end of file 33test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201
diff --git a/inc/3rdparty/site_config/standard/wn.de.txt b/inc/3rdparty/site_config/standard/wn.de.txt
new file mode 100755
index 00000000..ef18c8a5
--- /dev/null
+++ b/inc/3rdparty/site_config/standard/wn.de.txt
@@ -0,0 +1,18 @@
1author: //div[@id='main']//div[@class='col right']//div[contains(@class, 'attribute-author')]
2body: //div[@id='main']//div[@class='col right']
3strip_id_or_class: boxes
4strip_id_or_class: lazy
5strip_id_or_class: comment_box
6strip_id_or_class: fb_comments
7
8find_string: <noscript>
9replace_string: <div>
10find_string: </noscript>
11replace_string: </div>
12
13prune: no
14tidy: no
15
16test_url: http://www.wn.de/Muenster/Kultur/1742956-Wilm-Weppelmann-verlaesst-die-Einsiedelei-Und-dann-ab-unter-die-Dusche
17# feed
18test_url: http://www.wn.de/rss/feed/wn_muenster \ No newline at end of file
diff --git a/inc/3rdparty/site_config/standard/zeit.de.txt b/inc/3rdparty/site_config/standard/zeit.de.txt
index 8c9c1718..9815d478 100755
--- a/inc/3rdparty/site_config/standard/zeit.de.txt
+++ b/inc/3rdparty/site_config/standard/zeit.de.txt
@@ -1,4 +1,3 @@
1# 2014-10-21 [Marmo] added stripping of inline ads and appropriate test_url
2# 2013.10.30 [rezor92] fixed single_page_link 1# 2013.10.30 [rezor92] fixed single_page_link
3# 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions 2# 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions
4# 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section) 3# 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section)
@@ -17,8 +16,6 @@ author: substring-after(//li[@class='source first '], 'Quelle: ')
17 16
18strip_id_or_class: articleheader 17strip_id_or_class: articleheader
19strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"] 18strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"]
20#Remove inline ads
21strip: //div[@class="innerad"]
22 19
23#Removes author and date from the start 20#Removes author and date from the start
24strip: //ul[@class="tools"] 21strip: //ul[@class="tools"]
@@ -46,4 +43,3 @@ strip_id_or_class:"pagination"
46 43
47footnotes: no 44footnotes: no
48test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag 45test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag
49test_url: http://www.zeit.de/wissen/2014-10/ebola-nigeria-who