From 90a1a78b1e2f4d40e1d9b8e6f46aca129a9d7bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Mon, 27 Oct 2014 06:46:13 +0100 Subject: updated site_config --- .../site_config/standard/512pixels.net.txt | 4 +- inc/3rdparty/site_config/standard/README.md | 8 +-- .../site_config/standard/alexduner.com.txt | 2 +- .../site_config/standard/anandtech.com.txt | 6 ++- .../site_config/standard/apotheke-adhoc.de.txt | 23 +++++++++ .../site_config/standard/arstechnica.com.txt | 2 + .../site_config/standard/autocar.co.uk.txt | 13 +++++ inc/3rdparty/site_config/standard/bbc.co.uk.txt | 17 +++++- inc/3rdparty/site_config/standard/bbc.com.txt | 60 ++++++++++++++++++++++ inc/3rdparty/site_config/standard/bit-tech.net.txt | 19 +++++++ .../site_config/standard/bleacherreport.com.txt | 16 ++++++ .../site_config/standard/blogs.faz.net.txt | 45 ++++++++++++++++ .../site_config/standard/brasil.elpais.com.txt | 7 ++- .../site_config/standard/businessweek.com.txt | 41 +++++---------- inc/3rdparty/site_config/standard/buzzfeed.com.txt | 11 +++- .../site_config/standard/canonrumors.com.txt | 28 ++++++++++ inc/3rdparty/site_config/standard/chomsky.info.txt | 3 +- .../site_config/standard/cn.reuters.com.txt | 6 ++- .../site_config/standard/code.fivefilters.org.txt | 4 +- .../site_config/standard/csmonitor.com.txt | 2 +- .../site_config/standard/da.feedsportal.com.txt | 2 +- .../site_config/standard/designsponge.com.txt | 31 +++++++++++ .../site_config/standard/desitvforum.net.txt | 4 +- .../standard/deutsche-apotheker-zeitung.de.txt | 29 +++++++++++ .../standard/dictionary.reference.com.txt | 8 ++- inc/3rdparty/site_config/standard/dropbox.com.txt | 4 +- .../site_config/standard/echo-online.de.txt | 24 +++++++++ .../site_config/standard/economist.com.txt | 7 ++- .../site_config/standard/eurogamer.net.txt | 13 ++--- inc/3rdparty/site_config/standard/facebook.com.txt | 9 +++- inc/3rdparty/site_config/standard/faz.net.txt | 0 .../site_config/standard/finance.yahoo.com.txt 
| 4 +- .../site_config/standard/fivechapters.com.txt | 2 +- .../site_config/standard/fivefilters.org.txt | 5 +- .../site_config/standard/foreignpolicy.com.txt | 8 ++- inc/3rdparty/site_config/standard/golem.de.txt | 53 +++++++++++-------- inc/3rdparty/site_config/standard/heise.de.txt | 45 +++++++++++++--- .../site_config/standard/hosted.ap.org.txt | 2 +- .../site_config/standard/itunes.apple.com.txt | 14 +++++ .../site_config/standard/kachiblog.com.txt | 2 +- .../site_config/standard/lifehacker.co.uk.txt | 7 +++ inc/3rdparty/site_config/standard/mainpost.de.txt | 2 +- .../site_config/standard/medialens.org.txt | 3 +- inc/3rdparty/site_config/standard/medium.com.txt | 13 +++-- .../site_config/standard/menshealth.com.sg.txt | 12 +++++ .../site_config/standard/northumberlandview.ca.txt | 2 +- inc/3rdparty/site_config/standard/nytimes.com.txt | 6 ++- inc/3rdparty/site_config/standard/real.gr.txt | 6 ++- inc/3rdparty/site_config/standard/reddit.com.txt | 5 +- .../site_config/standard/searchengineland.com.txt | 2 +- .../site_config/standard/sourcebooks.com.txt | 2 +- .../site_config/standard/tabletmag.com.txt | 5 ++ .../site_config/standard/tagesspiegel.de.txt | 60 ++++++++++++++++++++++ inc/3rdparty/site_config/standard/techmeme.com.txt | 2 +- .../site_config/standard/theatlantic.com.txt | 2 + .../site_config/standard/theglobeandmail.com.txt | 7 ++- .../site_config/standard/theguardian.com.txt | 13 ++++- inc/3rdparty/site_config/standard/theverge.com.txt | 7 ++- .../site_config/standard/thisiscolossal.com.txt | 25 +++++++++ .../site_config/standard/towerofthehand.com.txt | 10 ++++ inc/3rdparty/site_config/standard/twitter.com.txt | 3 +- .../site_config/standard/vanityfair.com.txt | 5 +- inc/3rdparty/site_config/standard/wn.de.txt | 18 +++++++ inc/3rdparty/site_config/standard/zeit.de.txt | 4 -- 64 files changed, 685 insertions(+), 119 deletions(-) create mode 100755 inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt create mode 100755 
inc/3rdparty/site_config/standard/autocar.co.uk.txt create mode 100755 inc/3rdparty/site_config/standard/bbc.com.txt create mode 100755 inc/3rdparty/site_config/standard/bit-tech.net.txt create mode 100755 inc/3rdparty/site_config/standard/bleacherreport.com.txt create mode 100755 inc/3rdparty/site_config/standard/blogs.faz.net.txt create mode 100755 inc/3rdparty/site_config/standard/canonrumors.com.txt create mode 100755 inc/3rdparty/site_config/standard/designsponge.com.txt create mode 100755 inc/3rdparty/site_config/standard/deutsche-apotheker-zeitung.de.txt create mode 100755 inc/3rdparty/site_config/standard/echo-online.de.txt mode change 100644 => 100755 inc/3rdparty/site_config/standard/faz.net.txt create mode 100755 inc/3rdparty/site_config/standard/itunes.apple.com.txt create mode 100755 inc/3rdparty/site_config/standard/lifehacker.co.uk.txt create mode 100755 inc/3rdparty/site_config/standard/menshealth.com.sg.txt create mode 100755 inc/3rdparty/site_config/standard/tabletmag.com.txt create mode 100755 inc/3rdparty/site_config/standard/tagesspiegel.de.txt create mode 100755 inc/3rdparty/site_config/standard/thisiscolossal.com.txt create mode 100755 inc/3rdparty/site_config/standard/towerofthehand.com.txt create mode 100755 inc/3rdparty/site_config/standard/wn.de.txt (limited to 'inc/3rdparty/site_config') diff --git a/inc/3rdparty/site_config/standard/512pixels.net.txt b/inc/3rdparty/site_config/standard/512pixels.net.txt index e458980f..02a996f7 100755 --- a/inc/3rdparty/site_config/standard/512pixels.net.txt +++ b/inc/3rdparty/site_config/standard/512pixels.net.txt @@ -1,2 +1,2 @@ -title: substring-before(//title, '—') -test_url: http://512pixels.net/more-on-linked-lists/ \ No newline at end of file +title: //meta[@property='og:title']/@content +test_url: http://www.512pixels.net/blog/2014/10/the-move diff --git a/inc/3rdparty/site_config/standard/README.md b/inc/3rdparty/site_config/standard/README.md index 9040ba85..ab5b12d9 100755 --- 
a/inc/3rdparty/site_config/standard/README.md +++ b/inc/3rdparty/site_config/standard/README.md @@ -1,12 +1,14 @@ Full-Text RSS site config files ================ -[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically. +[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If no rules are found, it tries to detect the content block automatically. -This repository contains the site config files we use in Full-Text RSS. +This repository contains the site-specific extraction rules we rely on in Full-Text RSS. ### Contributing changes +We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for. + We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface. You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model: @@ -31,7 +33,7 @@ Marco, Instapaper's creator, graciously opened up the database of contributions > And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached. 
-Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required). +Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold). ### Testing site config files diff --git a/inc/3rdparty/site_config/standard/alexduner.com.txt b/inc/3rdparty/site_config/standard/alexduner.com.txt index bd9de9d7..3897f9ec 100755 --- a/inc/3rdparty/site_config/standard/alexduner.com.txt +++ b/inc/3rdparty/site_config/standard/alexduner.com.txt @@ -1,4 +1,4 @@ body: //section[@class='content'] date: //span[1] author: //h1[@id='sitetitle'] -test_url: https://alexduner.com/blog/2013/1/something-i-learned-today \ No newline at end of file +test_url: http://alexduner.com/blog/something-i-learned-today diff --git a/inc/3rdparty/site_config/standard/anandtech.com.txt b/inc/3rdparty/site_config/standard/anandtech.com.txt index 7d804918..fc95c5d8 100755 --- a/inc/3rdparty/site_config/standard/anandtech.com.txt +++ b/inc/3rdparty/site_config/standard/anandtech.com.txt @@ -1,3 +1,5 @@ +body: //section[@class='main_cont']/img | //div[@class='articleContent'] +title: //div[@class='blog_top_left']//h2 author: //a[@class='b'][1] date: substring-after(substring-before(//div, 'Posted in'), ' on ') strip_image_src: /content/images/globals/ @@ -8,4 +10,6 @@ prune: no single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) -test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ \ No newline at end of file +test_url: http://www.anandtech.com/show/8370/gigabyte-am1m-s2h-review +test_url: http://www.anandtech.com/show/8402/sandisk-releases-ultra-ii-ssd-the-second-tlc-nand-ssd-in-the-market +test_url: 
http://www.anandtech.com/show/8400/arms-cortex-m-even-smaller-and-lower-power-cpu-cores diff --git a/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt b/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt new file mode 100755 index 00000000..3a702e7b --- /dev/null +++ b/inc/3rdparty/site_config/standard/apotheke-adhoc.de.txt @@ -0,0 +1,23 @@ +# Author: zinnober + +prune: no + +title: substring-before(//div[@id='content']/h1, ',') + +single_page_link: //a[@title='Seite drucken'] + +body: //div[@id='detail-body'] + +replace_string(): +replace_string(

):

+ +# Fix headlines +replace_string(Patrick Hollstein):   +replace_string(APOTHEKE ADHOC):   +replace_string(dpa):   +replace_string(Katharina Lübke):   +replace_string(Julia Pradel):   +replace_string(Franziska Gerhardt):   + +test_url: http://www.apotheke-adhoc.de/nachrichten/politik/nachricht-detail-politik/deutscher-apothekertag-antraege-gegen-lieferengpaesse-2/ + diff --git a/inc/3rdparty/site_config/standard/arstechnica.com.txt b/inc/3rdparty/site_config/standard/arstechnica.com.txt index 767f6800..eb92aa2c 100755 --- a/inc/3rdparty/site_config/standard/arstechnica.com.txt +++ b/inc/3rdparty/site_config/standard/arstechnica.com.txt @@ -13,5 +13,7 @@ title: //div[@id='story']//h2[@class='title'] strip: //div[@class='pager'] next_page_link: //nav//a[span/@class='next']/@href +native_ad_clue: //meta[@property="og:url" and contains(@content, '/sponsored/')] + test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ diff --git a/inc/3rdparty/site_config/standard/autocar.co.uk.txt b/inc/3rdparty/site_config/standard/autocar.co.uk.txt new file mode 100755 index 00000000..9f4fe18b --- /dev/null +++ b/inc/3rdparty/site_config/standard/autocar.co.uk.txt @@ -0,0 +1,13 @@ +title: //div[@class='col-center']/h1 +author: //div[@class='personality']/a +date: //div[@class='personality-date'] +body: //div[@class='content-top ']//div[@class='content'][1] | //div[contains(@class,'article-body')] | //div[contains(@class,'main-article')] + +next_page_link: //div[@id='review-link']/a + +strip: //div[@class='author-block'] +strip: //p//iframe[contains(@src,'signup')]/preceding::p[1] + +test_url: http://www.autocar.co.uk/car-review/volkswagen/golf +test_url: http://www.autocar.co.uk/car-news/pebble-beach/saleen-unveils-performance-electric-vehicle-based-tesla-model-s +test_url: 
http://www.autocar.co.uk/car-review/rolls-royce/first-drives/rolls-royce-ghost-series-ii-first-drive-review diff --git a/inc/3rdparty/site_config/standard/bbc.co.uk.txt b/inc/3rdparty/site_config/standard/bbc.co.uk.txt index ef1f491a..bad77654 100755 --- a/inc/3rdparty/site_config/standard/bbc.co.uk.txt +++ b/inc/3rdparty/site_config/standard/bbc.co.uk.txt @@ -13,7 +13,7 @@ body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] #strip: //div[@class="story-feature narrow"] #strip: //div[@class="story-feature wide"] #strip: //div[@class="story-feature dslideshow-enclosure"] -strip: //div[contains(@class, "story-feature")] +strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))] strip: //span[@class="story-date"] #strip: //div[@class="caption body-narrow-width"] strip: //div[@class="warning"]//p @@ -30,13 +30,26 @@ strip: //div[contains(@class, 'comment-introduction')] strip: //div[contains(@class, 'share-tools')] strip: //div[@id='also-related-links'] +strip_id_or_class: share-help +strip_id_or_class: comments_module + replace_string(

replace_string():
+tidy: no prune: no dissolve: //h2 + test_url: http://www.bbc.co.uk/sport/0/football/23224017 +test_contains: Swansea City have completed the club-record signing + test_url: http://www.bbc.co.uk/news/business-15060862 +test_contains: Europe's leaders are meeting again to try to solve + +# news feed +test_url: http://feeds.bbci.co.uk/news/rss.xml +# sports feed +test_url: http://feeds.bbci.co.uk/sport/0/football/rss.xml?edition=int # video entry -test_url: http://www.bbc.co.uk/news/world-asia-22056933 \ No newline at end of file +test_url: http://www.bbc.co.uk/news/world-asia-22056933 diff --git a/inc/3rdparty/site_config/standard/bbc.com.txt b/inc/3rdparty/site_config/standard/bbc.com.txt new file mode 100755 index 00000000..c04a683e --- /dev/null +++ b/inc/3rdparty/site_config/standard/bbc.com.txt @@ -0,0 +1,60 @@ +body: //div[@class="story-body"] +# for video entries +body: //div[contains(@class, "videoInStory") or @id="meta-information"] +title: //h1[@class="story-header"] +date: //span[@class="story-date"]/span[@class='date'] +# for sport site +date: //meta[@name='DCTERMS.created']/@content +author: //div[@id='headline']//span[@class='byline-name'] + +# recipes, e.g. 
http://www.bbc.co.uk/food/recipes/mymincepies_71055 +body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] + +#strip: //div[@class="story-feature narrow"] +#strip: //div[@class="story-feature wide"] +#strip: //div[@class="story-feature dslideshow-enclosure"] +strip: //div[contains(@class, "story-feature") and not(contains(@class, 'full-width'))] +strip: //span[@class="story-date"] +#strip: //div[@class="caption body-narrow-width"] +strip: //div[@class="warning"]//p +strip: //div[@id='page-bookmark-links-head'] +strip: //object +strip: //div[contains(@class, "bbccom_advert_placeholder")] +strip: //div[contains(@class, "embedded-hyper")] +strip: //div[contains(@class, 'market-data')] +strip: //a[contains(@class, 'hidden')] +strip: //div[contains(@class, 'hypertabs')] +strip: //div[contains(@class, 'related')] +strip: //form[@id='comment-form'] +strip: //div[contains(@class, 'comment-introduction')] +strip: //div[contains(@class, 'share-tools')] +strip: //div[@id='also-related-links'] + +strip_id_or_class: share-help +strip_id_or_class: comments_module + +replace_string(