diff options
Diffstat (limited to 'inc/3rdparty/site_config')
951 files changed, 7577 insertions, 5674 deletions
diff --git a/inc/3rdparty/site_config/standard/24ways.org.txt b/inc/3rdparty/site_config/standard/24ways.org.txt index 03bd1950..86c9e077 100644..100755 --- a/inc/3rdparty/site_config/standard/24ways.org.txt +++ b/inc/3rdparty/site_config/standard/24ways.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='meta']/h2/a | 1 | title: //div[@class='meta']/h2/a |
2 | author: //div[@class='meta']/h2/following-sibling::p/a/text() | 2 | author: //div[@class='meta']/h2/following-sibling::p/a/text() |
3 | date://div[@class='meta']/h2/strong | 3 | date://div[@class='meta']/h2/strong |
4 | body: //div[@id='article'] | 4 | body: //div[@id='article'] |
5 | strip: //div[@class='domore'] | 5 | strip: //div[@class='domore'] |
6 | test_url: http://24ways.org/2011/composing-the-new-canon \ No newline at end of file | 6 | test_url: http://24ways.org/2011/composing-the-new-canon \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/36kr.com.txt b/inc/3rdparty/site_config/standard/36kr.com.txt new file mode 100755 index 00000000..d73d7de5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/36kr.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[contains(@class, 'entry-title')] | ||
2 | date: //meta[@name='weibo: article:create_at']/@content | ||
3 | body: //div[contains(@class, 'mainContent')] | ||
4 | strip_id_or_class: related_topics | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.36kr.com/p/207879.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/37signals.com.txt b/inc/3rdparty/site_config/standard/37signals.com.txt index 43a10ae5..531cac1e 100644..100755 --- a/inc/3rdparty/site_config/standard/37signals.com.txt +++ b/inc/3rdparty/site_config/standard/37signals.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='post_header']//h2/a | 1 | title: //div[@class='post_header']//h2/a |
2 | author: //span[@class='author'] | 2 | author: //span[@class='author'] |
3 | date: //span[@class='date'] | 3 | date: //span[@class='date'] |
4 | body: //div[@id='Content'] | 4 | body: //div[@id='Content'] |
5 | 5 | ||
6 | test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department \ No newline at end of file | 6 | test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/3quarksdaily.com.txt b/inc/3rdparty/site_config/standard/3quarksdaily.com.txt index c4e7940f..80a3958f 100644..100755 --- a/inc/3rdparty/site_config/standard/3quarksdaily.com.txt +++ b/inc/3rdparty/site_config/standard/3quarksdaily.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@class='content'] | 1 | body: //div[@class='content'] |
2 | date: //div[@class='content']/h2 | 2 | date: //div[@class='content']/h2 |
3 | strip: //div[@class='content']/h2 | 3 | strip: //div[@class='content']/h2 |
4 | title: //div[@class='content']/h3 | 4 | title: //div[@class='content']/h3 |
5 | 5 | ||
6 | strip: //div[@id='postmenu'] | 6 | strip: //div[@id='postmenu'] |
7 | strip: //div[@class='trackback'] | 7 | strip: //div[@class='trackback'] |
8 | tidy: no | 8 | tidy: no |
9 | test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html \ No newline at end of file | 9 | test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt b/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt index b846b050..b846b050 100644..100755 --- a/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt +++ b/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt | |||
diff --git a/inc/3rdparty/site_config/standard/43folders.com.txt b/inc/3rdparty/site_config/standard/43folders.com.txt index e8073f6f..3777c66f 100644..100755 --- a/inc/3rdparty/site_config/standard/43folders.com.txt +++ b/inc/3rdparty/site_config/standard/43folders.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //*[@class = 'content'] | 1 | body: //*[@class = 'content'] |
2 | author: //*[@class = 'submitted']/a | 2 | author: //*[@class = 'submitted']/a |
3 | date: substring-after(//*[@class = 'submitted']/text(), '|') | 3 | date: substring-after(//*[@class = 'submitted']/text(), '|') |
4 | test_url: http://www.43folders.com/2011/04/22/cranking \ No newline at end of file | 4 | test_url: http://www.43folders.com/2011/04/22/cranking \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/500px.com.txt b/inc/3rdparty/site_config/standard/500px.com.txt index 68e6b2d0..b9b7e9dd 100644..100755 --- a/inc/3rdparty/site_config/standard/500px.com.txt +++ b/inc/3rdparty/site_config/standard/500px.com.txt | |||
@@ -1,27 +1,27 @@ | |||
1 | # very loose setup for both 500px.com/photo/* and 500px.com/blog/* | 1 | # very loose setup for both 500px.com/photo/* and 500px.com/blog/* |
2 | # photo page example: http://500px.com/photo/4181666 | 2 | # photo page example: http://500px.com/photo/4181666 |
3 | # blog page example: http://500px.com/blog/110 | 3 | # blog page example: http://500px.com/blog/110 |
4 | 4 | ||
5 | # avoid "no text" error | 5 | # avoid "no text" error |
6 | tidy:no | 6 | tidy:no |
7 | prune:no | 7 | prune:no |
8 | 8 | ||
9 | # reorganize photo page elements | 9 | # reorganize photo page elements |
10 | #body://div[contains(@class,'container')] | 10 | #body://div[contains(@class,'container')] |
11 | move_into(body)://div[contains(@id,'thephoto')] | 11 | move_into(body)://div[contains(@id,'thephoto')] |
12 | move_into(body)://div[contains(@id,'description')] | 12 | move_into(body)://div[contains(@id,'description')] |
13 | move_into(body)://div[contains(@id,'tags')] | 13 | move_into(body)://div[contains(@id,'tags')] |
14 | move_into(body)://div[contains(@id,'photo-info')] | 14 | move_into(body)://div[contains(@id,'photo-info')] |
15 | 15 | ||
16 | # clean photo page info | 16 | # clean photo page info |
17 | strip://span[contains(@id,'copyright')] | 17 | strip://span[contains(@id,'copyright')] |
18 | strip://*[contains(@id,'store')] | 18 | strip://*[contains(@id,'store')] |
19 | strip://*[contains(@id,'user-info')] | 19 | strip://*[contains(@id,'user-info')] |
20 | strip://*[contains(@id,'photo-stats')] | 20 | strip://*[contains(@id,'photo-stats')] |
21 | strip://*[contains(@id,'voting_controls_container')] | 21 | strip://*[contains(@id,'voting_controls_container')] |
22 | strip://*[contains(@id,'more-photos')] | 22 | strip://*[contains(@id,'more-photos')] |
23 | strip://*[contains(@id,'embed-photo')] | 23 | strip://*[contains(@id,'embed-photo')] |
24 | 24 | ||
25 | # clean blog page side bar | 25 | # clean blog page side bar |
26 | strip://*[contains(@class,'col d3 clearafter')] | 26 | strip://*[contains(@class,'col d3 clearafter')] |
27 | test_url: http://500px.com/photo/3641041?from=editors \ No newline at end of file | 27 | test_url: http://500px.com/photo/3641041?from=editors \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/512pixels.net.txt b/inc/3rdparty/site_config/standard/512pixels.net.txt index e458980f..e458980f 100644..100755 --- a/inc/3rdparty/site_config/standard/512pixels.net.txt +++ b/inc/3rdparty/site_config/standard/512pixels.net.txt | |||
diff --git a/inc/3rdparty/site_config/standard/5by5.tv.txt b/inc/3rdparty/site_config/standard/5by5.tv.txt index dce0df4e..59b70a99 100644..100755 --- a/inc/3rdparty/site_config/standard/5by5.tv.txt +++ b/inc/3rdparty/site_config/standard/5by5.tv.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //*[@id="episode"] | 1 | body: //*[@id="episode"] |
2 | prune: no | 2 | prune: no |
3 | tidy: no | 3 | tidy: no |
4 | 4 | ||
5 | autodetect_next_page: no | 5 | autodetect_next_page: no |
6 | strip_id_or_class: player | 6 | strip_id_or_class: player |
7 | 7 | ||
8 | strip://*[@id="header"] | 8 | strip://*[@id="header"] |
9 | test_url: http://5by5.tv/buildanalyze/60 \ No newline at end of file | 9 | test_url: http://5by5.tv/buildanalyze/60 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/7newsbelize.com.txt b/inc/3rdparty/site_config/standard/7newsbelize.com.txt new file mode 100755 index 00000000..46d09f8e --- /dev/null +++ b/inc/3rdparty/site_config/standard/7newsbelize.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //*[@id='sstitle'] | ||
2 | body: //div[@id='sstory'] | ||
3 | strip_id_or_class: newsoptions | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.7newsbelize.com/sstory.php?nid=25654 | ||
7 | test_url: http://www.7newsbelize.com/7news.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/944.com.txt b/inc/3rdparty/site_config/standard/944.com.txt index 84380e79..8bf6a4c2 100644..100755 --- a/inc/3rdparty/site_config/standard/944.com.txt +++ b/inc/3rdparty/site_config/standard/944.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h2[@class='border'] | 1 | title: //h2[@class='border'] |
2 | body: //div[@class='padding'] | 2 | body: //div[@class='padding'] |
3 | 3 | ||
4 | convert_double_br_tags: yes | 4 | convert_double_br_tags: yes |
5 | 5 | ||
6 | strip: //div[@id='social_sharing'] | 6 | strip: //div[@id='social_sharing'] |
7 | strip: //div[@class='socialLinks'] | 7 | strip: //div[@class='socialLinks'] |
8 | 8 | ||
9 | test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/ \ No newline at end of file | 9 | test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/README.md b/inc/3rdparty/site_config/standard/README.md new file mode 100755 index 00000000..9040ba85 --- /dev/null +++ b/inc/3rdparty/site_config/standard/README.md | |||
@@ -0,0 +1,38 @@ | |||
1 | Full-Text RSS site config files | ||
2 | ================ | ||
3 | |||
4 | [Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically. | ||
5 | |||
6 | This repository contains the site config files we use in Full-Text RSS. | ||
7 | |||
8 | ### Contributing changes | ||
9 | |||
10 | We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface. | ||
11 | |||
12 | You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model: | ||
13 | |||
14 | > The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination. | ||
15 | |||
16 | When we receive a pull request we'll review the changes and if everything's okay we'll update our copy. | ||
17 | |||
18 | If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github). | ||
19 | |||
20 | ### How to write a site config file | ||
21 | |||
22 | The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool intended only to create a rule that extracts the correct content block. | ||
23 | |||
24 | For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns). | ||
25 | |||
26 | ### Instapaper | ||
27 | |||
28 | When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users. | ||
29 | |||
30 | Marco, Instapaper's creator, graciously opened up the database of contributions to everyone: | ||
31 | |||
32 | > And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached. | ||
33 | |||
34 | Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required). | ||
35 | |||
36 | ### Testing site config files | ||
37 | |||
38 | Currently you need a copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier. | ||
diff --git a/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt b/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt index 379592e0..b60c15de 100644..100755 --- a/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt +++ b/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] | 2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] |
3 | 3 | ||
4 | strip_id_or_class: socialshareprivacy1 | 4 | strip_id_or_class: socialshareprivacy1 |
5 | strip_id_or_class: zvaFacebookButton | 5 | strip_id_or_class: zvaFacebookButton |
6 | 6 | ||
7 | tidy: no | 7 | tidy: no |
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757 \ No newline at end of file | 10 | test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt b/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt index 4d76fac7..013afa4c 100644..100755 --- a/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt +++ b/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] | 2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] |
3 | 3 | ||
4 | strip_id_or_class: socialshareprivacy1 | 4 | strip_id_or_class: socialshareprivacy1 |
5 | strip_id_or_class: zvaFacebookButton | 5 | strip_id_or_class: zvaFacebookButton |
6 | 6 | ||
7 | tidy: no | 7 | tidy: no |
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718 \ No newline at end of file | 10 | test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/abc.es.txt b/inc/3rdparty/site_config/standard/abc.es.txt index a99833de..43aadc49 100644..100755 --- a/inc/3rdparty/site_config/standard/abc.es.txt +++ b/inc/3rdparty/site_config/standard/abc.es.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text'] | 2 | body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody'] |
3 | strip_id_or_class: colB | 3 | strip_id_or_class: colB |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html \ No newline at end of file | 7 | test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/abc.net.au.txt b/inc/3rdparty/site_config/standard/abc.net.au.txt index 5e6269cb..22b3a0f4 100644..100755 --- a/inc/3rdparty/site_config/standard/abc.net.au.txt +++ b/inc/3rdparty/site_config/standard/abc.net.au.txt | |||
@@ -1,10 +1,18 @@ | |||
1 | title: //h1 | 1 | title: //div[@class='article section']//h1 |
2 | author: //div[@class="byline"]/a | 2 | author: //div[@class="byline"]/a |
3 | date: //span[@class="timestamp"] | 3 | date: //span[@class="timestamp"] |
4 | 4 | body: //div[@class="page section"] | |
5 | strip: //p[@class="topics"] | 5 | |
6 | strip: //h1 | 6 | strip: //a[@class="inline-caption"] |
7 | strip: //div[@class="byline"] | 7 | strip: //p[@class="ticker section noprint"] |
8 | strip: //p[@class="published"] | 8 | strip: //p[@class="topics"] |
9 | strip: //h1 | ||
10 | strip: //div[@class="byline"] | ||
11 | strip: //p[@class="published"] | ||
9 | strip: //div[contains(@class,"featured-scroller")] | 12 | strip: //div[contains(@class,"featured-scroller")] |
10 | test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544 \ No newline at end of file | 13 | strip_id_or_class: footer |
14 | |||
15 | tidy: no | ||
16 | |||
17 | test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892 | ||
18 | test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business | ||
diff --git a/inc/3rdparty/site_config/standard/abcnews.go.com.txt b/inc/3rdparty/site_config/standard/abcnews.go.com.txt index c515d3e4..8d367351 100644..100755 --- a/inc/3rdparty/site_config/standard/abcnews.go.com.txt +++ b/inc/3rdparty/site_config/standard/abcnews.go.com.txt | |||
@@ -1,27 +1,27 @@ | |||
1 | title: //h1[@class='headline'] | 1 | title: //h1[@class='headline'] |
2 | body: //div[@id='storyText'] | 2 | body: //div[@id='storyText'] |
3 | # for video entries | 3 | # for video entries |
4 | body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')] | 4 | body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')] |
5 | author: //div[@class='byline'] | 5 | author: //div[@class='byline'] |
6 | date: //div[@class='date'] | 6 | date: //div[@class='date'] |
7 | strip: //*[@id='date_partner'] | 7 | strip: //*[@id='date_partner'] |
8 | 8 | ||
9 | strip: //div[@class='breadcrumb'] | 9 | strip: //div[@class='breadcrumb'] |
10 | strip: //div[contains(@class,'show_tools')] | 10 | strip: //div[contains(@class,'show_tools')] |
11 | strip: //div[@id='sponsoredByAd'] | 11 | strip: //div[@id='sponsoredByAd'] |
12 | strip: //div[contains(@class,'rel_container')] | 12 | strip: //div[contains(@class,'rel_container')] |
13 | strip: //p[a[starts-with(@href, 'http://www.twitter.com')]] | 13 | strip: //p[a[starts-with(@href, 'http://www.twitter.com')]] |
14 | strip: //p[a[starts-with(@href, 'http://www.facebook.com')]] | 14 | strip: //p[a[starts-with(@href, 'http://www.facebook.com')]] |
15 | strip: //p[contains(., 'Click here to return to')] | 15 | strip: //p[contains(., 'Click here to return to')] |
16 | #strip_id_or_class: media | 16 | #strip_id_or_class: media |
17 | strip_id_or_class: mediaplayer | 17 | strip_id_or_class: mediaplayer |
18 | 18 | ||
19 | replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http | 19 | replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http |
20 | 20 | ||
21 | prune: no | 21 | prune: no |
22 | 22 | ||
23 | single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true') | 23 | single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true') |
24 | 24 | ||
25 | test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744 | 25 | test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744 |
26 | # multi-page | 26 | # multi-page |
27 | test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544 \ No newline at end of file | 27 | test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/accesstoinsight.org.txt b/inc/3rdparty/site_config/standard/accesstoinsight.org.txt index b5d85079..45d66533 100644..100755 --- a/inc/3rdparty/site_config/standard/accesstoinsight.org.txt +++ b/inc/3rdparty/site_config/standard/accesstoinsight.org.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id='H_docTitle'] | 1 | title: //div[@id='H_docTitle'] |
2 | 2 | ||
3 | body: //div[@id='H_meta' or @id='H_content' or @id='F_footer'] | 3 | body: //div[@id='H_meta' or @id='H_content' or @id='F_footer'] |
4 | 4 | ||
5 | strip_id_or_class: F_toenail | 5 | strip_id_or_class: F_toenail |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html \ No newline at end of file | 9 | test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/acidcow.com.txt b/inc/3rdparty/site_config/standard/acidcow.com.txt index 60ede6a6..21958651 100644..100755 --- a/inc/3rdparty/site_config/standard/acidcow.com.txt +++ b/inc/3rdparty/site_config/standard/acidcow.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[starts-with(@id, 'news-id-')] | 1 | body: //div[starts-with(@id, 'news-id-')] |
2 | 2 | ||
3 | test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html \ No newline at end of file | 3 | test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/acquia.com.txt b/inc/3rdparty/site_config/standard/acquia.com.txt index 5ddf542e..2803611f 100644..100755 --- a/inc/3rdparty/site_config/standard/acquia.com.txt +++ b/inc/3rdparty/site_config/standard/acquia.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title://h1[@class="title"] | 1 | title://h1[@class="title"] |
2 | author://div[@class="submitted"]/span/a | 2 | author://div[@class="submitted"]/span/a |
3 | date://div[@class="submitted"]/span | 3 | date://div[@class="submitted"]/span |
4 | body://div[@class="content-wrapper"] | 4 | body://div[@class="content-wrapper"] |
5 | 5 | ||
6 | strip://div[@id="skip-link"] | 6 | strip://div[@id="skip-link"] |
7 | strip://div[@id="region-content-3-3"] | 7 | strip://div[@id="region-content-3-3"] |
8 | strip://div[@id="section-footer"] | 8 | strip://div[@id="section-footer"] |
9 | test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code \ No newline at end of file | 9 | test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/acroswing.fr.txt b/inc/3rdparty/site_config/standard/acroswing.fr.txt index 57d86d2f..6b1d67fe 100644..100755 --- a/inc/3rdparty/site_config/standard/acroswing.fr.txt +++ b/inc/3rdparty/site_config/standard/acroswing.fr.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | tidy:no | 1 | tidy:no |
2 | date: //time[@class='updated'] | 2 | date: //time[@class='updated'] |
3 | dissolve: //ul[@class='video-gallery']/li | 3 | dissolve: //ul[@class='video-gallery']/li |
4 | dissolve: //ul[@class='video-gallery'] | 4 | dissolve: //ul[@class='video-gallery'] |
5 | test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php \ No newline at end of file | 5 | test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/aftenposten.no.txt b/inc/3rdparty/site_config/standard/aftenposten.no.txt new file mode 100755 index 00000000..8a69c357 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aftenposten.no.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1[@class='articleTitle '] | ||
2 | body: //div[@class='bodyText widget storyContent'] | ||
3 | strip: //p/span[@class='quote']/.. | ||
4 | strip_id_or_class: 'pull1' | ||
5 | test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aftonbladet.se.txt b/inc/3rdparty/site_config/standard/aftonbladet.se.txt new file mode 100755 index 00000000..b6c576a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aftonbladet.se.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | author: //article//address[contains(@class, 'author')] | ||
2 | body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')] | ||
3 | |||
4 | strip: //address//img | ||
5 | strip: //footer | ||
6 | strip_id_or_class: abSticky | ||
7 | |||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab | ||
11 | test_url: http://www.aftonbladet.se/debatt/article16207536.ab | ||
12 | test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab | ||
13 | test_url: http://www.aftonbladet.se/rss.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt b/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt index 408e9099..b2d88a05 100644..100755 --- a/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt +++ b/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | 2 | ||
3 | # clean up recipe pages | 3 | # clean up recipe pages |
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | 4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] |
5 | 5 | ||
6 | #recipe pages | 6 | #recipe pages |
7 | strip_id_or_class: "recipe-feedback" | 7 | strip_id_or_class: "recipe-feedback" |
8 | strip_id_or_class: "comments" | 8 | strip_id_or_class: "comments" |
9 | strip_id_or_class: "procedure-number" | 9 | strip_id_or_class: "procedure-number" |
10 | strip_id_or_class: "more-with-author" | 10 | strip_id_or_class: "more-with-author" |
11 | 11 | ||
12 | #slice | 12 | #slice |
13 | strip_id_or_class: "inner" | 13 | strip_id_or_class: "inner" |
14 | 14 | ||
15 | test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html \ No newline at end of file | 15 | test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/albayan.ae.txt b/inc/3rdparty/site_config/standard/albayan.ae.txt new file mode 100755 index 00000000..f6c093d2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/albayan.ae.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='main-column']//div[@class='content'] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645 | ||
6 | test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alex.mullr.net.txt b/inc/3rdparty/site_config/standard/alex.mullr.net.txt index c5f15370..c5f15370 100644..100755 --- a/inc/3rdparty/site_config/standard/alex.mullr.net.txt +++ b/inc/3rdparty/site_config/standard/alex.mullr.net.txt | |||
diff --git a/inc/3rdparty/site_config/standard/alexduner.com.txt b/inc/3rdparty/site_config/standard/alexduner.com.txt new file mode 100755 index 00000000..bd9de9d7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alexduner.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //section[@class='content'] | ||
2 | date: //span[1] | ||
3 | author: //h1[@id='sitetitle'] | ||
4 | test_url: https://alexduner.com/blog/2013/1/something-i-learned-today \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alexduner.squarespace.com.txt b/inc/3rdparty/site_config/standard/alexduner.squarespace.com.txt new file mode 100755 index 00000000..875405e4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alexduner.squarespace.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //section[@class='content'] | ||
2 | date: //span[1] | ||
3 | author: //h1[@id='sitetitle'] | ||
4 | test_url: https://alexduner.squarespace.com/blog/2013/1/tech-culture-from-the-outside-looking-in \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alistapart.com.txt b/inc/3rdparty/site_config/standard/alistapart.com.txt index 090f7eb1..7a7096e2 100644..100755 --- a/inc/3rdparty/site_config/standard/alistapart.com.txt +++ b/inc/3rdparty/site_config/standard/alistapart.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h1[@class='title'] | 1 | title: //h1[@class='title'] |
2 | author: //h3[@class='byline']/a | 2 | author: //h3[@class='byline']/a |
3 | date: //div[@class='ishinfo'] | 3 | date: //div[@class='ishinfo'] |
4 | 4 | ||
5 | body: //*[@id='articletext'] | 5 | body: //*[@id='articletext'] |
6 | strip_id_or_class: 'ishinfo' | 6 | strip_id_or_class: 'ishinfo' |
7 | strip_id_or_class: 'metastuff' | 7 | strip_id_or_class: 'metastuff' |
8 | strip_id_or_class: 'learnmore' | 8 | strip_id_or_class: 'learnmore' |
9 | strip_id_or_class: 'discuss' | 9 | strip_id_or_class: 'discuss' |
10 | 10 | ||
11 | prune: no | 11 | prune: no |
12 | test_url: http://www.alistapart.com/articles/organizing-mobile/ \ No newline at end of file | 12 | test_url: http://www.alistapart.com/articles/organizing-mobile/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/aljazeera.com.txt b/inc/3rdparty/site_config/standard/aljazeera.com.txt index 4f0148f4..d3bf4014 100644..100755 --- a/inc/3rdparty/site_config/standard/aljazeera.com.txt +++ b/inc/3rdparty/site_config/standard/aljazeera.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //span[@id='DetailedTitle'] | 1 | title: //span[@id='DetailedTitle'] |
2 | body: //td[@id='tdTextContent'] | 2 | body: //td[@id='tdTextContent'] |
3 | strip_id_or_class: Skyscrapper_Body | 3 | strip_id_or_class: Skyscrapper_Body |
4 | date: //span[@id='ctl00_cphBody_lblDate'] | 4 | date: //span[@id='ctl00_cphBody_lblDate'] |
5 | author: //div[@id="dvAuthorInfo"]//a/text() | 5 | author: //div[@id="dvAuthorInfo"]//a/text() |
6 | strip: //table[ tbody/tr/td/object ] | 6 | strip: //table[ tbody/tr/td/object ] |
7 | prune: no | 7 | prune: no |
8 | test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html \ No newline at end of file | 8 | test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/allrecipes.com.txt b/inc/3rdparty/site_config/standard/allrecipes.com.txt index e9767bda..85dc2a5a 100644..100755 --- a/inc/3rdparty/site_config/standard/allrecipes.com.txt +++ b/inc/3rdparty/site_config/standard/allrecipes.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //h1[@id='itemTitle'] | 1 | title: //h1[@id='itemTitle'] |
2 | body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')] | 2 | body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')] |
3 | strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right'] | 3 | strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right'] |
4 | strip: //div[contains(@class, 'rightcoltoolsdiv')] | 4 | strip: //div[contains(@class, 'rightcoltoolsdiv')] |
5 | strip: //div[contains(@class, 'servings-form')] | 5 | strip: //div[contains(@class, 'servings-form')] |
6 | strip: //p[@class='nutritional-information'] | 6 | strip: //p[@class='nutritional-information'] |
7 | strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')] | 7 | strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')] |
8 | strip: //div[@id='nutri-info']/div[contains(@class, 'title')] | 8 | strip: //div[@id='nutri-info']/div[contains(@class, 'title')] |
9 | strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter'] | 9 | strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter'] |
10 | strip_id_or_class: eshaAttribute | 10 | strip_id_or_class: eshaAttribute |
11 | strip_id_or_class: eshaParagraph | 11 | strip_id_or_class: eshaParagraph |
12 | prune: no | 12 | prune: no |
13 | 13 | ||
14 | test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd \ No newline at end of file | 14 | test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/allthingsd.com.txt b/inc/3rdparty/site_config/standard/allthingsd.com.txt index cd52498f..f8c67d02 100644..100755 --- a/inc/3rdparty/site_config/standard/allthingsd.com.txt +++ b/inc/3rdparty/site_config/standard/allthingsd.com.txt | |||
@@ -1,10 +1,13 @@ | |||
1 | title://div[@class="article-title"]/h1[@class="title"] | 1 | title://div[@class="article-title"]/h1[@class="title"] |
2 | date: //p[@class="article-date"] | 2 | date: //p[@class="article-date"] |
3 | body://*[@class="article-body article-text"] | 3 | body://div[contains(@class, "article-body")] |
4 | # Trim out related posts at bottom of article | 4 | # Trim out related posts at bottom of article |
5 | strip://blockquote[@class="memo"] | 5 | strip://blockquote[@class="memo"] |
6 | 6 | ||
7 | # Yup, no idea why author won't work... | 7 | tidy: no |
8 | author://div[@class="page-header article-header clearfix"]/p[@class="title"] | 8 | |
9 | # Yup, no idea why author won't work... | ||
10 | author://div[@class="page-header article-header clearfix"]/p[@class="title"] | ||
9 | # [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it. | 11 | # [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it. |
10 | test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/ \ No newline at end of file | 12 | test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/ |
13 | test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/allyou.com.txt b/inc/3rdparty/site_config/standard/allyou.com.txt index 3c26c682..a13a7252 100644..100755 --- a/inc/3rdparty/site_config/standard/allyou.com.txt +++ b/inc/3rdparty/site_config/standard/allyou.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@id='pageHdr']//h1 | 1 | title: //div[@id='pageHdr']//h1 |
2 | body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint'] | 2 | body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint'] |
3 | strip: //div[contains(@class, 'infoBox') or @id='infoBox'] | 3 | strip: //div[contains(@class, 'infoBox') or @id='infoBox'] |
4 | single_page_link: //li[@id='print']/a | 4 | single_page_link: //li[@id='print']/a |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | 7 | ||
8 | test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/ \ No newline at end of file | 8 | test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt b/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt index f5865f89..da1a67bc 100644..100755 --- a/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt +++ b/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class = 'entry'] | 1 | body: //div[@class = 'entry'] |
2 | date: substring-after(//p[@class="date"],'بتاريخ ') | 2 | date: substring-after(//p[@class="date"],'بتاريخ ') |
3 | strip_id_or_class: date | 3 | strip_id_or_class: date |
4 | strip_id_or_class: follow-single | 4 | strip_id_or_class: follow-single |
5 | strip_id_or_class: ratingblock | 5 | strip_id_or_class: ratingblock |
6 | strip_id_or_class: newRatingHolder | 6 | strip_id_or_class: newRatingHolder |
7 | strip_id_or_class: postmetadata | 7 | strip_id_or_class: postmetadata |
8 | strip_id_or_class: addthis_toolbox | 8 | strip_id_or_class: addthis_toolbox |
9 | strip_id_or_class: addthis_default_style | 9 | strip_id_or_class: addthis_default_style |
10 | strip_id_or_class: size-full | 10 | strip_id_or_class: size-full |
11 | test_url: http://alphabeta.argaam.com/?p=35657 \ No newline at end of file | 11 | test_url: http://alphabeta.argaam.com/?p=35657 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/alriyadh.com.txt b/inc/3rdparty/site_config/standard/alriyadh.com.txt index d0060000..be7c43d5 100644..100755 --- a/inc/3rdparty/site_config/standard/alriyadh.com.txt +++ b/inc/3rdparty/site_config/standard/alriyadh.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@id = "article-view"] | 1 | body: //div[@id = "article-view"] |
2 | body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')] | 2 | body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')] |
3 | author: //p[@class = "author"] | 3 | author: //p[@class = "author"] |
4 | strip: //h1 | 4 | strip: //h1 |
5 | strip: //h2 | 5 | strip: //h2 |
6 | strip_id_or_class: author | 6 | strip_id_or_class: author |
7 | prune: no | 7 | prune: no |
8 | test_url: http://www.alriyadh.com/2011/10/10/article674357.html | 8 | test_url: http://www.alriyadh.com/2011/10/10/article674357.html |
9 | test_url: http://www.alriyadh.com/net/article/780935 \ No newline at end of file | 9 | test_url: http://www.alriyadh.com/net/article/780935 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/alseraj.net.txt b/inc/3rdparty/site_config/standard/alseraj.net.txt index 107d82d6..107d82d6 100644..100755 --- a/inc/3rdparty/site_config/standard/alseraj.net.txt +++ b/inc/3rdparty/site_config/standard/alseraj.net.txt | |||
diff --git a/inc/3rdparty/site_config/standard/alt1040.com.txt b/inc/3rdparty/site_config/standard/alt1040.com.txt index 4fd45719..4fd45719 100644..100755 --- a/inc/3rdparty/site_config/standard/alt1040.com.txt +++ b/inc/3rdparty/site_config/standard/alt1040.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/alternet.org.txt b/inc/3rdparty/site_config/standard/alternet.org.txt new file mode 100755 index 00000000..e92252eb --- /dev/null +++ b/inc/3rdparty/site_config/standard/alternet.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')] | ||
2 | |||
3 | test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest | ||
4 | test_url: http://feeds.feedblitz.com/alternet \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/altfoto.com.txt b/inc/3rdparty/site_config/standard/altfoto.com.txt index d974cf4a..d974cf4a 100644..100755 --- a/inc/3rdparty/site_config/standard/altfoto.com.txt +++ b/inc/3rdparty/site_config/standard/altfoto.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt b/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt index 7fd47193..a5bd03bf 100644..100755 --- a/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt +++ b/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ") | 3 | author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ") |
4 | 4 | ||
5 | date: //div/a[contains (@href, "issue")] | 5 | date: //div/a[contains (@href, "issue")] |
6 | 6 | ||
7 | move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1] | 7 | move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1] |
8 | 8 | ||
9 | body: //div[@class="enableBullets"] | 9 | body: //div[@class="enableBullets"] |
10 | test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819 \ No newline at end of file | 10 | test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/amandala.com.bz.txt b/inc/3rdparty/site_config/standard/amandala.com.bz.txt new file mode 100755 index 00000000..fb0e21b8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/amandala.com.bz.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='content']//div[contains(@class, 'content')] | ||
2 | strip_id_or_class: widget | ||
3 | strip: //a[contains(@href, 'upm_export=')] | ||
4 | |||
5 | test_url: http://amandala.com.bz/news/feed/ | ||
6 | test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/amazon.com.txt b/inc/3rdparty/site_config/standard/amazon.com.txt index 1a23c4b7..cd7ad159 100644..100755 --- a/inc/3rdparty/site_config/standard/amazon.com.txt +++ b/inc/3rdparty/site_config/standard/amazon.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //span[@id = 'btAsinTitle'] | 1 | title: //span[@id = 'btAsinTitle'] |
2 | body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div | 2 | body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div |
3 | #strip_id_or_class: quantityDropdownDiv | 3 | #strip_id_or_class: quantityDropdownDiv |
4 | #strip_id_or_class: addToCartSpan | 4 | #strip_id_or_class: addToCartSpan |
5 | #strip_id_or_class: oneClickDiv | 5 | #strip_id_or_class: oneClickDiv |
6 | strip_id_or_class: nocontent | 6 | strip_id_or_class: nocontent |
7 | strip_id_or_class: masDynamicConten | 7 | strip_id_or_class: masDynamicConten |
8 | strip_id_or_class: dynamic-content | 8 | strip_id_or_class: dynamic-content |
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | find_string: <span id="actualPriceValue"> | 11 | find_string: <span id="actualPriceValue"> |
12 | replace_string: <span id="actualPriceValue"><br />Price: | 12 | replace_string: <span id="actualPriceValue"><br />Price: |
13 | 13 | ||
14 | strip_id_or_class: collapsePS | 14 | strip_id_or_class: collapsePS |
15 | strip_id_or_class: expandPS | 15 | strip_id_or_class: expandPS |
16 | strip_id_or_class: psPlaceHolde | 16 | strip_id_or_class: psPlaceHolde |
17 | strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')] | 17 | strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')] |
18 | 18 | ||
19 | test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/ \ No newline at end of file | 19 | test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/americandrink.net.txt b/inc/3rdparty/site_config/standard/americandrink.net.txt index dee0e868..7145f3ff 100644..100755 --- a/inc/3rdparty/site_config/standard/americandrink.net.txt +++ b/inc/3rdparty/site_config/standard/americandrink.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='head']/h2/a | 1 | title: //div[@class='head']/h2/a |
2 | author: //div[@class='head']/a | 2 | author: //div[@class='head']/a |
3 | date: //div[@class='head']/p[@class='date']/a | 3 | date: //div[@class='head']/p[@class='date']/a |
4 | body: //div[@class='copy'] | 4 | body: //div[@class='copy'] |
5 | strip: //p[@class='meta'] | 5 | strip: //p[@class='meta'] |
6 | test_url: http://americandrink.net/post/10567188712/free-the-hooch \ No newline at end of file | 6 | test_url: http://americandrink.net/post/10567188712/free-the-hooch \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/americascup.com.txt b/inc/3rdparty/site_config/standard/americascup.com.txt index b1673b6a..31723f81 100644..100755 --- a/inc/3rdparty/site_config/standard/americascup.com.txt +++ b/inc/3rdparty/site_config/standard/americascup.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //div[@class="editorial-content"]/h3 | 1 | title: //div[@class="editorial-content"]/h3 |
2 | body: //div[@class="hero-image" or @class="editorial-content"] | 2 | body: //div[@class="hero-image" or @class="editorial-content"] |
3 | 3 | ||
4 | strip: //ul[@class="hero-caption"] | 4 | strip: //ul[@class="hero-caption"] |
5 | strip_id_or_class: footer | 5 | strip_id_or_class: footer |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | tidy: no | 8 | tidy: no |
9 | 9 | ||
10 | test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/ \ No newline at end of file | 10 | test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt b/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt index 8bf31ec2..c2b62b5a 100644..100755 --- a/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt +++ b/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h1[@class="post-title"] | 1 | title: //h1[@class="post-title"] |
2 | author: //span[@class="author"]/a | 2 | author: //span[@class="author"]/a |
3 | date: //span[@class="date"] | 3 | date: //span[@class="date"] |
4 | body: //div[@class="post-content main"] | 4 | body: //div[@class="post-content main"] |
5 | test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/ \ No newline at end of file | 5 | test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/amptoons.com.txt b/inc/3rdparty/site_config/standard/amptoons.com.txt new file mode 100755 index 00000000..87547c63 --- /dev/null +++ b/inc/3rdparty/site_config/standard/amptoons.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //title | ||
2 | |||
3 | body: //div[@class="entry-content"] | ||
4 | |||
5 | author: //span[@class="author vcard"] | ||
6 | |||
7 | date: //span[@class="entry-date"] | ||
8 | test_url: http://www.amptoons.com/blog/2013/03/14/open-thread-and-link-farm-i-hate-being-sick-edition/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/anandtech.com.txt b/inc/3rdparty/site_config/standard/anandtech.com.txt index 8067e03c..7d804918 100644..100755 --- a/inc/3rdparty/site_config/standard/anandtech.com.txt +++ b/inc/3rdparty/site_config/standard/anandtech.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | author: //a[@class='b'][1] | 1 | author: //a[@class='b'][1] |
2 | date: substring-after(substring-before(//div, 'Posted in'), ' on ') | 2 | date: substring-after(substring-before(//div, 'Posted in'), ' on ') |
3 | strip_image_src: /content/images/globals/ | 3 | strip_image_src: /content/images/globals/ |
4 | strip: //h2[. = 'Page 1']/preceding::p | 4 | strip: //h2[. = 'Page 1']/preceding::p |
5 | strip: //h2 | 5 | strip: //h2 |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) | 9 | single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) |
10 | 10 | ||
11 | test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ \ No newline at end of file | 11 | test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/androidpolice.com.txt b/inc/3rdparty/site_config/standard/androidpolice.com.txt new file mode 100755 index 00000000..8f9b1a21 --- /dev/null +++ b/inc/3rdparty/site_config/standard/androidpolice.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='post_content'] | ||
2 | date: //div[@class='date_day'] | div[@class='date_month'] | ||
3 | |||
4 | test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/ | ||
5 | |||
diff --git a/inc/3rdparty/site_config/standard/andyrutledge.com.txt b/inc/3rdparty/site_config/standard/andyrutledge.com.txt index f9ffd3c3..ce31fcf5 100644..100755 --- a/inc/3rdparty/site_config/standard/andyrutledge.com.txt +++ b/inc/3rdparty/site_config/standard/andyrutledge.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | author: string('Andy Rutledge') | 2 | author: string('Andy Rutledge') |
3 | date: //div[@class='articledate'] | 3 | date: //div[@class='articledate'] |
4 | body: //div[@class='copybody'] | 4 | body: //div[@class='copybody'] |
5 | 5 | ||
6 | strip: //*[@class='space'] | 6 | strip: //*[@class='space'] |
7 | strip: //*[@class='articleFoot'] | 7 | strip: //*[@class='articleFoot'] |
8 | 8 | ||
9 | test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php \ No newline at end of file | 9 | test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt b/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt index a5c7c08a..2d8937f7 100644..100755 --- a/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt +++ b/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h1[@class="title"] | 1 | title: //h1[@class="title"] |
2 | 2 | ||
3 | author: ("Anna Manasova") | 3 | author: ("Anna Manasova") |
4 | # is ignored, unfortunately | 4 | # is ignored, unfortunately |
5 | 5 | ||
6 | date: //p[@class="date"] | 6 | date: //p[@class="date"] |
7 | 7 | ||
8 | body: //div[@class="entry"] | 8 | body: //div[@class="entry"] |
9 | test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/ \ No newline at end of file | 9 | test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/applature.com.txt b/inc/3rdparty/site_config/standard/applature.com.txt index a78a6150..a820bba4 100644..100755 --- a/inc/3rdparty/site_config/standard/applature.com.txt +++ b/inc/3rdparty/site_config/standard/applature.com.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | title: //h1[contains(@class, 'title')# | 1 | title: //h1[contains(@class, 'title')# |
2 | body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer'] | 2 | body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer'] |
3 | date: //div[@class='date'] | 3 | date: //div[@class='date'] |
4 | 4 | ||
5 | strip_id_or_class: sharethis | 5 | strip_id_or_class: sharethis |
6 | strip_id_or_class: stats | 6 | strip_id_or_class: stats |
7 | strip_id_or_class: apply_form | 7 | strip_id_or_class: apply_form |
8 | strip_id_or_class: job_map | 8 | strip_id_or_class: job_map |
9 | strip_id_or_class: respond | 9 | strip_id_or_class: respond |
10 | strip: //h1//span[@class='type'] | 10 | strip: //h1//span[@class='type'] |
11 | strip: //li[@class='print' or @class='map'] | 11 | strip: //li[@class='print' or @class='map'] |
12 | 12 | ||
13 | replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla | 13 | replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla |
14 | 14 | ||
15 | prune: no | 15 | prune: no |
16 | tidy: no | 16 | tidy: no |
17 | 17 | ||
18 | test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/ \ No newline at end of file | 18 | test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/apple.com.txt b/inc/3rdparty/site_config/standard/apple.com.txt index 4c483955..a54dccc8 100644..100755 --- a/inc/3rdparty/site_config/standard/apple.com.txt +++ b/inc/3rdparty/site_config/standard/apple.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | strip: //p[@class='sosumi'] | 1 | strip: //p[@class='sosumi'] |
2 | # Aren't they witty? | 2 | # Aren't they witty? |
3 | 3 | ||
4 | # I can't work out what causes the  before the title. | 4 | # I can't work out what causes the  before the title. |
5 | title: //h1[@class='title'] | 5 | title: //h1[@class='title'] |
6 | strip: //h1[@class='title'] | 6 | strip: //h1[@class='title'] |
7 | test_url: http://www.apple.com/pr/library/2011/02/15appstore.html \ No newline at end of file | 7 | test_url: http://www.apple.com/pr/library/2011/02/15appstore.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/appledaily.com.tw.txt b/inc/3rdparty/site_config/standard/appledaily.com.tw.txt new file mode 100755 index 00000000..82d6f376 --- /dev/null +++ b/inc/3rdparty/site_config/standard/appledaily.com.tw.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[contains(@class, 'articulum')] | ||
2 | |||
3 | test_url: http://www.appledaily.com.tw/realtimenews/article/new/20140120/330479 | ||
4 | test_url: http://www.appledaily.com.tw/rss/create/kind/rnews/type/new/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/appleinsider.com.txt b/inc/3rdparty/site_config/standard/appleinsider.com.txt index 279fbce1..5ae1050b 100644..100755 --- a/inc/3rdparty/site_config/standard/appleinsider.com.txt +++ b/inc/3rdparty/site_config/standard/appleinsider.com.txt | |||
@@ -1,11 +1,23 @@ | |||
1 | title: //p[@class='title'] | 1 | title: //h1[@class="art-head"] |
2 | 2 | ||
3 | author: //p[text() = 'By ']/a/text() | 3 | author: //p[contains(@class, 'byline')]/a |
4 | strip: //p[text() = 'By '] | 4 | #author: //p[text() = 'By ']/a/text() |
5 | 5 | #strip: //p[text() = 'By '] | |
6 | body: //td[@class='bod'] | 6 | |
7 | strip_id_or_class: title | 7 | date: //p[contains(@class, 'date-header')] |
8 | strip_id_or_class: minor | 8 | |
9 | 9 | body: //div[@class="article"] | |
10 | strip_id_or_class: multipagefooter | 10 | strip_id_or_class: lazy |
11 | test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html \ No newline at end of file | 11 | #strip_id_or_class: minor |
12 | strip_id_or_class: multipagefooter | ||
13 | strip_id_or_class: date-header | ||
14 | strip_id_or_class: byline | ||
15 | |||
16 | find_string: <noscript> | ||
17 | replace_string: <div> | ||
18 | find_string: </noscript> | ||
19 | replace_string: </div> | ||
20 | |||
21 | test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html | ||
22 | test_url: http://appleinsider.com/articles/13/10/03/goldee-companion-app-for-philips-hue-bulbs-offers-shifting-dynamic-light-scenes | ||
23 | test_url: http://appleinsider.com/appleinsider.rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/appleweblog.com.txt b/inc/3rdparty/site_config/standard/appleweblog.com.txt index 023c9ccb..023c9ccb 100644..100755 --- a/inc/3rdparty/site_config/standard/appleweblog.com.txt +++ b/inc/3rdparty/site_config/standard/appleweblog.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/archdaily.com.txt b/inc/3rdparty/site_config/standard/archdaily.com.txt index 9476cf56..0178639e 100644..100755 --- a/inc/3rdparty/site_config/standard/archdaily.com.txt +++ b/inc/3rdparty/site_config/standard/archdaily.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | date: //div[@class='post_date'] | 1 | date: //div[@class='post_date'] |
2 | 2 | ||
3 | body: //div[@class='post_content'] | 3 | body: //div[@class='post_content'] |
4 | 4 | ||
5 | test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up \ No newline at end of file | 5 | test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/archiveofourown.org.txt b/inc/3rdparty/site_config/standard/archiveofourown.org.txt index 50ff632d..579de517 100644..100755 --- a/inc/3rdparty/site_config/standard/archiveofourown.org.txt +++ b/inc/3rdparty/site_config/standard/archiveofourown.org.txt | |||
@@ -1,18 +1,22 @@ | |||
1 | # Description: Fix XPaths to include ALL chapters on 'view_full_work' pages. | 1 | # Description: Fix XPaths to include ALL chapters on 'view_full_work' pages. |
2 | # Include: work meta, summary, chapter information, and notes which Instapaper strips out on default. | 2 | # Include: work meta, summary, chapter information, and notes which Instapaper strips out on default. |
3 | # Exclude: header, footer, navigation, comments. | 3 | # Exclude: header, footer, navigation, comments. |
4 | # Notes: User is a newbie with XPaths. | 4 | # Notes: User is a newbie with XPaths. |
5 | 5 | ||
6 | title: //h2[@class='title'] | 6 | title: //h2[@class='title'] |
7 | author: //h3[@class='byline'] | 7 | author: //h3[@class='byline'] |
8 | author: //a[@class='login author'] | 8 | author: //a[@class='login author'] |
9 | 9 | ||
10 | strip_id_or_class:header | 10 | strip_id_or_class:header |
11 | strip_id_or_class:navigation | 11 | strip_id_or_class:navigation |
12 | strip_id_or_class:feedback | 12 | strip_id_or_class:feedback |
13 | strip_id_or_class:kudos | 13 | strip_id_or_class:kudos |
14 | strip_id_or_class:add_comment_placeholder | 14 | strip_id_or_class:add_comment_placeholder |
15 | strip_id_or_class:add_comment | 15 | strip_id_or_class:add_comment |
16 | strip_id_or_class:globalize | 16 | strip_id_or_class:globalize |
17 | strip_id_or_class:footer | 17 | strip_id_or_class:footer |
18 | test_url: http://archiveofourown.org/works/229402?view_full_work=true \ No newline at end of file | 18 | |
19 | single_page_link: //div[@id='main']//a[contains(@href, 'view_adult=true')] | ||
20 | |||
21 | test_url: http://archiveofourown.org/works/229402?view_full_work=true | ||
22 | test_url: http://archiveofourown.org/works/750111/chapters/1399929 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/arstechnica.com.txt b/inc/3rdparty/site_config/standard/arstechnica.com.txt index 49bb3dbc..767f6800 100644..100755 --- a/inc/3rdparty/site_config/standard/arstechnica.com.txt +++ b/inc/3rdparty/site_config/standard/arstechnica.com.txt | |||
@@ -1,16 +1,17 @@ | |||
1 | author: //p[@class='byline']/a | 1 | author: //p[@class='byline']/a |
2 | body: //div[contains(@class,'article-content')] | 2 | body: //div[contains(@class,'article-content')] |
3 | strip: //h2[@class='title'] | 3 | strip: //h2[@class='title'] |
4 | strip_id_or_class: byline | 4 | strip_id_or_class: byline |
5 | prune: no | 5 | strip_id_or_class: story-sidebar |
6 | 6 | prune: no | |
7 | date: //div[@class='byline']/span[@class='posted']//abbr/@original-title | 7 | |
8 | date: //div[@class='byline']/span[@class='posted']//abbr | 8 | date: //div[@class='byline']/span[@class='posted']//abbr/@original-title |
9 | 9 | date: //div[@class='byline']/span[@class='posted']//abbr | |
10 | title: //div[@id='story']//h2[@class='title'] | 10 | |
11 | 11 | title: //div[@id='story']//h2[@class='title'] | |
12 | strip: //div[@class='pager'] | 12 | |
13 | next_page_link: //nav//a[span/@class='next']/@href | 13 | strip: //div[@class='pager'] |
14 | 14 | next_page_link: //nav//a[span/@class='next']/@href | |
15 | test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars | 15 | |
16 | test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ \ No newline at end of file | 16 | test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars |
17 | test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ | ||
diff --git a/inc/3rdparty/site_config/standard/articles.boston.com.txt b/inc/3rdparty/site_config/standard/articles.boston.com.txt index e54423be..73bcdb4e 100644..100755 --- a/inc/3rdparty/site_config/standard/articles.boston.com.txt +++ b/inc/3rdparty/site_config/standard/articles.boston.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1 | 1 | title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1 |
2 | author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ") | 2 | author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ") |
3 | date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"] | 3 | date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"] |
4 | 4 | ||
5 | strip_id_or_class: mod-pagination | 5 | strip_id_or_class: mod-pagination |
6 | test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park \ No newline at end of file | 6 | test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/articles.courant.com.txt b/inc/3rdparty/site_config/standard/articles.courant.com.txt index a08f2041..984d81de 100644..100755 --- a/inc/3rdparty/site_config/standard/articles.courant.com.txt +++ b/inc/3rdparty/site_config/standard/articles.courant.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1 | 1 | title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1 |
2 | date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"] | 2 | date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"] |
3 | author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3] | 3 | author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3] |
4 | 4 | ||
5 | strip_id_or_class: mod-article-byline | 5 | strip_id_or_class: mod-article-byline |
6 | strip_id_or_class: mod-article-header | 6 | strip_id_or_class: mod-article-header |
7 | strip_id_or_class: mod-article-subtitle | 7 | strip_id_or_class: mod-article-subtitle |
8 | #This leaves some crud after the article, but it's better than nothing. | 8 | #This leaves some crud after the article, but it's better than nothing. |
9 | #It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element. | 9 | #It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element. |
10 | 10 | ||
11 | test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown \ No newline at end of file | 11 | test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/articles.washingtonpost.com.txt b/inc/3rdparty/site_config/standard/articles.washingtonpost.com.txt new file mode 100755 index 00000000..a76c2d02 --- /dev/null +++ b/inc/3rdparty/site_config/standard/articles.washingtonpost.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[contains(@class, "article_body")] | ||
2 | # print view | ||
3 | body: //div[@id='print_facet']//div[@id='body'] | ||
4 | |||
5 | tidy: no | ||
6 | prune: no | ||
7 | |||
8 | single_page_link: concat(substring-before(//div[@id="echo_container_a"]/@guid, '_story.html'), '_print.html') | ||
9 | |||
10 | test_url: http://articles.washingtonpost.com/2011-10-22/world/35279694_1_germany-acts-german-leaders-chancellor-angela-merkel | ||
11 | test_url: http://articles.washingtonpost.com/2013-05-31/opinions/39658000_1_chemical-weapons-mass-destruction-cartels \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/asahi.com.txt b/inc/3rdparty/site_config/standard/asahi.com.txt index 2562edb9..b4eec7bd 100644..100755 --- a/inc/3rdparty/site_config/standard/asahi.com.txt +++ b/inc/3rdparty/site_config/standard/asahi.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='HeadLine'] | 1 | body: //div[@id='HeadLine'] |
2 | strip: //div[@id='utility_right'] | 2 | strip: //div[@id='utility_right'] |
3 | test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html \ No newline at end of file | 3 | test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ascarter.net.txt b/inc/3rdparty/site_config/standard/ascarter.net.txt index 5236d09e..0327e846 100644..100755 --- a/inc/3rdparty/site_config/standard/ascarter.net.txt +++ b/inc/3rdparty/site_config/standard/ascarter.net.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h1[@class='article_title'] | 1 | title: //h1[@class='article_title'] |
2 | author: //span[@class='author'] | 2 | author: //span[@class='author'] |
3 | date: //h2[@class='dateline'] | 3 | date: //h2[@class='dateline'] |
4 | body: //div[@class='article_body'] | 4 | body: //div[@class='article_body'] |
5 | test_url: http://ascarter.net/2012/02/20/enough-is-enough.html \ No newline at end of file | 5 | test_url: http://ascarter.net/2012/02/20/enough-is-enough.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/astronews.com.txt b/inc/3rdparty/site_config/standard/astronews.com.txt index 33e8153d..8de22270 100644..100755 --- a/inc/3rdparty/site_config/standard/astronews.com.txt +++ b/inc/3rdparty/site_config/standard/astronews.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //span[@class='titel'] | 1 | title: //span[@class='titel'] |
2 | author: //span[@class='metadaten_C']/a//span[@class='metadaten_C'] | 2 | author: //span[@class='metadaten_C']/a//span[@class='metadaten_C'] |
3 | date: substring-after(//span[@class='metadaten_C'],'astronews.com') | 3 | date: substring-after(//span[@class='metadaten_C'],'astronews.com') |
4 | strip: //span[@class='bu'] | 4 | strip: //span[@class='bu'] |
5 | strip_image_src: '/_images/' | 5 | strip_image_src: '/_images/' |
6 | 6 | ||
7 | test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml \ No newline at end of file | 7 | test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/asymco.com.txt b/inc/3rdparty/site_config/standard/asymco.com.txt index adad5f18..f639b048 100644..100755 --- a/inc/3rdparty/site_config/standard/asymco.com.txt +++ b/inc/3rdparty/site_config/standard/asymco.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | # Johannes Stühler | 1 | # Johannes Stühler |
2 | 2 | ||
3 | title://h2 | 3 | title://h2 |
4 | author://span[@class='meta-content'] | 4 | author://span[@class='meta-content'] |
5 | date://abbr[@class='date published']/@title | 5 | date://abbr[@class='date published']/@title |
6 | body://div[@class='entry-content'] | 6 | body://div[@class='entry-content'] |
7 | 7 | ||
8 | test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/ \ No newline at end of file | 8 | test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/autoblog.com.txt b/inc/3rdparty/site_config/standard/autoblog.com.txt index 58681bf9..291db992 100644..100755 --- a/inc/3rdparty/site_config/standard/autoblog.com.txt +++ b/inc/3rdparty/site_config/standard/autoblog.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | prune: no | 1 | prune: no |
2 | body: //div[@class='post-body'] | 2 | body: //div[@class='post-body'] |
3 | author: //p[@class='byline']//a | 3 | author: //p[@class='byline']//a |
4 | date: substring-after(//div[@class='about']/p[2], 'Posted') | 4 | date: substring-after(//div[@class='about']/p[2], 'Posted') |
5 | strip: //div[@class='body']/div[@class='meta'] | 5 | strip: //div[@class='body']/div[@class='meta'] |
6 | test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/ \ No newline at end of file | 6 | test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/avclub.com.txt b/inc/3rdparty/site_config/standard/avclub.com.txt index 776ee108..c365a7aa 100644..100755 --- a/inc/3rdparty/site_config/standard/avclub.com.txt +++ b/inc/3rdparty/site_config/standard/avclub.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author: //*[@id="article_wrapper"]/div[1]/a[1] | 1 | author: //*[@id="article_wrapper"]/div[1]/a[1] |
2 | body: //*[@id="article_wrapper"]/div[2] | 2 | body: //*[@id="article_wrapper"]/div[2] |
3 | date: //*[@id="article_wrapper"]/div[1]/text()[2] | 3 | date: //*[@id="article_wrapper"]/div[1]/text()[2] |
4 | test_url: http://www.avclub.com/articles/forgetmenot,70904 \ No newline at end of file | 4 | test_url: http://www.avclub.com/articles/forgetmenot,70904 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/baltimoresun.com.txt b/inc/3rdparty/site_config/standard/baltimoresun.com.txt index 32adff8d..35b62427 100644..100755 --- a/inc/3rdparty/site_config/standard/baltimoresun.com.txt +++ b/inc/3rdparty/site_config/standard/baltimoresun.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | single_page_link: //div[@class='toppaginate']//a[@rel='nofollow'] | 1 | single_page_link: //div[@class='toppaginate']//a[@rel='nofollow'] |
2 | convert_double_br_tags: yes | 2 | convert_double_br_tags: yes |
3 | 3 | ||
4 | title: //div[@class="story"]/h1 | 4 | title: //div[@class="story"]/h1 |
5 | body: //div[@id="story-body-text"] | 5 | body: //div[@id="story-body-text"] |
6 | author: //span[@class="byline"] | 6 | author: //span[@class="byline"] |
7 | date: //p[@class="date"] | 7 | date: //p[@class="date"] |
8 | 8 | ||
9 | strip: //*[@class='all'] | 9 | strip: //*[@class='all'] |
10 | strip: //*[@class='articlerail'] | 10 | strip: //*[@class='articlerail'] |
11 | 11 | ||
12 | test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story \ No newline at end of file | 12 | test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/baseballprospectus.com.txt b/inc/3rdparty/site_config/standard/baseballprospectus.com.txt new file mode 100755 index 00000000..1207b343 --- /dev/null +++ b/inc/3rdparty/site_config/standard/baseballprospectus.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //h1[@class='title'] | ||
2 | author: //p[@class="author"]/a[1] | ||
3 | body: //div[@class="article"] | ||
4 | date: //p[@class="date"] | ||
5 | |||
6 | # remove user tools | ||
7 | strip: //div[@class='tools'] | ||
8 | strip: //h1 | ||
9 | strip: //h2[@class='subtitle'] | ||
10 | strip: //p[@class='author'] | ||
11 | strip: //p[@class='date'] | ||
12 | |||
13 | test_url: http://www.baseballprospectus.com/article.php?articleid=18463 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/basicthinking.de.txt b/inc/3rdparty/site_config/standard/basicthinking.de.txt index ab583145..f08c1f26 100644..100755 --- a/inc/3rdparty/site_config/standard/basicthinking.de.txt +++ b/inc/3rdparty/site_config/standard/basicthinking.de.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | date: //span[@class='date'] | 2 | date: //span[@class='date'] |
3 | body: //div[@class='entry'] | 3 | body: //div[@class='entry'] |
4 | 4 | ||
5 | strip: //div[@class='zusatz'] | 5 | strip: //div[@class='zusatz'] |
6 | 6 | ||
7 | test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/ \ No newline at end of file | 7 | test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bb.is.txt b/inc/3rdparty/site_config/standard/bb.is.txt index eaafaf18..57f7fdfa 100644..100755 --- a/inc/3rdparty/site_config/standard/bb.is.txt +++ b/inc/3rdparty/site_config/standard/bb.is.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20) | 1 | author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20) |
2 | 2 | ||
3 | 3 | ||
4 | date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12) | 4 | date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12) |
5 | 5 | ||
6 | 6 | ||
7 | body: //div[@class='first-article-big'] | 7 | body: //div[@class='first-article-big'] |
8 | strip: //table[@class='newsimagecontainer'] | 8 | strip: //table[@class='newsimagecontainer'] |
9 | strip: //h3[@class='headlines'] | 9 | strip: //h3[@class='headlines'] |
10 | strip: //iframe[@class='headlines'] | 10 | strip: //iframe[@class='headlines'] |
11 | strip: //a[@class='newslink'] | 11 | strip: //a[@class='newslink'] |
12 | convert_double_br_tags: yes | 12 | convert_double_br_tags: yes |
13 | test_url: http://bb.is/Pages/82?NewsID=174119 \ No newline at end of file | 13 | test_url: http://bb.is/Pages/82?NewsID=174119 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bbc.co.uk.txt b/inc/3rdparty/site_config/standard/bbc.co.uk.txt index 9c5c3419..ef1f491a 100644..100755 --- a/inc/3rdparty/site_config/standard/bbc.co.uk.txt +++ b/inc/3rdparty/site_config/standard/bbc.co.uk.txt | |||
@@ -1,32 +1,42 @@ | |||
1 | body: //div[@class="story-body"] | 1 | body: //div[@class="story-body"] |
2 | title: //h1[@class="story-header"] | 2 | # for video entries |
3 | date: //span[@class="story-date"]/span[@class='date'] | 3 | body: //div[contains(@class, "videoInStory") or @id="meta-information"] |
4 | 4 | title: //h1[@class="story-header"] | |
5 | # recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055 | 5 | date: //span[@class="story-date"]/span[@class='date'] |
6 | body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] | 6 | # for sport site |
7 | 7 | date: //meta[@name='DCTERMS.created']/@content | |
8 | #strip: //div[@class="story-feature narrow"] | 8 | author: //div[@id='headline']//span[@class='byline-name'] |
9 | #strip: //div[@class="story-feature wide"] | 9 | |
10 | #strip: //div[@class="story-feature dslideshow-enclosure"] | 10 | # recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055 |
11 | strip: //div[contains(@class, "story-feature")] | 11 | body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] |
12 | strip: //span[@class="story-date"] | 12 | |
13 | #strip: //div[@class="caption body-narrow-width"] | 13 | #strip: //div[@class="story-feature narrow"] |
14 | strip: //div[@class="warning"]//p | 14 | #strip: //div[@class="story-feature wide"] |
15 | strip: //div[@id='page-bookmark-links-head'] | 15 | #strip: //div[@class="story-feature dslideshow-enclosure"] |
16 | strip: //object | 16 | strip: //div[contains(@class, "story-feature")] |
17 | strip: //div[contains(@class, "bbccom_advert_placeholder")] | 17 | strip: //span[@class="story-date"] |
18 | strip: //div[contains(@class, "embedded-hyper")] | 18 | #strip: //div[@class="caption body-narrow-width"] |
19 | strip: //div[contains(@class, 'market-data')] | 19 | strip: //div[@class="warning"]//p |
20 | strip: //a[contains(@class, 'hidden')] | 20 | strip: //div[@id='page-bookmark-links-head'] |
21 | strip: //div[contains(@class, 'hypertabs')] | 21 | strip: //object |
22 | strip: //div[contains(@class, 'related')] | 22 | strip: //div[contains(@class, "bbccom_advert_placeholder")] |
23 | strip: //form[@id='comment-form'] | 23 | strip: //div[contains(@class, "embedded-hyper")] |
24 | strip: //div[contains(@class, 'comment-introduction')] | 24 | strip: //div[contains(@class, 'market-data')] |
25 | 25 | strip: //a[contains(@class, 'hidden')] | |
26 | replace_string(<noscript>): <div> | 26 | strip: //div[contains(@class, 'hypertabs')] |
27 | replace_string(</noscript>): </div> | 27 | strip: //div[contains(@class, 'related')] |
28 | 28 | strip: //form[@id='comment-form'] | |
29 | prune: no | 29 | strip: //div[contains(@class, 'comment-introduction')] |
30 | 30 | strip: //div[contains(@class, 'share-tools')] | |
31 | dissolve: //h2 | 31 | strip: //div[@id='also-related-links'] |
32 | test_url: http://www.bbc.co.uk/news/business-15060862 \ No newline at end of file | 32 | |
33 | replace_string(<noscript>): <div> | ||
34 | replace_string(</noscript>): </div> | ||
35 | |||
36 | prune: no | ||
37 | |||
38 | dissolve: //h2 | ||
39 | test_url: http://www.bbc.co.uk/sport/0/football/23224017 | ||
40 | test_url: http://www.bbc.co.uk/news/business-15060862 | ||
41 | # video entry | ||
42 | test_url: http://www.bbc.co.uk/news/world-asia-22056933 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bbcgoodfood.com.txt b/inc/3rdparty/site_config/standard/bbcgoodfood.com.txt new file mode 100755 index 00000000..1547d625 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bbcgoodfood.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //header//h1 | ||
2 | #body: //article[contains(@class, 'node-full')] | ||
3 | body: //div[contains(@class, 'recipe-details') or contains(@class, 'tips-carousel')] | //section[@id='recipe-ingredients' or @id='recipe-method'] | ||
4 | |||
5 | strip_id_or_class: recipe-rating-wrapper | ||
6 | strip_id_or_class: magazine-subcribe-header | ||
7 | strip_id_or_class: hide | ||
8 | strip_id_or_class: recipe-actions | ||
9 | strip_id_or_class: buy-ingredients | ||
10 | strip_id_or_class: related-content | ||
11 | strip_id_or_class: recipe-magazine-ad | ||
12 | strip_id_or_class: copy-right | ||
13 | |||
14 | prune: no | ||
15 | |||
16 | test_url: http://www.bbcgoodfood.com/recipes/1131634/minced-beef-wellington \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/benoitmaison.org.txt b/inc/3rdparty/site_config/standard/benoitmaison.org.txt index f341d593..72c1baed 100644..100755 --- a/inc/3rdparty/site_config/standard/benoitmaison.org.txt +++ b/inc/3rdparty/site_config/standard/benoitmaison.org.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | body: //div[@class="entry-content"] | 1 | body: //div[@class="entry-content"] |
2 | 2 | ||
3 | # Remove text ‘Tweet’ | 3 | # Remove text ‘Tweet’ |
4 | strip: //div[@class="entry-content"]/div[last()] | 4 | strip: //div[@class="entry-content"]/div[last()] |
5 | 5 | ||
6 | title: //h1[@class="entry-title"] | 6 | title: //h1[@class="entry-title"] |
7 | 7 | ||
8 | # If the Instapaper text parser worked with HTML5 tags, we would use: | 8 | # If the Instapaper text parser worked with HTML5 tags, we would use: |
9 | date: //time[@class="entry-date"] | 9 | date: //time[@class="entry-date"] |
10 | 10 | ||
11 | # But since it does not, use this more complicated rule: | 11 | # But since it does not, use this more complicated rule: |
12 | date: //div[@class="entry-meta"]/a[@rel="bookmark"] | 12 | date: //div[@class="entry-meta"]/a[@rel="bookmark"] |
13 | 13 | ||
14 | # Unfortunately, the following rule is overridden by the automatically found author. | 14 | # Unfortunately, the following rule is overridden by the automatically found author. |
15 | author: ("Benoit Maison") | 15 | author: ("Benoit Maison") |
16 | test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/ \ No newline at end of file | 16 | test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/berlingske.dk.txt b/inc/3rdparty/site_config/standard/berlingske.dk.txt index 607c998d..9f8c41c6 100644..100755 --- a/inc/3rdparty/site_config/standard/berlingske.dk.txt +++ b/inc/3rdparty/site_config/standard/berlingske.dk.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h1[@class='headline'] | 1 | title: //h1[@class='headline'] |
2 | body: //div[contains(@class, 'article-wrapper')] | 2 | body: //div[contains(@class, 'article-wrapper')] |
3 | test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa \ No newline at end of file | 3 | test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bernama.com.txt b/inc/3rdparty/site_config/standard/bernama.com.txt new file mode 100755 index 00000000..fdc04b7f --- /dev/null +++ b/inc/3rdparty/site_config/standard/bernama.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[contains(@class, "NewsText")] | |
2 | prune: no | ||
3 | |||
4 | test_url: http://www.bernama.com/bernama/v7/rss/english.php | ||
5 | test_url: http://www.bernama.com/bernama/v7/newsindex.php?id=943513 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/betabeat.com.txt b/inc/3rdparty/site_config/standard/betabeat.com.txt index 7815cf26..7815cf26 100644..100755 --- a/inc/3rdparty/site_config/standard/betabeat.com.txt +++ b/inc/3rdparty/site_config/standard/betabeat.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/betanews.com.txt b/inc/3rdparty/site_config/standard/betanews.com.txt index 0eaf085e..90a54a23 100644..100755 --- a/inc/3rdparty/site_config/standard/betanews.com.txt +++ b/inc/3rdparty/site_config/standard/betanews.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | # some articles at this site like this one don't | 1 | # some articles at this site like this one don't |
2 | # seem to pick up the article body via normal | 2 | # seem to pick up the article body via normal |
3 | # processing, other articles come through fine | 3 | # processing, other articles come through fine |
4 | # http://www.betanews.com/joewilcox/article | 4 | # http://www.betanews.com/joewilcox/article |
5 | # /Google-is-a-marketing-sensation/1309708375 | 5 | # /Google-is-a-marketing-sensation/1309708375 |
6 | body: //*[@id="article"] | 6 | body: //*[@id="article"] |
7 | test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375 \ No newline at end of file | 7 | test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/biography.com.txt b/inc/3rdparty/site_config/standard/biography.com.txt index dc071299..e431037a 100644..100755 --- a/inc/3rdparty/site_config/standard/biography.com.txt +++ b/inc/3rdparty/site_config/standard/biography.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[contains(@class, 'main-content')]//h1 | 1 | title: //div[contains(@class, 'main-content')]//h1 |
2 | body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')] | 2 | body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')] | 6 | single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')] |
7 | 7 | ||
8 | test_url: http://www.biography.com/print/profile/martin-luther-9389283 \ No newline at end of file | 8 | test_url: http://www.biography.com/print/profile/martin-luther-9389283 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bitelia.com.txt b/inc/3rdparty/site_config/standard/bitelia.com.txt index 7bffae93..7bffae93 100644..100755 --- a/inc/3rdparty/site_config/standard/bitelia.com.txt +++ b/inc/3rdparty/site_config/standard/bitelia.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/bizjournals.com.txt b/inc/3rdparty/site_config/standard/bizjournals.com.txt new file mode 100755 index 00000000..cfba766f --- /dev/null +++ b/inc/3rdparty/site_config/standard/bizjournals.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | date: //meta[@name='publish-date']/@content | ||
2 | body: //div[contains(@class, 'articleContentWrapper')] | ||
3 | prune: no | ||
4 | |||
5 | strip: //div[contains(@class, 'staff_info')]//dd[contains(., 'Twitter')] | ||
6 | |||
7 | strip_id_or_class: related_content | ||
8 | strip_id_or_class: enlarge | ||
9 | strip_id_or_class: photoBy | ||
10 | strip_id_or_class: older | ||
11 | |||
12 | test_url: http://www.bizjournals.com/cincinnati/news/2013/10/03/harris-teeter-shareholders-vote-on.html | ||
13 | test_url: http://feeds.bizjournals.com/industry_20?format=xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bjango.com.txt b/inc/3rdparty/site_config/standard/bjango.com.txt index 6cb04631..0fed5526 100644..100755 --- a/inc/3rdparty/site_config/standard/bjango.com.txt +++ b/inc/3rdparty/site_config/standard/bjango.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1[@class='articlehead'] | 1 | title: //h1[@class='articlehead'] |
2 | body: //div[@class='column'] | 2 | body: //div[@class='column'] |
3 | strip: //h1 | 3 | strip: //h1 |
4 | strip: //div[@class='help'] | 4 | strip: //div[@class='help'] |
5 | 5 | ||
6 | #no author or date/time provided in current layout | 6 | #no author or date/time provided in current layout |
7 | test_url: http://bjango.com/articles/actions/ \ No newline at end of file | 7 | test_url: http://bjango.com/articles/actions/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.arsln.org.txt b/inc/3rdparty/site_config/standard/blog.arsln.org.txt index 1f43f490..7ac8cc11 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.arsln.org.txt +++ b/inc/3rdparty/site_config/standard/blog.arsln.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | prune: no | 2 | prune: no |
3 | date: //article/header/h6/time | 3 | date: //article/header/h6/time |
4 | title: //article/header/h3 | 4 | title: //article/header/h3 |
5 | author: //meta[@name='author']/@content | 5 | author: //meta[@name='author']/@content |
6 | body: //article//post | 6 | body: //article//post |
7 | 7 | ||
8 | test_url: http://blog.arsln.org/aska-ayip-oluyor/ \ No newline at end of file | 8 | test_url: http://blog.arsln.org/aska-ayip-oluyor/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt b/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt index 81c3bda6..78d7f516 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt +++ b/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //title | 1 | title: //title |
2 | author: //span[@class='author vcard']/a | 2 | author: //span[@class='author vcard']/a |
3 | date: //p[@class='headline_meta']/abbr[@class='published'] | 3 | date: //p[@class='headline_meta']/abbr[@class='published'] |
4 | body: //div[@class='format_text entry-content'] | 4 | body: //div[@class='format_text entry-content'] |
5 | 5 | ||
6 | strip: //div[@id='dd_ajax_float'] | 6 | strip: //div[@id='dd_ajax_float'] |
7 | test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html \ No newline at end of file | 7 | test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt b/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt index a4c5aaea..db80a35f 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt +++ b/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | # Instapaper gets this back to front and only gets the blog title instead of the article title. | 1 | # Instapaper gets this back to front and only gets the blog title instead of the article title. |
2 | title: substring-before(//title, '-') | 2 | title: substring-before(//title, '-') |
3 | 3 | ||
4 | author: //a[ contains(@href, '/people') ] | 4 | author: //a[ contains(@href, '/people') ] |
5 | 5 | ||
6 | body: //div[ @class='post' ] | 6 | body: //div[ @class='post' ] |
7 | 7 | ||
8 | # Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous. | 8 | # Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous. |
9 | test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n \ No newline at end of file | 9 | test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.fefe.de.txt b/inc/3rdparty/site_config/standard/blog.fefe.de.txt index 92272b70..97e48e69 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.fefe.de.txt +++ b/inc/3rdparty/site_config/standard/blog.fefe.de.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | date: //h3 | 2 | date: //h3 |
3 | body: //ul | 3 | body: //ul |
4 | 4 | ||
5 | test_url: http://blog.fefe.de/?ts=b063bf55 \ No newline at end of file | 5 | test_url: http://blog.fefe.de/?ts=b063bf55 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.instagram.com.txt b/inc/3rdparty/site_config/standard/blog.instagram.com.txt index 3065dd80..13d1d44a 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.instagram.com.txt +++ b/inc/3rdparty/site_config/standard/blog.instagram.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | # clean Instagram blog a little bit | 1 | # clean Instagram blog a little bit |
2 | 2 | ||
3 | tidy:no | 3 | tidy:no |
4 | prune:no | 4 | prune:no |
5 | 5 | ||
6 | body://div[contains(@id,'content')] | 6 | body://div[contains(@id,'content')] |
7 | 7 | ||
8 | strip_id_or_class:meta | 8 | strip_id_or_class:meta |
9 | strip_id_or_class:notes | 9 | strip_id_or_class:notes |
10 | strip_id_or_class:pagination | 10 | strip_id_or_class:pagination |
11 | test_url: http://blog.instagram.com/post/8757832007/fromwhereistand \ No newline at end of file | 11 | test_url: http://blog.instagram.com/post/8757832007/fromwhereistand \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.instapaper.com.txt b/inc/3rdparty/site_config/standard/blog.instapaper.com.txt new file mode 100755 index 00000000..fda01b15 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.instapaper.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | author: //a[@href="http://www.marco.org/about"] | ||
2 | date: //span[@class="date"] | ||
3 | |||
4 | # Remove the date from article body. | ||
5 | strip: //span[@class="date"] | ||
6 | |||
7 | # Remove pagination links from article body. | ||
8 | strip: //div[@id="pagination"] | ||
9 | test_url: http://blog.instapaper.com/post/31303984531 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt b/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt index 4e467fe9..e89ad3a5 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt +++ b/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | date: //span[contains(@class, 'date-links')] | 1 | date: //span[contains(@class, 'date-links')] |
2 | author: //span[contains(@class, 'author-links')] | 2 | author: //span[contains(@class, 'author-links')] |
3 | body: //div[contains(@class, 'entry-content')] | 3 | body: //div[contains(@class, 'entry-content')] |
4 | test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web \ No newline at end of file | 4 | test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt b/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt index ac18ad15..bcd3bdc9 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt +++ b/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //*[contains(@class, 'post_content')] | 1 | body: //*[contains(@class, 'post_content')] |
2 | author: string('Kaelig Deloumeau-Prigent') | 2 | author: string('Kaelig Deloumeau-Prigent') |
3 | title: //h1[@class='title'] | 3 | title: //h1[@class='title'] |
4 | date: //span[@class='date'] | 4 | date: //span[@class='date'] |
5 | test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par \ No newline at end of file | 5 | test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.naver.com.txt b/inc/3rdparty/site_config/standard/blog.naver.com.txt index 702789ad..73c30c47 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.naver.com.txt +++ b/inc/3rdparty/site_config/standard/blog.naver.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //span[@class='pcol1 itemSubjectBoldfont'] | 1 | title: //span[@class='pcol1 itemSubjectBoldfont'] |
2 | body: //div[@id='postListBody'] | 2 | body: //div[@id='postListBody'] |
3 | date: //p[@class='date fil5 pcol2'] | 3 | date: //p[@class='date fil5 pcol2'] |
4 | single_page_link: /html/frameset/frame[1]/attribute::src | 4 | single_page_link: /html/frameset/frame[1]/attribute::src |
5 | strip: //div[@class='post-btn'] | 5 | strip: //div[@class='post-btn'] |
6 | test_url: http://blog.naver.com/how2invest/110135068757 \ No newline at end of file | 6 | test_url: http://blog.naver.com/how2invest/110135068757 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.pchome.net.txt b/inc/3rdparty/site_config/standard/blog.pchome.net.txt index 3089001e..de81beba 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.pchome.net.txt +++ b/inc/3rdparty/site_config/standard/blog.pchome.net.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | # PCHOME blog, a popular Chinese blog host | 1 | # PCHOME blog, a popular Chinese blog host |
2 | # Oct 15, 2011 | 2 | # Oct 15, 2011 |
3 | # | 3 | # |
4 | 4 | ||
5 | title://*[contains(@class,'imp')]/h2 | 5 | title://*[contains(@class,'imp')]/h2 |
6 | 6 | ||
7 | date://*[contains(@class,'imp')]/span | 7 | date://*[contains(@class,'imp')]/span |
8 | body://div[contains(@id,'blog_content')] | 8 | body://div[contains(@id,'blog_content')] |
9 | 9 | ||
10 | 10 | ||
11 | 11 | ||
12 | test_url: http://blog.pchome.net/article/462502.html \ No newline at end of file | 12 | test_url: http://blog.pchome.net/article/462502.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.pinboard.in.txt b/inc/3rdparty/site_config/standard/blog.pinboard.in.txt index b7afe455..40f0c560 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.pinboard.in.txt +++ b/inc/3rdparty/site_config/standard/blog.pinboard.in.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //a[@class="blog_title"] | 1 | title: //a[@class="blog_title"] |
2 | date: //p[@class="when"]/a | 2 | date: //p[@class="when"]/a |
3 | body: //div[@class="blog_entry"] | 3 | body: //div[@class="blog_entry"] |
4 | strip_id_or_class:blog_title | 4 | strip_id_or_class:blog_title |
5 | strip_id_or_class:when | 5 | strip_id_or_class:when |
6 | test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/ \ No newline at end of file | 6 | test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.renren.com.txt b/inc/3rdparty/site_config/standard/blog.renren.com.txt new file mode 100755 index 00000000..401d31e5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.renren.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://blog.renren.com/share/224959024/14260739544 | ||
3 | # http://blog.renren.com/share/231323504/14261768898 | ||
4 | # http://blog.renren.com/share/230305019/1502806705 | ||
5 | |||
6 | title://h1[contains(@class, 'title-article')] | ||
7 | author://span[contains(@class, 'name')] | ||
8 | body://div[contains(@class, 'content-body')] | ||
9 | |||
10 | convert_double_br_tags:yes | ||
11 | test_url: http://blog.renren.com/share/230305019/1502806705 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt b/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt index acb9ce81..4895272a 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt +++ b/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt | |||
@@ -1,26 +1,26 @@ | |||
1 | # Sina blog, the most popular blog host in China. | 1 | # Sina blog, the most popular blog host in China. |
2 | # Its source code is horrible. | 2 | # Its source code is horrible. |
3 | # | 3 | # |
4 | # Issue: | 4 | # Issue: |
5 | # Only the first image in the article is displayed. | 5 | # Only the first image in the article is displayed. |
6 | # The remaining images are replaced with a 1x1 transparent gif by the sina blog host. | 6 | # The remaining images are replaced with a 1x1 transparent gif by the sina blog host. |
7 | # | 7 | # |
8 | 8 | ||
9 | title://*[contains(@class,'titName SG_txta')] | 9 | title://*[contains(@class,'titName SG_txta')] |
10 | author://*[contains(@id,'ownernick')] | 10 | author://*[contains(@id,'ownernick')] |
11 | date://*[contains(@class,'time SG_txtc')] | 11 | date://*[contains(@class,'time SG_txtc')] |
12 | body://div[contains(@class,'articalContent')] | 12 | body://div[contains(@class,'articalContent')] |
13 | 13 | ||
14 | # Remove redundant content whose span class starts with "MASS" | 14 | # Remove redundant content whose span class starts with "MASS" |
15 | # Example <span class="MASSf21674ffeef7"></span> | 15 | # Example <span class="MASSf21674ffeef7"></span> |
16 | strip://span[contains(@class,'MASS')] | 16 | strip://span[contains(@class,'MASS')] |
17 | 17 | ||
18 | # Remove comment | 18 | # Remove comment |
19 | strip://div[contains(@class,'allComm')] | 19 | strip://div[contains(@class,'allComm')] |
20 | 20 | ||
21 | # Remove hidden text and link | 21 | # Remove hidden text and link |
22 | strip://ins | 22 | strip://ins |
23 | 23 | ||
24 | tidy:no | 24 | tidy:no |
25 | convert_double_br_tags:yes | 25 | convert_double_br_tags:yes |
26 | test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html \ No newline at end of file | 26 | test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blog.spu.edu.txt b/inc/3rdparty/site_config/standard/blog.spu.edu.txt index 68bd4e39..68bd4e39 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.spu.edu.txt +++ b/inc/3rdparty/site_config/standard/blog.spu.edu.txt | |||
diff --git a/inc/3rdparty/site_config/standard/blog.wells.ee.txt b/inc/3rdparty/site_config/standard/blog.wells.ee.txt index 8c8b3838..eae6982b 100644..100755 --- a/inc/3rdparty/site_config/standard/blog.wells.ee.txt +++ b/inc/3rdparty/site_config/standard/blog.wells.ee.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2/a[@class="no-link title"] | 1 | title: //h2/a[@class="no-link title"] |
2 | author: //h2[@id="blog_owner"] | 2 | author: //h2[@id="blog_owner"] |
3 | date: //time | 3 | date: //time |
4 | strip: //h2/a[@class="no-link title"] | 4 | strip: //h2/a[@class="no-link title"] |
5 | test_url: http://blog.wells.ee/retina | 5 | test_url: http://blog.wells.ee/retina |
6 | test_url: http://blog.wells.ee/skeuomorphism \ No newline at end of file | 6 | test_url: http://blog.wells.ee/skeuomorphism \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt b/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt index f630127b..2a66952b 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt +++ b/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | # 2011-08-23 [carlo@...] Initial version. | 1 | # 2011-08-23 [carlo@...] Initial version. |
2 | 2 | ||
3 | author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text() | 3 | author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text() |
4 | 4 | ||
5 | # why yes, I do feel a bit dirty | 5 | # why yes, I do feel a bit dirty |
6 | date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " ) | 6 | date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " ) |
7 | 7 | ||
8 | test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero \ No newline at end of file | 8 | test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.forbes.com.txt b/inc/3rdparty/site_config/standard/blogs.forbes.com.txt index 86580d21..86580d21 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.forbes.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.forbes.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/blogs.hbr.org.txt b/inc/3rdparty/site_config/standard/blogs.hbr.org.txt index 3664d16c..d47c3520 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.hbr.org.txt +++ b/inc/3rdparty/site_config/standard/blogs.hbr.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //div[@id='pageFeature']/h1 | 1 | title: //div[@id='pageFeature']/h1 |
2 | body: //div[@id='articleBody'] | 2 | body: //div[@id='articleBody'] |
3 | strip: //div[@class='module wide'] | 3 | strip: //div[@class='module wide'] |
4 | test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29 \ No newline at end of file | 4 | test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.msdn.com.txt b/inc/3rdparty/site_config/standard/blogs.msdn.com.txt index 3d3ec020..b2ff8332 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.msdn.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.msdn.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h3[@class="post-name"] | 1 | title: //h3[@class="post-name"] |
2 | author: //span[@class="user-name"] | 2 | author: //span[@class="user-name"] |
3 | date: //div[@class="post-date"] | 3 | date: //div[@class="post-date"] |
4 | body: //div[@class="post-content user-defined-markup"] | 4 | body: //div[@class="post-content user-defined-markup"] |
5 | footnotes: no | 5 | footnotes: no |
6 | test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx \ No newline at end of file | 6 | test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.reuters.com.txt b/inc/3rdparty/site_config/standard/blogs.reuters.com.txt index 6907bcb2..d3eb9966 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.reuters.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.reuters.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //div[@id='single']/h1 | 1 | title: //div[@id='single']/h1 |
2 | body: //div[@id='postcontent'] | 2 | body: //div[@id='postcontent'] |
3 | test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/ \ No newline at end of file | 3 | test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt b/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt index a7d15081..2102015d 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | # meta data | 1 | # meta data |
2 | title://h1[@class = 'postTitle'] | 2 | title://h1[@class = 'postTitle'] |
3 | author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|') | 3 | author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|') |
4 | date://span[@class = 'datestamp'] | 4 | date://span[@class = 'datestamp'] |
5 | 5 | ||
6 | #body content | 6 | #body content |
7 | body://div[@id = 'singleBlogPost'] | 7 | body://div[@id = 'singleBlogPost'] |
8 | 8 | ||
9 | #reclaim author info | 9 | #reclaim author info |
10 | move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv'] | 10 | move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv'] |
11 | strip://p[@class = 'moreLink mobileHide'] | 11 | strip://p[@class = 'moreLink mobileHide'] |
12 | 12 | ||
13 | #cleanup comments, there might be some open <div> sections | 13 | #cleanup comments, there might be some open <div> sections |
14 | strip://div[@id = 'comments2'] | 14 | strip://div[@id = 'comments2'] |
15 | strip://h3[a[@href = '#add-comment']] | 15 | strip://h3[a[@href = '#add-comment']] |
16 | test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/ \ No newline at end of file | 16 | test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt b/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt index ba8bc6e7..1bc65e77 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | # metadata | 1 | # metadata |
2 | author://div[@class = 'post']/div[@class='meta']/a[1] | 2 | author://div[@class = 'post']/div[@class='meta']/a[1] |
3 | date://div[@id = 'rap']/h2[1] | 3 | date://div[@id = 'rap']/h2[1] |
4 | body://div[@class = 'post'] | 4 | body://div[@class = 'post'] |
5 | 5 | ||
6 | # wrapping caption and image | 6 | # wrapping caption and image |
7 | wrap_in(fieldset)://div[contains(@class, 'wp-caption')] | 7 | wrap_in(fieldset)://div[contains(@class, 'wp-caption')] |
8 | 8 | ||
9 | 9 | ||
10 | # clean up | 10 | # clean up |
11 | strip://div[@class = 'post']/h3[@class = 'storytitle'] | 11 | strip://div[@class = 'post']/h3[@class = 'storytitle'] |
12 | strip://div[@class = 'post']/div[@class = 'social'] | 12 | strip://div[@class = 'post']/div[@class = 'social'] |
13 | strip://img[@style = 'display:none;'] | 13 | strip://img[@style = 'display:none;'] |
14 | strip://img[@height='0' and @width='0'] | 14 | strip://img[@height='0' and @width='0'] |
15 | test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/ \ No newline at end of file | 15 | test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/blogs.technet.com.txt b/inc/3rdparty/site_config/standard/blogs.technet.com.txt index a2909fd1..3d0fbadc 100644..100755 --- a/inc/3rdparty/site_config/standard/blogs.technet.com.txt +++ b/inc/3rdparty/site_config/standard/blogs.technet.com.txt | |||
@@ -1,6 +1,9 @@ | |||
1 | title: //h3[@class="post-name"] | 1 | title: //h3[@class="post-name"] |
2 | author: //span[@class="user-name"] | 2 | author: //span[@class="user-name"] |
3 | date: //div[@class="post-date"] | 3 | date: //div[@class="post-date"] |
4 | body: //div[@class="post-content user-defined-markup"] | 4 | body: //div[@class="post-content user-defined-markup"] |
5 | strip_id_or_class: log-feedback-list | ||
6 | tidy: no | ||
5 | footnotes: no | 7 | footnotes: no |
6 | test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx \ No newline at end of file | 8 | test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx |
9 | test_url: http://blogs.technet.com/b/isablog/archive/2009/01/07/a-pptp-client-might-fail-to-connect-to-a-vpn-server-on-the-internet-through-an-isa-server-2006.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bluetouff.com.txt b/inc/3rdparty/site_config/standard/bluetouff.com.txt index fbe7a5c6..543d3920 100644..100755 --- a/inc/3rdparty/site_config/standard/bluetouff.com.txt +++ b/inc/3rdparty/site_config/standard/bluetouff.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body://div[@class='entry'] | 1 | body://div[@class='entry'] |
2 | date://div[@class='meta'] | 2 | date://div[@class='meta'] |
3 | strip://a[@class='FlattrButton'] | 3 | strip://a[@class='FlattrButton'] |
4 | test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/ \ No newline at end of file | 4 | test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/boagworld.com.txt b/inc/3rdparty/site_config/standard/boagworld.com.txt index 91e48fdb..3b3da991 100644..100755 --- a/inc/3rdparty/site_config/standard/boagworld.com.txt +++ b/inc/3rdparty/site_config/standard/boagworld.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@class="entry-title"][2] | 1 | title: //h1[@class="entry-title"][2] |
2 | author: string("Paul Boag") | 2 | author: string("Paul Boag") |
3 | date: substring(//span[@class="meta"], 11) | 3 | date: substring(//span[@class="meta"], 11) |
4 | body: //article | 4 | body: //article |
5 | strip: //h2 | 5 | strip: //h2 |
6 | strip: //h1 | 6 | strip: //h1 |
7 | strip: //div[@id="callsToAction"] | 7 | strip: //div[@id="callsToAction"] |
8 | test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/ \ No newline at end of file | 8 | test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/boingboing.net.txt b/inc/3rdparty/site_config/standard/boingboing.net.txt index 9169e8fb..4f39661b 100644..100755 --- a/inc/3rdparty/site_config/standard/boingboing.net.txt +++ b/inc/3rdparty/site_config/standard/boingboing.net.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | # This is far from perfect, but so is BoingBoing's markup | 1 | # This is far from perfect, but so is BoingBoing's markup |
2 | title: //h2[@class="headline"] | 2 | title: //h2[@class="headline"] |
3 | single_page_link: //h2[@class="headline"]/a | 3 | single_page_link: //h2[@class="headline"]/a |
4 | #date: //p[@class="byline"] | 4 | #date: //p[@class="byline"] |
5 | body: //div[@class="post"] | 5 | body: //div[@class="post"] |
6 | 6 | ||
7 | strip_id_or_class: shareMe | 7 | strip_id_or_class: shareMe |
8 | strip_id_or_class: authorbox | 8 | strip_id_or_class: authorbox |
9 | strip_id_or_class: byline | 9 | strip_id_or_class: byline |
10 | 10 | ||
11 | test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html \ No newline at end of file | 11 | test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt b/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt index 4cc49043..3616bbf2 100644..100755 --- a/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt +++ b/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h2[@class='entry-title'] | 1 | title: //h2[@class='entry-title'] |
2 | body: //div[@class='entry-content'] | 2 | body: //div[@class='entry-content'] |
3 | test_url: http://boldizsar.palotas.eu/blog/?p=1394 \ No newline at end of file | 3 | test_url: http://boldizsar.palotas.eu/blog/?p=1394 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/book.douban.com.txt b/inc/3rdparty/site_config/standard/book.douban.com.txt index 8b958562..fe2d2cbf 100644..100755 --- a/inc/3rdparty/site_config/standard/book.douban.com.txt +++ b/inc/3rdparty/site_config/standard/book.douban.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //span[@property='v:description'] | 1 | body: //span[@property='v:description'] |
2 | date: //span[@property='v:dtreviewed'] | 2 | date: //span[@property='v:dtreviewed'] |
3 | author: //span[@property='v:reviewer'] | 3 | author: //span[@property='v:reviewer'] |
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://book.douban.com/review/2422662/ \ No newline at end of file | 6 | test_url: http://book.douban.com/review/2422662/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bookforum.com.txt b/inc/3rdparty/site_config/standard/bookforum.com.txt index 331f415e..03b60039 100644..100755 --- a/inc/3rdparty/site_config/standard/bookforum.com.txt +++ b/inc/3rdparty/site_config/standard/bookforum.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | #metadata | 1 | #metadata |
2 | title://div[@class = 'Topper']/h1 | 2 | title://div[@class = 'Topper']/h1 |
3 | author://div[@class = 'Topper']/h3 | 3 | author://div[@class = 'Topper']/h3 |
4 | date://div[@class = 'Topper']/h6 | 4 | date://div[@class = 'Topper']/h6 |
5 | body://div[@class = 'Core'] | 5 | body://div[@class = 'Core'] |
6 | 6 | ||
7 | 7 | ||
8 | 8 | ||
9 | # clean up | 9 | # clean up |
10 | strip://div[@class = 'Topper']/h1 | 10 | strip://div[@class = 'Topper']/h1 |
11 | strip://div[@class = 'Topper']/h3 | 11 | strip://div[@class = 'Topper']/h3 |
12 | strip://div[@class = 'Topper']/h4 | 12 | strip://div[@class = 'Topper']/h4 |
13 | strip://div[@class = 'Topper']/h5 | 13 | strip://div[@class = 'Topper']/h5 |
14 | strip://div[@class = 'Topper']/h6 | 14 | strip://div[@class = 'Topper']/h6 |
15 | strip://br[@clear = 'all'] | 15 | strip://br[@clear = 'all'] |
16 | strip://div[@class = 'adCore'] | 16 | strip://div[@class = 'adCore'] |
17 | strip://div[@class = 'BookR'] | 17 | strip://div[@class = 'BookR'] |
18 | strip://div[@class = 'InfoBox'] | 18 | strip://div[@class = 'InfoBox'] |
19 | test_url: http://bookforum.com/inprint/018_04/8595 \ No newline at end of file | 19 | test_url: http://bookforum.com/inprint/018_04/8595 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/borderhouseblog.com.txt b/inc/3rdparty/site_config/standard/borderhouseblog.com.txt index 190738d5..b4e116fe 100644..100755 --- a/inc/3rdparty/site_config/standard/borderhouseblog.com.txt +++ b/inc/3rdparty/site_config/standard/borderhouseblog.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | author://div[@class="meta"]/span/a | 2 | author://div[@class="meta"]/span/a |
3 | date://div[@class="date"] | 3 | date://div[@class="date"] |
4 | body://div[@class="content article"] | 4 | body://div[@class="content article"] |
5 | strip://div[@class="content article"]/h1 | 5 | strip://div[@class="content article"]/h1 |
6 | 6 | ||
7 | test_url: http://borderhouseblog.com/?p=7832 \ No newline at end of file | 7 | test_url: http://borderhouseblog.com/?p=7832 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bostonglobe.com.txt b/inc/3rdparty/site_config/standard/bostonglobe.com.txt index d3e6f43f..4c74a34e 100644..100755 --- a/inc/3rdparty/site_config/standard/bostonglobe.com.txt +++ b/inc/3rdparty/site_config/standard/bostonglobe.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | # NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. | 1 | # NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. |
2 | 2 | ||
3 | title: //div[@class="header"]/h1 | 3 | title: //div[@class="header"]/h1 |
4 | author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") | 4 | author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") |
5 | date: //div[@class="byline"]/p[last()] | 5 | date: //div[@class="byline"]/p[last()] |
6 | body: //div[@class="article-body"] | 6 | body: //div[@class="article-body"] |
7 | 7 | ||
8 | strip_id_or_class: aside | 8 | strip_id_or_class: aside |
9 | strip_id_or_class: promo | 9 | strip_id_or_class: promo |
10 | strip_id_or_class: skip-nav | 10 | strip_id_or_class: skip-nav |
11 | strip_id_or_class: article-more | 11 | strip_id_or_class: article-more |
12 | strip_id_or_class: article-bar | 12 | strip_id_or_class: article-bar |
13 | 13 | ||
14 | # This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. | 14 | # This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. |
15 | strip_id_or_class: figure | 15 | strip_id_or_class: figure |
16 | test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html \ No newline at end of file | 16 | test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bostonreview.net.txt b/inc/3rdparty/site_config/standard/bostonreview.net.txt index 68567012..64e04a1c 100644..100755 --- a/inc/3rdparty/site_config/standard/bostonreview.net.txt +++ b/inc/3rdparty/site_config/standard/bostonreview.net.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | #basics | 1 | #basics |
2 | title://h3[@class = 'article_title'] | 2 | title://h3[@class = 'article_title'] |
3 | date://span[@class = 'article_date'] | 3 | date://span[@class = 'article_date'] |
4 | body://div[@id = 'center_column_article'] | 4 | body://div[@id = 'center_column_article'] |
5 | #correct, but author not being picked up in preview | 5 | #correct, but author not being picked up in preview |
6 | author://span[@class = 'article_author'] | 6 | author://span[@class = 'article_author'] |
7 | 7 | ||
8 | #strips basics from article | 8 | #strips basics from article |
9 | strip_id_or_class:article_title | 9 | strip_id_or_class:article_title |
10 | strip_id_or_class:article_date | 10 | strip_id_or_class:article_date |
11 | strip_id_or_class:article_author | 11 | strip_id_or_class:article_author |
12 | 12 | ||
13 | #strips pull quotes | 13 | #strips pull quotes |
14 | strip_id_or_class:pull_quote | 14 | strip_id_or_class:pull_quote |
15 | test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php \ No newline at end of file | 15 | test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/boundlessline.org.txt b/inc/3rdparty/site_config/standard/boundlessline.org.txt index bfc3f3d1..a836e1e2 100644..100755 --- a/inc/3rdparty/site_config/standard/boundlessline.org.txt +++ b/inc/3rdparty/site_config/standard/boundlessline.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: substring-before(//title, '|') | 1 | title: substring-before(//title, '|') |
2 | body: //div[@class="entry"] | 2 | body: //div[@class="entry"] |
3 | # Remove the author's picture | 3 | # Remove the author's picture |
4 | strip: //div[@class="entry"]/a[1] | 4 | strip: //div[@class="entry"]/a[1] |
5 | test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html \ No newline at end of file | 5 | test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bowdoinorient.com.txt b/inc/3rdparty/site_config/standard/bowdoinorient.com.txt new file mode 100755 index 00000000..932143d1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bowdoinorient.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //*[@class='articletitle'] | ||
2 | body: //*[(@id='articlebody')] | ||
3 | date: //*[(@class='articledate')] | ||
4 | author: //*[(@class='articleauthor')] | ||
5 | autodetect_next_page: no | ||
6 | test_url: http://bowdoinorient.com/article/8045 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brainfacts.org.txt b/inc/3rdparty/site_config/standard/brainfacts.org.txt index 94b0f56d..9705f621 100644..100755 --- a/inc/3rdparty/site_config/standard/brainfacts.org.txt +++ b/inc/3rdparty/site_config/standard/brainfacts.org.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //div[@class="standard"]/h1 | 1 | title: //div[@class="standard"]/h1 |
2 | author: string("BrainFacts.org") | 2 | author: string("BrainFacts.org") |
3 | date: //div[@class="meta"]/strong | 3 | date: //div[@class="meta"]/strong |
4 | 4 | ||
5 | strip: //p[@class="skip"] | 5 | strip: //p[@class="skip"] |
6 | strip: //div[@class="meta"] | 6 | strip: //div[@class="meta"] |
7 | strip: //div[@class="standard"]/h1 | 7 | strip: //div[@class="standard"]/h1 |
8 | strip: //div[@class="modal"] | 8 | strip: //div[@class="modal"] |
9 | strip: //div[@class="columnRight"] | 9 | strip: //div[@class="columnRight"] |
10 | test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/ \ No newline at end of file | 10 | test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/brandeins.de.txt b/inc/3rdparty/site_config/standard/brandeins.de.txt index 3753ce67..36aa2efa 100644..100755 --- a/inc/3rdparty/site_config/standard/brandeins.de.txt +++ b/inc/3rdparty/site_config/standard/brandeins.de.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | # set body | 1 | # set body |
2 | body: //div[@id='theContent'] | 2 | body: //div[@id='theContent'] |
3 | 3 | ||
4 | # set title | 4 | # set title |
5 | title: //div[@id='theContent']/h3 | 5 | title: //div[@id='theContent']/h3 |
6 | strip: //div[@id='theContent']/h3 | 6 | strip: //div[@id='theContent']/h3 |
7 | test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html \ No newline at end of file | 7 | test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt b/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt index 19504844..fc020539 100644..100755 --- a/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt +++ b/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | date://h2[@class="date-header"] | 1 | date://h2[@class="date-header"] |
2 | body://div[@class="entry-content"] | 2 | body://div[@class="entry-content"] |
3 | test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html \ No newline at end of file | 3 | test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/brasil.elpais.com.txt b/inc/3rdparty/site_config/standard/brasil.elpais.com.txt new file mode 100755 index 00000000..0b8feb6a --- /dev/null +++ b/inc/3rdparty/site_config/standard/brasil.elpais.com.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | title: //meta[@name='DC.title']/@content | ||
2 | title: //div[contains(@class, 'cabecera_noticia')]//h1 | ||
3 | date: //meta[@name='DC.date']/@content | ||
4 | date: //meta[@name='date']/@content | ||
5 | body: //div[@class='columna_texto'] | ||
6 | body: //div[@id='cuerpo_noticia'] | ||
7 | body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] | ||
8 | |||
9 | prune: no | ||
10 | |||
11 | strip_id_or_class: disposicion_vertical | ||
12 | strip_id_or_class: ampliar_foto | ||
13 | strip_id_or_class: utilidades | ||
14 | strip_id_or_class: info_relacionada | ||
15 | strip_id_or_class: m-kiosko | ||
16 | strip_id_or_class: info_complementa | ||
17 | |||
18 | strip: //p[@class='nota_pie'] | ||
19 | strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] | ||
20 | strip: //div[@id='coment' or @id='foros_not'] | ||
21 | |||
22 | test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html | ||
23 | test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes | ||
diff --git a/inc/3rdparty/site_config/standard/brettterpstra.com.txt b/inc/3rdparty/site_config/standard/brettterpstra.com.txt index f6f73778..55da1787 100644..100755 --- a/inc/3rdparty/site_config/standard/brettterpstra.com.txt +++ b/inc/3rdparty/site_config/standard/brettterpstra.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='post full'] | 1 | body: //div[@class='post full'] |
2 | title: //h1 | 2 | title: //h1 |
3 | author: substring-after(//title, '- ') | 3 | author: substring-after(//title, '- ') |
4 | date: //span[@class='date'] | 4 | date: //span[@class='date'] |
5 | test_url: http://brettterpstra.com/byword-for-ios/ \ No newline at end of file | 5 | test_url: http://brettterpstra.com/byword-for-ios/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt b/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt index 27e6b70c..27e6b70c 100644..100755 --- a/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt +++ b/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt | |||
diff --git a/inc/3rdparty/site_config/standard/brookings.edu.txt b/inc/3rdparty/site_config/standard/brookings.edu.txt index 9f4fc4e3..17a47605 100644..100755 --- a/inc/3rdparty/site_config/standard/brookings.edu.txt +++ b/inc/3rdparty/site_config/standard/brookings.edu.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //div[@id='contentheader']/h1 | 1 | title: //div[@id='contentheader']/h1 |
2 | author: //p[@class='attribution']/span[@class='author']/* | 2 | author: //p[@class='attribution']/span[@class='author']/* |
3 | # Is there a way to pull multiple authors? My XPath here is just grabbing the first | 3 | # Is there a way to pull multiple authors? My XPath here is just grabbing the first |
4 | 4 | ||
5 | date: /html/head/meta[@name="date"]/@content | 5 | date: /html/head/meta[@name="date"]/@content |
6 | body: //div[@class='main-content'] | 6 | body: //div[@class='main-content'] |
7 | 7 | ||
8 | strip: //p[@class='byline'] | 8 | strip: //p[@class='byline'] |
9 | strip: //div[@class='img-gallery'] | 9 | strip: //div[@class='img-gallery'] |
10 | strip: //div[@class='callout'] | 10 | strip: //div[@class='callout'] |
11 | strip: //div[@class='add-your-view'] | 11 | strip: //div[@class='add-your-view'] |
12 | convert_double_br_tags: yes | 12 | convert_double_br_tags: yes |
13 | test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx \ No newline at end of file | 13 | test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/brooksreview.net.txt b/inc/3rdparty/site_config/standard/brooksreview.net.txt index 71cafcdb..d33d7d4e 100644..100755 --- a/inc/3rdparty/site_config/standard/brooksreview.net.txt +++ b/inc/3rdparty/site_config/standard/brooksreview.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@class='article'] | 2 | body: //div[@class='article'] |
3 | body: //div[@class='post'] | 3 | body: //div[@class='post'] |
4 | date: //*[@id='single']/span | 4 | date: //*[@id='single']/span |
5 | prune: no | 5 | prune: no |
6 | test_url: http://brooksreview.net/2011/11/readability-agency/ \ No newline at end of file | 6 | test_url: http://brooksreview.net/2011/11/readability-agency/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bt.no.txt b/inc/3rdparty/site_config/standard/bt.no.txt new file mode 100755 index 00000000..200c2e4e --- /dev/null +++ b/inc/3rdparty/site_config/standard/bt.no.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1[contains(@class,'articleTitle')] | ||
2 | author: //span[@itemprop='name'] | ||
3 | date: //time[@class='published'] | ||
4 | body: //div[contains(@class,'bodyText')] | ||
5 | |||
6 | strip_id_or_class: 'pull1' | ||
7 | strip_id_or_class: 'relationArticle' | ||
8 | strip: //span[@class='quote'] | ||
9 | |||
10 | # strip h2 if at end of article (typically a request for comments) | ||
11 | strip: //div[contains(@class,'bodyText')]/node()[last()-1]/self::h2 | ||
12 | test_url: http://www.bt.no/meninger/debatt/Typisk-norsk-a-vare-god-nok-2884108.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/buffed.de.txt b/inc/3rdparty/site_config/standard/buffed.de.txt new file mode 100755 index 00000000..3dd36ce6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/buffed.de.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | date: //meta[@itemProp='datePublished']/@content | ||
2 | body: //div[@class='intro' or contains(@class, 'article_text')] | ||
3 | prune: no | ||
4 | strip_id_or_class: embedcode | ||
5 | strip_id_or_class: EmbedSwitch | ||
6 | strip_id_or_class: EmbedText | ||
7 | strip_id_or_class: bildergalerie | ||
8 | strip_id_or_class: subline_seohour_image | ||
9 | strip_id_or_class: ova-player | ||
10 | strip_id_or_class: jcarouseloutput | ||
11 | strip_id_or_class: cbox_embedded | ||
12 | |||
13 | test_url: http://www.buffed.de/SWTOR-Star-Wars-The-Old-Republic-PC-218697/News/SWTOR-Ab-Patch-24-Lore-Klamotten-faerben-1090051/ | ||
14 | test_url: http://www.buffed.de/feed.cfm?menu_alias=home \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/buquad.com.txt b/inc/3rdparty/site_config/standard/buquad.com.txt index a75fa046..f0fd08db 100644..100755 --- a/inc/3rdparty/site_config/standard/buquad.com.txt +++ b/inc/3rdparty/site_config/standard/buquad.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //h2/a | 2 | author: //h2/a |
3 | date: substring-after(//h2, '|') | 3 | date: substring-after(//h2, '|') |
4 | strip_id_or_class: 'attachment' | 4 | strip_id_or_class: 'attachment' |
5 | strip: //h3 | 5 | strip: //h3 |
6 | 6 | ||
7 | body: //div[@class='entry'] | 7 | body: //div[@class='entry'] |
8 | test_url: http://buquad.com/2012/04/09/paul-ryan/ \ No newline at end of file | 8 | test_url: http://buquad.com/2012/04/09/paul-ryan/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/business2community.com.txt b/inc/3rdparty/site_config/standard/business2community.com.txt new file mode 100755 index 00000000..0dcc7ff8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/business2community.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | date: substring-after(//p[@class='byline'],'Published') | ||
2 | |||
3 | strip: //div[@class='article-meta'] | ||
4 | |||
5 | test_url: http://www.business2community.com/social-media/funky-ways-to-print-instagram-photos-0485340 | ||
diff --git a/inc/3rdparty/site_config/standard/businessinsider.com.txt b/inc/3rdparty/site_config/standard/businessinsider.com.txt index c773db8b..39eb7426 100644..100755 --- a/inc/3rdparty/site_config/standard/businessinsider.com.txt +++ b/inc/3rdparty/site_config/standard/businessinsider.com.txt | |||
@@ -1,12 +1,16 @@ | |||
1 | title://div[@class="sl-layout-post"]/h1 | 1 | title://div[@class="sl-layout-post"]/h1 |
2 | body: //div[contains(@class, 'post-content') or contains(@class, 'KonaBody')] | 2 | body: //div[contains(@class, 'post-content') or contains(@class, 'slide-module') or contains(@class, 'KonaBody')] |
3 | strip: //div[contains(@class, "post-sidebar")] | 3 | strip: //div[contains(@class, "post-sidebar")] |
4 | strip: //div[@id='related-links'] | 4 | strip: //div[@id='related-links'] |
5 | author://div[@class="byline"]/a | 5 | strip: //div[@class='related-links-container'] |
6 | date://div[@class="byline"]/span[@class="date"] | 6 | strip: //p[@class='source'] |
7 | prune: no | 7 | author://div[@class="byline"]/a |
8 | 8 | date://div[@class="byline"]/span[@class="date"] | |
9 | strip://*[contains(@class,'sponsored-text')] | 9 | prune: no |
10 | strip: //div[@id='post_footer'] | 10 | |
11 | 11 | single_page_link: //a[contains(text(), 'View as one page')] | |
12 | test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1 \ No newline at end of file | 12 | |
13 | strip://*[contains(@class,'sponsored-text')] | ||
14 | strip: //div[@id='post_footer'] | ||
15 | |||
16 | test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1 | ||
diff --git a/inc/3rdparty/site_config/standard/businessnews.com.tn.txt b/inc/3rdparty/site_config/standard/businessnews.com.tn.txt index 714cfc90..6502b8e1 100644..100755 --- a/inc/3rdparty/site_config/standard/businessnews.com.tn.txt +++ b/inc/3rdparty/site_config/standard/businessnews.com.tn.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@id='article_detail'] | 1 | body: //div[@id='article_detail'] |
2 | title: //meta[@property='og:title']/@content | 2 | title: //meta[@property='og:title']/@content |
3 | date: //div[@id='date_com_art']//a[@class='date'] | 3 | date: //div[@id='date_com_art']//a[@class='date'] |
4 | author: //div[@id='article_detail']//font[@class='auteur'] | 4 | author: //div[@id='article_detail']//font[@class='auteur'] |
5 | 5 | ||
6 | strip_id_or_class: porte_titre_theme | 6 | strip_id_or_class: porte_titre_theme |
7 | strip_id_or_class: cont_param | 7 | strip_id_or_class: cont_param |
8 | strip_id_or_class: date_com_art | 8 | strip_id_or_class: date_com_art |
9 | 9 | ||
10 | prune: no | 10 | prune: no |
11 | 11 | ||
12 | test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1 \ No newline at end of file | 12 | test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt index 7b3d063b..03085593 100644..100755 --- a/inc/3rdparty/site_config/standard/businessweek.com.txt +++ b/inc/3rdparty/site_config/standard/businessweek.com.txt | |||
@@ -1,30 +1,30 @@ | |||
1 | # story has several pages, should be detected | 1 | # story has several pages, should be detected |
2 | body: //div[@id='storyBody'] | 2 | body: //div[@id='storyBody'] |
3 | body: //div[@id='article_body'] | 3 | body: //div[@id='article_body'] |
4 | body: //div[@id='story_body'] | 4 | body: //div[@id='story_body'] |
5 | 5 | ||
6 | title://h1[@id='article_headline'] | 6 | title://h1[@id='article_headline'] |
7 | 7 | ||
8 | # article author | 8 | # article author |
9 | author: //p[@class='author']/a | 9 | author: //p[@class='author']/a |
10 | # story author(s) | 10 | # story author(s) |
11 | author: substring-after(//p[@class='byline'], 'By ') | 11 | author: substring-after(//p[@class='byline'], 'By ') |
12 | 12 | ||
13 | # article date | 13 | # article date |
14 | date: //span[@class='published_date'] | 14 | date: //span[@class='published_date'] |
15 | # story date | 15 | # story date |
16 | date: //span[@class='date'] | 16 | date: //span[@class='date'] |
17 | 17 | ||
18 | date: substring-after(//div[contains(@class,'attributor')],'on') | 18 | date: substring-after(//div[contains(@class,'attributor')],'on') |
19 | strip_id_or_class: inset | 19 | strip_id_or_class: inset |
20 | strip: //p/span[@class='photoCredit'] | 20 | strip: //p/span[@class='photoCredit'] |
21 | strip: //h1 | 21 | strip: //h1 |
22 | 22 | ||
23 | strip_id_or_class: page_count | 23 | strip_id_or_class: page_count |
24 | strip_id_or_class: tools | 24 | strip_id_or_class: tools |
25 | strip_id_or_class: pagination | 25 | strip_id_or_class: pagination |
26 | 26 | ||
27 | single_page_link: //li[@id='stPrint']/a | 27 | single_page_link: //li[@id='stPrint']/a |
28 | 28 | ||
29 | test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html | 29 | test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html |
30 | test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall \ No newline at end of file | 30 | test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/buzzfeed.com.txt b/inc/3rdparty/site_config/standard/buzzfeed.com.txt index 6df8bc47..97dddaee 100644..100755 --- a/inc/3rdparty/site_config/standard/buzzfeed.com.txt +++ b/inc/3rdparty/site_config/standard/buzzfeed.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | # Creator: Greg Leuch <greg@...> | 1 | # Creator: Greg Leuch <greg@...> |
2 | 2 | ||
3 | # It can be messy. | 3 | # It can be messy. |
4 | tidy:no | 4 | tidy:no |
5 | 5 | ||
6 | # The basic template. | 6 | # The basic template. |
7 | title: //h1[@data-print='title'] | 7 | title: //h1[@data-print='title'] |
8 | author: //a[@data-print='author'] | 8 | author: //a[@data-print='author'] |
9 | date: //time[@data-print='date'] | 9 | date: //time[@data-print='date'] |
10 | body: //div[@data-print='body'] | 10 | body: //div[@data-print='body'] |
11 | body: //section[@data-print='body'] | 11 | body: //section[@data-print='body'] |
12 | 12 | ||
13 | # For various things... | 13 | # For various things... |
14 | strip: *[@data-print="ignore"] | 14 | strip: *[@data-print="ignore"] |
15 | test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays \ No newline at end of file | 15 | test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/bygonebureau.com.txt b/inc/3rdparty/site_config/standard/bygonebureau.com.txt index 0abb6436..63c82130 100644..100755 --- a/inc/3rdparty/site_config/standard/bygonebureau.com.txt +++ b/inc/3rdparty/site_config/standard/bygonebureau.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //a[contains(@href, '/author/')] | 2 | author: //a[contains(@href, '/author/')] |
3 | date: //*[@class='post-date'] | 3 | date: //*[@class='post-date'] |
4 | strip: //*[@class='post-date'] | 4 | strip: //*[@class='post-date'] |
5 | strip: //h1 | 5 | strip: //h1 |
6 | test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/ \ No newline at end of file | 6 | test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cable.co.uk.txt b/inc/3rdparty/site_config/standard/cable.co.uk.txt new file mode 100755 index 00000000..435bf3b5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cable.co.uk.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@class='page-content']//h1 | ||
2 | body: //div[@class='page-content'] | ||
3 | strip_id_or_class: editorial-bar-top | ||
4 | strip_id_or_class: social-bottom | ||
5 | strip_id_or_class: comment-form | ||
6 | strip_id_or_class: pc-why | ||
7 | |||
8 | prune: no | ||
9 | tidy: no | ||
10 | |||
11 | test_url: http://www.cable.co.uk/news/bt-vision-unveils-interactive-guide-application-800734218/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cardboardconnection.com.txt b/inc/3rdparty/site_config/standard/cardboardconnection.com.txt index 3adc7a35..49f34302 100644..100755 --- a/inc/3rdparty/site_config/standard/cardboardconnection.com.txt +++ b/inc/3rdparty/site_config/standard/cardboardconnection.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@class='producttabbed-title'] | 1 | title: //h1[@class='producttabbed-title'] |
2 | body: //div[@class='postTabs_divs postTabs_curr_div'] | 2 | body: //div[@class='postTabs_divs postTabs_curr_div'] |
3 | strip: //div[@class='ratingblock2'] | 3 | strip: //div[@class='ratingblock2'] |
4 | strip: //p[@id='breadcrumbs'] | 4 | strip: //p[@id='breadcrumbs'] |
5 | strip: //div[@style='display: none'] | 5 | strip: //div[@style='display: none'] |
6 | 6 | ||
7 | 7 | ||
8 | test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards \ No newline at end of file | 8 | test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/carpeaqua.com.txt b/inc/3rdparty/site_config/standard/carpeaqua.com.txt index 7ba1ed78..5ea302e0 100644..100755 --- a/inc/3rdparty/site_config/standard/carpeaqua.com.txt +++ b/inc/3rdparty/site_config/standard/carpeaqua.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | body: //div[@class='entry'] | 2 | body: //div[@class='entry'] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | # otherwise the footnotes are removed | 5 | # otherwise the footnotes are removed |
6 | test_url: http://carpeaqua.com/2011/03/27/the-intersection-of-power-and-portability/ \ No newline at end of file | 6 | test_url: http://carpeaqua.com/2011/03/27/the-intersection-of-power-and-portability/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cars.com.txt b/inc/3rdparty/site_config/standard/cars.com.txt new file mode 100755 index 00000000..71c5c050 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cars.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[contains(@class, 'basicInfo')]//h1 | ||
2 | |||
3 | body: //img[@id='chosenPhotoIMG'] | //div[@id='aboutThisVehicleBox'] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.cars.com/go/search/detail.jsp?listingId=115364779 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/catb.org.txt b/inc/3rdparty/site_config/standard/catb.org.txt index 8908292c..2cd197fb 100644..100755 --- a/inc/3rdparty/site_config/standard/catb.org.txt +++ b/inc/3rdparty/site_config/standard/catb.org.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@class='article'] | 1 | body: //div[@class='article'] |
2 | strip: //div[@class='revhistory'] | 2 | strip: //div[@class='revhistory'] |
3 | strip: //div[@class='toc'] | 3 | strip: //div[@class='toc'] |
4 | tidy: no | 4 | tidy: no |
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://catb.org/~esr/faqs/smart-questions.html \ No newline at end of file | 7 | test_url: http://catb.org/~esr/faqs/smart-questions.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cbc.ca.txt b/inc/3rdparty/site_config/standard/cbc.ca.txt index 25305109..ba5faf3f 100644..100755 --- a/inc/3rdparty/site_config/standard/cbc.ca.txt +++ b/inc/3rdparty/site_config/standard/cbc.ca.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[contains(@class, 'headline')]/h1 | 1 | title: //div[contains(@class, 'headline')]/h1 |
2 | author: //h5[contains(@class, 'byline')] | 2 | author: //h5[contains(@class, 'byline')] |
3 | date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ') | 3 | date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ') |
4 | body: //div[@id="storyboard"] | 4 | body: //div[@id="storyboard"] |
5 | test_url: http://www.cbc.ca/news/world/story/2012/01/16/cruise-ship-monday.html \ No newline at end of file | 5 | test_url: http://www.cbc.ca/news/world/story/2012/01/16/cruise-ship-monday.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cbn.com.txt b/inc/3rdparty/site_config/standard/cbn.com.txt new file mode 100755 index 00000000..de8d8839 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cbn.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[contains(@class, 'articleText')] | ||
2 | date: //div[contains(@class, 'articleDate')] | ||
3 | author: //a[contains(@id, 'articleDetails_lnkByLine')] | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.cbn.com/cbnnews/world/2013/June/Chilly-G-8-Obama-Putin-Agree-to-Disagree-on-Syria/ | ||
7 | test_url: http://www.cbn.com/cbnnews/world/2013/June/UK-Agency-Accused-of-Hacking-Foreign-Diplomats/ | ||
8 | test_url: http://www.cbn.com/cbnnews/feed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cbsnews.com.txt b/inc/3rdparty/site_config/standard/cbsnews.com.txt index 4ba3da19..04d20230 100644..100755 --- a/inc/3rdparty/site_config/standard/cbsnews.com.txt +++ b/inc/3rdparty/site_config/standard/cbsnews.com.txt | |||
@@ -1,14 +1,15 @@ | |||
1 | date: //meta[@name="published"]/@content | 1 | date: //meta[@name="published"]/@content |
2 | date: //div[@class="timeLine"] | 2 | date: //div[@class="timeLine"] |
3 | title: //div[@id='contentBody']//h1 | 3 | title: //div[@id='contentBody']//h1 |
4 | author: //dl[@class="storyBlogByline"]/dd/a | 4 | author: //dl[@class="storyBlogByline"]/dd/a |
5 | body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')] | 5 | body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')] |
6 | 6 | ||
7 | # Content Pruning | 7 | # Content Pruning |
8 | strip: //div[@class="scrollingArrows"] | 8 | strip: //div[@class="scrollingArrows"] |
9 | strip: //div[@class="timeLine"] | 9 | strip: //div[@class="timeLine"] |
10 | strip: //dl[@class="storyBlogByline"] | 10 | strip: //dl[@class="storyBlogByline"] |
11 | 11 | strip: //span[@class='image-credit'] | |
12 | prune: no | 12 | |
13 | 13 | prune: no | |
14 | test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/ \ No newline at end of file | 14 | |
15 | test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/ | ||
diff --git a/inc/3rdparty/site_config/standard/cedarrepublican.com.txt b/inc/3rdparty/site_config/standard/cedarrepublican.com.txt new file mode 100755 index 00000000..42faa521 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cedarrepublican.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='frame']//img[@class='horizontal'] | //div[@class='content'] | ||
2 | test_url: http://cedarrepublican.com/online_features/gift_ideas/sending-mother-s-day-flowers-how-to-be-sure-they/article_b69af9b8-1f05-5352-8621-16ce007e5623.html | ||
diff --git a/inc/3rdparty/site_config/standard/chareidi.org.txt b/inc/3rdparty/site_config/standard/chareidi.org.txt index de34a7d8..de34a7d8 100644..100755 --- a/inc/3rdparty/site_config/standard/chareidi.org.txt +++ b/inc/3rdparty/site_config/standard/chareidi.org.txt | |||
diff --git a/inc/3rdparty/site_config/standard/chinamining.org.txt b/inc/3rdparty/site_config/standard/chinamining.org.txt index ea0df2a3..d00d65de 100644..100755 --- a/inc/3rdparty/site_config/standard/chinamining.org.txt +++ b/inc/3rdparty/site_config/standard/chinamining.org.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //*[@id='Content']/span[1] | 1 | title: //*[@id='Content']/span[1] |
2 | author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(') | 2 | author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(') |
3 | date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter') | 3 | date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter') |
4 | 4 | ||
5 | strip: //*[@id='Content']/span[1] | 5 | strip: //*[@id='Content']/span[1] |
6 | strip: //*[@id='Content']/span[2] | 6 | strip: //*[@id='Content']/span[2] |
7 | 7 | ||
8 | body: //*[@id='Content'] | 8 | body: //*[@id='Content'] |
9 | 9 | ||
10 | test_url: http://www.chinamining.org/News/2011-07-22/1311319069d48087.html \ No newline at end of file | 10 | test_url: http://www.chinamining.org/News/2011-07-22/1311319069d48087.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/chomsky.info.txt b/inc/3rdparty/site_config/standard/chomsky.info.txt index 1d294109..31440538 100644..100755 --- a/inc/3rdparty/site_config/standard/chomsky.info.txt +++ b/inc/3rdparty/site_config/standard/chomsky.info.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class='title'] | 1 | title: //div[@class='title'] |
2 | author: //div[@class='author'] | 2 | author: //div[@class='author'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://www.chomsky.info/onchomsky/2002----.htm \ No newline at end of file | 5 | test_url: http://www.chomsky.info/onchomsky/2002----.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/chrisltd.com.txt b/inc/3rdparty/site_config/standard/chrisltd.com.txt new file mode 100755 index 00000000..86d0f5db --- /dev/null +++ b/inc/3rdparty/site_config/standard/chrisltd.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //header/h1/b[contains(@class, 'title')] | ||
2 | author: substring-after(//article/header/div, 'By ') | ||
3 | date: //header/h1/span[contains(@class, 'date')] | ||
4 | body: //div[@id='main']/article | ||
5 | strip: //header | ||
6 | test_url: http://chrisltd.com/blog/2012/03/fix-widows-indesign/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/christianitytoday.com.txt b/inc/3rdparty/site_config/standard/christianitytoday.com.txt index 44288a46..86be14ce 100644..100755 --- a/inc/3rdparty/site_config/standard/christianitytoday.com.txt +++ b/inc/3rdparty/site_config/standard/christianitytoday.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title://div[@class='title'] | 1 | title://div[@class='title'] |
2 | author://div[@class='byline']/b | 2 | author://div[@class='byline']/b |
3 | date:substring-after(//div[@class='byline'], 'posted') | 3 | date:substring-after(//div[@class='byline'], 'posted') |
4 | body://div[@id='body'] | 4 | body://div[@id='body'] |
5 | wrap_in(h2)://span[@class='subhead'] | 5 | wrap_in(h2)://span[@class='subhead'] |
6 | wrap_in(i)://p[@class='bio'] | 6 | wrap_in(i)://p[@class='bio'] |
7 | wrap_in(i)://p[@class='copyright'] | 7 | wrap_in(i)://p[@class='copyright'] |
8 | strip://div[@class='title'] | 8 | strip://div[@class='title'] |
9 | strip://div[@class='deck'] | 9 | strip://div[@class='deck'] |
10 | strip://div[@class='byline'] | 10 | strip://div[@class='byline'] |
11 | strip://div[@class='copyright'] | 11 | strip://div[@class='copyright'] |
12 | strip://br | 12 | strip://br |
13 | test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html \ No newline at end of file | 13 | test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/christianpf.com.txt b/inc/3rdparty/site_config/standard/christianpf.com.txt index 7f089c55..fb5f342d 100644..100755 --- a/inc/3rdparty/site_config/standard/christianpf.com.txt +++ b/inc/3rdparty/site_config/standard/christianpf.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h1[@class="entry-title"] | 1 | title: //h1[@class="entry-title"] |
2 | author: //*[@class="author vcard fn"] | 2 | author: //*[@class="author vcard fn"] |
3 | date: //*[@class="published"] | 3 | date: //*[@class="published"] |
4 | body: //div[(@class = "dd_content_wrap")] | 4 | body: //div[(@class = "dd_content_wrap")] |
5 | test_url: http://christianpf.com/do-ibuys-lead-to-more-buying/ \ No newline at end of file | 5 | test_url: http://christianpf.com/do-ibuys-lead-to-more-buying/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/christies.com.txt b/inc/3rdparty/site_config/standard/christies.com.txt index 5c5889a2..b3c76519 100644..100755 --- a/inc/3rdparty/site_config/standard/christies.com.txt +++ b/inc/3rdparty/site_config/standard/christies.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | prune: no | 2 | prune: no |
3 | date: //article//time[@pubdate] | 3 | date: //article//time[@pubdate] |
4 | title: //article/header/h2 | 4 | title: //article/header/h2 |
5 | body: //article | 5 | body: //article |
6 | test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3 \ No newline at end of file | 6 | test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/chrome.google.com.txt b/inc/3rdparty/site_config/standard/chrome.google.com.txt index d4cc8581..5a1d043d 100644..100755 --- a/inc/3rdparty/site_config/standard/chrome.google.com.txt +++ b/inc/3rdparty/site_config/standard/chrome.google.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //pre[@id='cx-desc-text'] | 1 | body: //pre[@id='cx-desc-text'] |
2 | body: //div[contains(@class, 'overview-tab-right-bar-info')] | 2 | body: //div[contains(@class, 'overview-tab-right-bar-info')] |
3 | title: //h1[contains(@class, 'detail-dialog-title')] | 3 | title: //h1[contains(@class, 'detail-dialog-title')] |
4 | tidy: no | 4 | tidy: no |
5 | prune: no | 5 | prune: no |
6 | replace_string(<noscript>): <div> | 6 | replace_string(<noscript>): <div> |
7 | replace_string(</noscript>): </div> | 7 | replace_string(</noscript>): </div> |
8 | 8 | ||
9 | test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc \ No newline at end of file | 9 | test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/chronicle.com.txt b/inc/3rdparty/site_config/standard/chronicle.com.txt index 0c6c11ed..e86d3eca 100644..100755 --- a/inc/3rdparty/site_config/standard/chronicle.com.txt +++ b/inc/3rdparty/site_config/standard/chronicle.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | title: //h1[contains(@class, "entry-title")] | 1 | title: //h1[contains(@class, "entry-title")] |
2 | author: //p[contains(@class, "byline")] | 2 | author: //p[contains(@class, "byline")] |
3 | 3 | ||
4 | # blog articles (chronicle.com/blogs/*) | 4 | # blog articles (chronicle.com/blogs/*) |
5 | body: //div[contains(@class, "abstract")] | 5 | body: //div[contains(@class, "abstract")] |
6 | date: //p[contains(@class, "time")] | 6 | date: //p[contains(@class, "time")] |
7 | 7 | ||
8 | # all (?) other articles | 8 | # all (?) other articles |
9 | body: //div[@id="article-body"] | 9 | body: //div[@id="article-body"] |
10 | date: //p[contains(@class, "dateline")] | 10 | date: //p[contains(@class, "dateline")] |
11 | 11 | ||
12 | # remove sidebars containing images (I assume this is desired for Instapaper) | 12 | # remove sidebars containing images (I assume this is desired for Instapaper) |
13 | strip: //div[@id="related"] | 13 | strip: //div[@id="related"] |
14 | strip: //div[contains(@class, "image")] | 14 | strip: //div[contains(@class, "image")] |
15 | 15 | ||
16 | # note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that with some crap above and below. thank goodness for that bookmarklet | 16 | # note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that with some crap above and below. thank goodness for that bookmarklet |
17 | test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/ \ No newline at end of file | 17 | test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ciaosamin.com.txt b/inc/3rdparty/site_config/standard/ciaosamin.com.txt new file mode 100755 index 00000000..02fd3434 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ciaosamin.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body://div[contains(@class, 'entry-content')] | ||
2 | date://h2[contains(@class, 'date-header')] | ||
3 | title://h3[contains(@class, 'post-title')] | ||
4 | test_url: http://www.ciaosamin.com/2013/04/how-this-happened.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cicero.de.txt b/inc/3rdparty/site_config/standard/cicero.de.txt index b9f9a12b..b8913639 100644..100755 --- a/inc/3rdparty/site_config/standard/cicero.de.txt +++ b/inc/3rdparty/site_config/standard/cicero.de.txt | |||
@@ -1,33 +1,33 @@ | |||
1 | # fforst@... | 1 | # fforst@... |
2 | 2 | ||
3 | # Use link to print article for single page view | 3 | # Use link to print article for single page view |
4 | single_page_link: //a[@class="print"] | 4 | single_page_link: //a[@class="print"] |
5 | 5 | ||
6 | # set body | 6 | # set body |
7 | tidy: no | 7 | tidy: no |
8 | body: //div[@class='artikel-content'] | 8 | body: //div[@class='artikel-content'] |
9 | 9 | ||
10 | # strip title and subtitle since we got it already | 10 | # strip title and subtitle since we got it already |
11 | strip: //div[@class='issue'] | 11 | strip: //div[@class='issue'] |
12 | strip: //div[@class='artikel-content']/h2 | 12 | strip: //div[@class='artikel-content']/h2 |
13 | 13 | ||
14 | # some authors are known and have a link, others don't | 14 | # some authors are known and have a link, others don't |
15 | author: //a[contains(@href, 'autor?')] | 15 | author: //a[contains(@href, 'autor?')] |
16 | 16 | ||
17 | #date | 17 | #date |
18 | date: //span[@class='article-date'] | 18 | date: //span[@class='article-date'] |
19 | 19 | ||
20 | # Strip author since we got him | 20 | # Strip author since we got him |
21 | strip_id_or_class: author | 21 | strip_id_or_class: author |
22 | 22 | ||
23 | #strip captions | 23 | #strip captions |
24 | strip_id_or_class: field-name-field-image-credit | 24 | strip_id_or_class: field-name-field-image-credit |
25 | strip_id_or_class: field-name-field-article-image-subtitle | 25 | strip_id_or_class: field-name-field-article-image-subtitle |
26 | 26 | ||
27 | # remove community functions | 27 | # remove community functions |
28 | strip: //div[@class='meta'] | 28 | strip: //div[@class='meta'] |
29 | strip: //div[@id='comments'] | 29 | strip: //div[@id='comments'] |
30 | 30 | ||
31 | # remove "continue on the next page" text | 31 | # remove "continue on the next page" text |
32 | strip: //p[text()="[SEITE]"] | 32 | strip: //p[text()="[SEITE]"] |
33 | test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049 \ No newline at end of file | 33 | test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ciperchile.cl.txt b/inc/3rdparty/site_config/standard/ciperchile.cl.txt index 4d3ac804..d7e9b762 100644..100755 --- a/inc/3rdparty/site_config/standard/ciperchile.cl.txt +++ b/inc/3rdparty/site_config/standard/ciperchile.cl.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //*[(@id = "articlebody")] | 1 | body: //*[(@id = "articlebody")] |
2 | strip_id_or_class: rotulo | 2 | strip_id_or_class: rotulo |
3 | 3 | ||
4 | test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/ \ No newline at end of file | 4 | test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cjr.org.txt b/inc/3rdparty/site_config/standard/cjr.org.txt index a0c3ea5d..df4c7cc4 100644..100755 --- a/inc/3rdparty/site_config/standard/cjr.org.txt +++ b/inc/3rdparty/site_config/standard/cjr.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body'] | 1 | body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body'] |
2 | prune: no | 2 | prune: no |
3 | 3 | ||
4 | single_page_link: //li[@class='print']/a | 4 | single_page_link: //li[@class='print']/a |
5 | 5 | ||
6 | test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php \ No newline at end of file | 6 | test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/classyllama.com.txt b/inc/3rdparty/site_config/standard/classyllama.com.txt new file mode 100755 index 00000000..1864eee8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/classyllama.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | date: //div[@id='content']//p[contains(@class, 'date')]/span | ||
2 | author: substring-after(//div[@id='content']//div[contains(@class, 'over-under-bars')]/p[last()]/text(), 'Posted by ') | ||
3 | body: //div[@id='content']//div[@class='pane-content'] | ||
4 | strip_id_or_class: trackback-url | ||
5 | strip_id_or_class: over-under-bars | ||
6 | test_url: http://www.classyllama.com/content/layout-caching \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/clientk.com.txt b/inc/3rdparty/site_config/standard/clientk.com.txt index 369e88ad..d5a22ccb 100644..100755 --- a/inc/3rdparty/site_config/standard/clientk.com.txt +++ b/inc/3rdparty/site_config/standard/clientk.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title://div[@class="entrytitle"]/a | 1 | title://div[@class="entrytitle"]/a |
2 | author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ") | 2 | author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ") |
3 | date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted") | 3 | date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted") |
4 | body://div[@class="entrybody"] | 4 | body://div[@class="entrybody"] |
5 | strip://div[@class="entrybody"]//p[@class="singleinfo"] | 5 | strip://div[@class="entrybody"]//p[@class="singleinfo"] |
6 | test_url: http://clientk.com/2011/12/19/the-impact-of-more/ \ No newline at end of file | 6 | test_url: http://clientk.com/2011/12/19/the-impact-of-more/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/clubic.com.txt b/inc/3rdparty/site_config/standard/clubic.com.txt index b356bbdf..0148e54c 100644..100755 --- a/inc/3rdparty/site_config/standard/clubic.com.txt +++ b/inc/3rdparty/site_config/standard/clubic.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //a[@class='auteur'] | 2 | author: //a[@class='auteur'] |
3 | body: //div[@class='editorial'] | 3 | body: //div[@class='editorial'] |
4 | next_page_link: //a[contains(text(),'Page suivante')] | 4 | next_page_link: //a[contains(text(),'Page suivante')] |
5 | strip: //a[contains(text(),'Page suivante')] | 5 | strip: //a[contains(text(),'Page suivante')] |
6 | strip: //a[contains(text(),'Page précédente')] | 6 | strip: //a[contains(text(),'Page précédente')] |
7 | strip_id_or_class: slideshow | 7 | strip_id_or_class: slideshow |
8 | 8 | ||
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html \ No newline at end of file | 11 | test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cmswire.com.txt b/inc/3rdparty/site_config/standard/cmswire.com.txt index 2bc96d2e..0b76377a 100644..100755 --- a/inc/3rdparty/site_config/standard/cmswire.com.txt +++ b/inc/3rdparty/site_config/standard/cmswire.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[contains(@id,'article-body')] | 1 | body: //div[contains(@id,'article-body')] |
2 | strip://div[contains(@id,'disqus_count_block')] | 2 | strip://div[contains(@id,'disqus_count_block')] |
3 | strip://div[contains(@id,'col-left')] | 3 | strip://div[contains(@id,'col-left')] |
4 | strip://div[contains(@id,'col-right')] | 4 | strip://div[contains(@id,'col-right')] |
5 | 5 | ||
6 | test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php \ No newline at end of file | 6 | test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cn.engadget.com.txt b/inc/3rdparty/site_config/standard/cn.engadget.com.txt new file mode 100755 index 00000000..63f6f7ea --- /dev/null +++ b/inc/3rdparty/site_config/standard/cn.engadget.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h2[@class="posttitle"] | ||
2 | body: //div[@class="postbody"] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://cn.engadget.com/2013/06/29/google-play-music-all-access/ | ||
diff --git a/inc/3rdparty/site_config/standard/cn.reuters.com.txt b/inc/3rdparty/site_config/standard/cn.reuters.com.txt new file mode 100755 index 00000000..b3878662 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cn.reuters.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@id='maincontent']//h1 | ||
2 | body: //div[@id='resizeableText'] | ||
3 | |||
4 | test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710 | ||
5 | test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cnet.com.txt b/inc/3rdparty/site_config/standard/cnet.com.txt index 74f46ba9..eac08aaa 100644..100755 --- a/inc/3rdparty/site_config/standard/cnet.com.txt +++ b/inc/3rdparty/site_config/standard/cnet.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //div[contains(@class, 'postBody')] | 2 | body: //div[contains(@class, 'postBody')] |
3 | date: //div[@id='nameAndTime']/time | 3 | date: //div[@id='nameAndTime']/time |
4 | author: //div[@id='nameAndTime']/span[@class='author'] | 4 | author: //div[@id='nameAndTime']/span[@class='author'] |
5 | 5 | ||
6 | strip_id_or_class: image-credit | 6 | strip_id_or_class: image-credit |
7 | strip_id_or_class: noAutolink | 7 | strip_id_or_class: noAutolink |
8 | strip_id_or_class: related | 8 | strip_id_or_class: related |
9 | 9 | ||
10 | prune: no | 10 | prune: no |
11 | tidy: no | 11 | tidy: no |
12 | 12 | ||
13 | # early end | 13 | # early end |
14 | replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html> | 14 | replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html> |
15 | 15 | ||
16 | test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/ \ No newline at end of file | 16 | test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cnn.com.txt b/inc/3rdparty/site_config/standard/cnn.com.txt index 995e2c79..6f69e4e8 100644..100755 --- a/inc/3rdparty/site_config/standard/cnn.com.txt +++ b/inc/3rdparty/site_config/standard/cnn.com.txt | |||
@@ -1,19 +1,23 @@ | |||
1 | title: //div[@class="cnn_storyarea"]/h1 | 1 | body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')] |
2 | author: //div[@class="cnnByline"]/strong | 2 | title: //div[@class="cnn_storyarea"]/h1 |
3 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun') | 3 | author: //div[@class="cnnByline"]/strong |
4 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon') | 4 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun') |
5 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue') | 5 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon') |
6 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed') | 6 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue') |
7 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu') | 7 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed') |
8 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri') | 8 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu') |
9 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat') | 9 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri') |
10 | strip: //div[@class="cnn_storyarea"]/h1 | 10 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat') |
11 | strip_id_or_class: cnnByline | 11 | strip: //div[@class="cnn_storyarea"]/h1 |
12 | strip_id_or_class: cnn_strytmstmp | 12 | strip_id_or_class: cnnByline |
13 | strip_id_or_class: cnn_strycaptiontxt | 13 | strip_id_or_class: cnn_strytmstmp |
14 | strip_id_or_class: cnn_strybtntoolsbttm | 14 | strip_id_or_class: cnn_strycaptiontxt |
15 | strip_id_or_class: cnn_strybtntools | 15 | strip_id_or_class: cnn_strybtntoolsbttm |
16 | strip_id_or_class: cnn_strybtmcntnt | 16 | strip_id_or_class: cnn_strybtntools |
17 | strip_id_or_class: cnn_containerwht | 17 | strip_id_or_class: cnn_strybtmcntnt |
18 | strip_id_or_class: cnn_stryathrtmp | 18 | strip_id_or_class: sharebar |
19 | test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories \ No newline at end of file | 19 | #strip_id_or_class: cnn_containerwht |
20 | strip_id_or_class: cnn_stryathrtmp | ||
21 | replace_string(<a name="em0"></a>): <!-- a name --> | ||
22 | test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories | ||
23 | test_url: http://rss.cnn.com/rss/edition.rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cnnsi.com.txt b/inc/3rdparty/site_config/standard/cnnsi.com.txt index 6a2c2b80..ac49aef9 100644..100755 --- a/inc/3rdparty/site_config/standard/cnnsi.com.txt +++ b/inc/3rdparty/site_config/standard/cnnsi.com.txt | |||
@@ -1,26 +1,26 @@ | |||
1 | # main sportsillustrated.com articles | 1 | # main sportsillustrated.com articles |
2 | 2 | ||
3 | body: //div[@id="cnnStoryContent"] | 3 | body: //div[@id="cnnStoryContent"] |
4 | title: //div[@id="cnnStoryHeadline"]//h1 | 4 | title: //div[@id="cnnStoryHeadline"]//h1 |
5 | author: //div[@id="cnnSubBanner"]//strong | 5 | author: //div[@id="cnnSubBanner"]//strong |
6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") | 6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") |
7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") | 7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") |
8 | 8 | ||
9 | # kill ugly font buttons | 9 | # kill ugly font buttons |
10 | strip: //div[@id="cnnSCFontButtons"] | 10 | strip: //div[@id="cnnSCFontButtons"] |
11 | 11 | ||
12 | # kill misc filler videos & etc | 12 | # kill misc filler videos & etc |
13 | strip: //div[@class="cnnDivideContent"] | 13 | strip: //div[@class="cnnDivideContent"] |
14 | strip: //*[@class="cnnTMbox"] | 14 | strip: //*[@class="cnnTMbox"] |
15 | 15 | ||
16 | # si vault articles | 16 | # si vault articles |
17 | # ------------- | 17 | # ------------- |
18 | body: //div[@class="siv_artPara"] | 18 | body: //div[@class="siv_artPara"] |
19 | title: //div[@class="siv_artHeader"]//h1 | 19 | title: //div[@class="siv_artHeader"]//h1 |
20 | author: //div[@class="byline"] | 20 | author: //div[@class="byline"] |
21 | date: //div[@class="date"] | 21 | date: //div[@class="date"] |
22 | 22 | ||
23 | next_page_link: //div[@id='cnnStoryContinue']/a | 23 | next_page_link: //div[@id='cnnStoryContinue']/a |
24 | strip_id_or_class: cnnstorypagination | 24 | strip_id_or_class: cnnstorypagination |
25 | 25 | ||
26 | test_url: http://cnnsi.com/2012/writers/peter_king/01/08/wild.card.round/index.html \ No newline at end of file | 26 | test_url: http://cnnsi.com/2012/writers/peter_king/01/08/wild.card.round/index.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/code.activestate.com.txt b/inc/3rdparty/site_config/standard/code.activestate.com.txt index 6cf72e23..83a21e19 100644..100755 --- a/inc/3rdparty/site_config/standard/code.activestate.com.txt +++ b/inc/3rdparty/site_config/standard/code.activestate.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | title: //div[@id='page_header']/h1 | 2 | title: //div[@id='page_header']/h1 |
3 | 3 | ||
4 | strip_id_or_class: 'lineno' | 4 | strip_id_or_class: 'lineno' |
5 | strip_id_or_class: 'block-toolbar-button' | 5 | strip_id_or_class: 'block-toolbar-button' |
6 | strip_id_or_class: 'recipe_score' | 6 | strip_id_or_class: 'recipe_score' |
7 | strip: //div[@id='recipe_tools'] | 7 | strip: //div[@id='recipe_tools'] |
8 | strip: //div[@id='addcomment'] | 8 | strip: //div[@id='addcomment'] |
9 | 9 | ||
10 | test_url: http://code.activestate.com/recipes/500261-named-tuples/ \ No newline at end of file | 10 | test_url: http://code.activestate.com/recipes/500261-named-tuples/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/code.fivefilters.org.txt b/inc/3rdparty/site_config/standard/code.fivefilters.org.txt new file mode 100755 index 00000000..269fb547 --- /dev/null +++ b/inc/3rdparty/site_config/standard/code.fivefilters.org.txt | |||
@@ -0,0 +1 @@ | |||
body: //div[@id='content'] | |||
diff --git a/inc/3rdparty/site_config/standard/code.google.com.txt b/inc/3rdparty/site_config/standard/code.google.com.txt index 40a16209..6e9c00a7 100644..100755 --- a/inc/3rdparty/site_config/standard/code.google.com.txt +++ b/inc/3rdparty/site_config/standard/code.google.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@id="gc-pagecontent"] | 1 | body: //div[@id="gc-pagecontent"] |
2 | strip: //a[@class="backtotop"] | 2 | strip: //a[@class="backtotop"] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html \ No newline at end of file | 5 | test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/codeproject.com.txt b/inc/3rdparty/site_config/standard/codeproject.com.txt new file mode 100755 index 00000000..d1191acc --- /dev/null +++ b/inc/3rdparty/site_config/standard/codeproject.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id="contentdiv"] | ||
2 | date: //span[@class="date"] | ||
3 | test_url: http://www.codeproject.com/Articles/499902/Profiling-Entity-Framework-5-in-code \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/codinghorror.com.txt b/inc/3rdparty/site_config/standard/codinghorror.com.txt index 9c95f107..adf6e5a0 100644..100755 --- a/inc/3rdparty/site_config/standard/codinghorror.com.txt +++ b/inc/3rdparty/site_config/standard/codinghorror.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body: //div[@class='blogbody'] | 1 | body: //div[@class='blogbody'] |
2 | strip: //h3[@class='title'] | 2 | strip: //h3[@class='title'] |
3 | date: //h2[@class='date'] | 3 | date: //h2[@class='date'] |
4 | #Should Atwood just be a literal? | 4 | #Should Atwood just be a literal? |
5 | author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V') | 5 | author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V') |
6 | 6 | ||
7 | # tim.kingman@... 2011-07-26 | 7 | # tim.kingman@... 2011-07-26 |
8 | # Prune:no to retain all-link ULs that are part of the body content like | 8 | # Prune:no to retain all-link ULs that are part of the body content like |
9 | # http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html | 9 | # http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html |
10 | # Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed. | 10 | # Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed. |
11 | 11 | ||
12 | prune: no | 12 | prune: no |
13 | strip: //div[@class='posted']/following-sibling::* | 13 | strip: //div[@class='posted']/following-sibling::* |
14 | strip: //div[@class='posted'] | 14 | strip: //div[@class='posted'] |
15 | test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html \ No newline at end of file | 15 | test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/collegehumor.com.txt b/inc/3rdparty/site_config/standard/collegehumor.com.txt index 9d75d641..318e6ff4 100644..100755 --- a/inc/3rdparty/site_config/standard/collegehumor.com.txt +++ b/inc/3rdparty/site_config/standard/collegehumor.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //h1[@class='title'] | 1 | title: //h1[@class='title'] |
2 | author: //p[@class='byline']/a[1] | 2 | author: //p[@class='byline']/a[1] |
3 | date: //*[@class='date'] | 3 | date: //*[@class='date'] |
4 | 4 | ||
5 | body: //div[@class='article_body'] | 5 | body: //div[@class='article_body'] |
6 | strip: //p[@class='ca_intro'] | 6 | strip: //p[@class='ca_intro'] |
7 | strip: //div[@id='action_bar'] | 7 | strip: //div[@id='action_bar'] |
8 | strip: //div[@class='below_content'] | 8 | strip: //div[@class='below_content'] |
9 | strip: //div[@id='announcement'] | 9 | strip: //div[@id='announcement'] |
10 | strip: //div[@id='leftovers'] | 10 | strip: //div[@id='leftovers'] |
11 | strip: //div[@class='form'] | 11 | strip: //div[@class='form'] |
12 | strip: //div[@id='email_overlay'] | 12 | strip: //div[@id='email_overlay'] |
13 | strip: //a[@class='close'] | 13 | strip: //a[@class='close'] |
14 | test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie \ No newline at end of file | 14 | test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt b/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt index 800a907d..800a907d 100644..100755 --- a/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt +++ b/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/community.service-now.com.txt b/inc/3rdparty/site_config/standard/community.service-now.com.txt index 10fd2516..c9854b43 100644..100755 --- a/inc/3rdparty/site_config/standard/community.service-now.com.txt +++ b/inc/3rdparty/site_config/standard/community.service-now.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | body: //div[@id="center"]//div[@class="node"] | 1 | body: //div[@id="center"]//div[@class="node"] |
2 | title: //div[@id="center"]//h2 | 2 | title: //div[@id="center"]//h2 |
3 | author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") | 3 | author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") |
4 | date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") | 4 | date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") |
5 | strip: //div[@id="center"]//h2[1] | 5 | strip: //div[@id="center"]//h2[1] |
6 | strip: //span[@class="submitted"][1] | 6 | strip: //span[@class="submitted"][1] |
7 | move_into(//div[@class="node"])://div[@class="breadcrumb"] | 7 | move_into(//div[@class="node"])://div[@class="breadcrumb"] |
8 | test_url: http://community.service-now.com/blog/lawrenceeng/seasons-greetings-servicenow-team \ No newline at end of file | 8 | test_url: http://community.service-now.com/blog/lawrenceeng/seasons-greetings-servicenow-team \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/computer.org.txt b/inc/3rdparty/site_config/standard/computer.org.txt index 00e6fddf..8345cf50 100644..100755 --- a/inc/3rdparty/site_config/standard/computer.org.txt +++ b/inc/3rdparty/site_config/standard/computer.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | strip_id_or_class:column-3 | 1 | strip_id_or_class:column-3 |
2 | strip_id_or_class:portlet-boundary | 2 | strip_id_or_class:portlet-boundary |
3 | strip_id_or_class:banner | 3 | strip_id_or_class:banner |
4 | 4 | ||
5 | test_url: http://www.computer.org/portal/web/buildyourcareer/careerwatch/jt19 \ No newline at end of file | 5 | test_url: http://www.computer.org/portal/web/buildyourcareer/careerwatch/jt19 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/computerbase.de.txt b/inc/3rdparty/site_config/standard/computerbase.de.txt index 29199242..5973c50b 100644..100755 --- a/inc/3rdparty/site_config/standard/computerbase.de.txt +++ b/inc/3rdparty/site_config/standard/computerbase.de.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | 2 | ||
3 | author://div[@id="news-meta"]/a | 3 | author://div[@id="news-meta"]/a |
4 | 4 | ||
5 | body://*[@id="main"]/div[1] | 5 | body://*[@id="main"]/div[1] |
6 | 6 | ||
7 | strip://*[@id="main"]/div[2] | 7 | strip://*[@id="main"]/div[2] |
8 | strip://*[@id="main"]/div[3] | 8 | strip://*[@id="main"]/div[3] |
9 | strip://*[@id="page"]//footer | 9 | strip://*[@id="page"]//footer |
10 | 10 | ||
11 | #date: didn't manage to parse it | 11 | #date: didn't manage to parse it |
12 | 12 | ||
13 | #Images have to be stripped because the page does it with overlay | 13 | #Images have to be stripped because the page does it with overlay |
14 | strip://img | 14 | strip://img |
15 | 15 | ||
16 | #figures are not displayed in instapaper... | 16 | #figures are not displayed in instapaper... |
17 | strip://figure | //figcaption | 17 | strip://figure | //figcaption |
18 | test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/ \ No newline at end of file | 18 | test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/computerworld.com.txt b/inc/3rdparty/site_config/standard/computerworld.com.txt index 8e1f3e11..7f20a4da 100644..100755 --- a/inc/3rdparty/site_config/standard/computerworld.com.txt +++ b/inc/3rdparty/site_config/standard/computerworld.com.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | title: //meta[@name='headline']/@content | 1 | title: //meta[@name='headline']/@content |
2 | date: //meta[@name='date']/@content | 2 | date: //meta[@name='date']/@content |
3 | author: //meta[@name='author']/@content | 3 | author: //meta[@name='author']/@content |
4 | body: //div[contains(@class, 'article')] | 4 | body: //div[contains(@class, 'article')] |
5 | body://div[@id="article_body"] | 5 | body://div[@id="article_body"] |
6 | 6 | ||
7 | strip_id_or_class: banner | 7 | strip_id_or_class: banner |
8 | strip: //noscript | 8 | strip: //noscript |
9 | strip: //div[@style='width:1px;height:130px;float:right;'] | 9 | strip: //div[@style='width:1px;height:130px;float:right;'] |
10 | strip: //div[@class='storyby'] | 10 | strip: //div[@class='storyby'] |
11 | strip_image_src: twitter_icon | 11 | strip_image_src: twitter_icon |
12 | strip_image_src: rss_bug | 12 | strip_image_src: rss_bug |
13 | 13 | ||
14 | tidy: no | 14 | tidy: no |
15 | prune: no | 15 | prune: no |
16 | 16 | ||
17 | next_page_link://div[@id="next_page"]/a | 17 | next_page_link://div[@id="next_page"]/a |
18 | 18 | ||
19 | single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/')) | 19 | single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/')) |
20 | 20 | ||
21 | test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware | 21 | test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware |
22 | test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy \ No newline at end of file | 22 | test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/computerworld.dk.txt b/inc/3rdparty/site_config/standard/computerworld.dk.txt index a83f366f..d819109c 100644..100755 --- a/inc/3rdparty/site_config/standard/computerworld.dk.txt +++ b/inc/3rdparty/site_config/standard/computerworld.dk.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | strip: //div[contains(@class, 'articleAdtechAd')] | 1 | strip: //div[contains(@class, 'articleAdtechAd')] |
2 | title: //div[@id='article']/h1 | 2 | title: //div[@id='article']/h1 |
3 | title: //div[contains(@class, 'article')]/h1 | 3 | title: //div[contains(@class, 'article')]/h1 |
4 | body: //div[@id='articleText'] | 4 | body: //div[@id='articleText'] |
5 | test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0 \ No newline at end of file | 5 | test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/contemporist.com.txt b/inc/3rdparty/site_config/standard/contemporist.com.txt index d2b289a3..c3120fe8 100644..100755 --- a/inc/3rdparty/site_config/standard/contemporist.com.txt +++ b/inc/3rdparty/site_config/standard/contemporist.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | # get author from string like "Posted by <author> on <date>" | 1 | # get author from string like "Posted by <author> on <date>" |
2 | author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on') | 2 | author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on') |
3 | 3 | ||
4 | # get date from string like "Posted by <author> on <date>" | 4 | # get date from string like "Posted by <author> on <date>" |
5 | date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on') | 5 | date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on') |
6 | 6 | ||
7 | # this keeps thumbnail images | 7 | # this keeps thumbnail images |
8 | prune: no | 8 | prune: no |
9 | test_url: http://www.contemporist.com/2011/11/02/landing-200-lamp-by-kim-hyunjoo \ No newline at end of file | 9 | test_url: http://www.contemporist.com/2011/11/02/landing-200-lamp-by-kim-hyunjoo \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt b/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt index 9bad2c84..966cc861 100644..100755 --- a/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt +++ b/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='article_header']/h1 | 1 | title: //div[@class='article_header']/h1 |
2 | body: //div[@class='article_header']/p | //div[@class='article_body'] | 2 | body: //div[@class='article_header']/p | //div[@class='article_body'] |
3 | strip_id_or_class: share_this | 3 | strip_id_or_class: share_this |
4 | strip_id_or_class: sociable | 4 | strip_id_or_class: sociable |
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://conversaciones.nokia.com/2011/10/07/cinco-atajos-en-el-nokia-n8/ \ No newline at end of file | 7 | test_url: http://conversaciones.nokia.com/2011/10/07/cinco-atajos-en-el-nokia-n8/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/cooper.com.txt b/inc/3rdparty/site_config/standard/cooper.com.txt new file mode 100755 index 00000000..a4244097 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cooper.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //*[contains(@class,'body')] | ||
2 | date: //abbr[@class='published'] | ||
3 | |||
4 | test_url: http://www.cooper.com/journal/2012/08/2-weeks-left-to-win-your-way-to-the-woodstock-of-ux-coopers-ux-boot-camp.html/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/core77.com.txt b/inc/3rdparty/site_config/standard/core77.com.txt index a24374d8..cf1fa93c 100644..100755 --- a/inc/3rdparty/site_config/standard/core77.com.txt +++ b/inc/3rdparty/site_config/standard/core77.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id="permalink"]/div[@class="post"] | 1 | body: //div[@id="permalink"]/div[@class="post"] |
2 | 2 | ||
3 | strip: //div[@id='backArrow'] | 3 | strip: //div[@id='backArrow'] |
4 | strip: //div[@id='fwdArrow'] | 4 | strip: //div[@id='fwdArrow'] |
5 | strip: //div[@class="post-title"] | 5 | strip: //div[@class="post-title"] |
6 | strip: //div[@class="sharing"] | 6 | strip: //div[@class="sharing"] |
7 | test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp \ No newline at end of file | 7 | test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/counterpunch.org.txt b/inc/3rdparty/site_config/standard/counterpunch.org.txt index c9e92287..b6bd8be5 100644..100755 --- a/inc/3rdparty/site_config/standard/counterpunch.org.txt +++ b/inc/3rdparty/site_config/standard/counterpunch.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='main']//h1[contains(@class, 'article-title')] | 1 | title: //div[@class='main']//h1[contains(@class, 'article-title')] |
2 | author: //div[@class='mainauthorstyle'] | 2 | author: //div[@class='mainauthorstyle'] |
3 | body: //div[@class='main']//div[@class='main-text'] | 3 | body: //div[@class='main']//div[@class='main-text'] |
4 | strip: //td[@width='140'] | 4 | strip: //td[@width='140'] |
5 | 5 | ||
6 | test_url: http://www.counterpunch.org/johnstone05172011.html \ No newline at end of file | 6 | test_url: http://www.counterpunch.org/johnstone05172011.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/crazybutable.com.txt b/inc/3rdparty/site_config/standard/crazybutable.com.txt index d25cd05d..037cd177 100644..100755 --- a/inc/3rdparty/site_config/standard/crazybutable.com.txt +++ b/inc/3rdparty/site_config/standard/crazybutable.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title://h2 | 1 | title://h2 |
2 | body://div[contains(@class, 'entrytext')] | 2 | body://div[contains(@class, 'entrytext')] |
3 | test_url: http://www.crazybutable.com/weblog/archives/2010/07/01/house-ideas-that-worked/ \ No newline at end of file | 3 | test_url: http://www.crazybutable.com/weblog/archives/2010/07/01/house-ideas-that-worked/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/crimemagazine.com.txt b/inc/3rdparty/site_config/standard/crimemagazine.com.txt index 9cf0bccc..9cf0bccc 100644..100755 --- a/inc/3rdparty/site_config/standard/crimemagazine.com.txt +++ b/inc/3rdparty/site_config/standard/crimemagazine.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/crimethinc.com.txt b/inc/3rdparty/site_config/standard/crimethinc.com.txt index 74bc6db9..b5a8018a 100644..100755 --- a/inc/3rdparty/site_config/standard/crimethinc.com.txt +++ b/inc/3rdparty/site_config/standard/crimethinc.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class="readingtext"] | 1 | body: //div[@class="readingtext"] |
2 | title: substring-after(substring-after(//title, ':'), ':') | 2 | title: substring-after(substring-after(//title, ':'), ':') |
3 | test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php \ No newline at end of file | 3 | test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/crn.de.txt b/inc/3rdparty/site_config/standard/crn.de.txt index 7fa950af..61d5d6a7 100644..100755 --- a/inc/3rdparty/site_config/standard/crn.de.txt +++ b/inc/3rdparty/site_config/standard/crn.de.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | author: //p[contains(@class,'author')]/a | 1 | author: //p[contains(@class,'author')]/a |
2 | date: //div[contains(@class,'date')] | 2 | date: //div[contains(@class,'date')] |
3 | test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html \ No newline at end of file | 3 | test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/csmonitor.com.txt b/inc/3rdparty/site_config/standard/csmonitor.com.txt index d4dbc5c8..b482e34e 100644..100755 --- a/inc/3rdparty/site_config/standard/csmonitor.com.txt +++ b/inc/3rdparty/site_config/standard/csmonitor.com.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | title: //h1[contains(@class, 'head')] | 1 | title: //h1[contains(@class, 'head')] |
2 | 2 | ||
3 | # standard page | 3 | # standard page |
4 | body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')] | 4 | body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')] |
5 | # print page | 5 | # print page |
6 | body: //div[@id='mainColumn'] | 6 | body: //div[@id='mainColumn'] |
7 | 7 | ||
8 | author: //a[contains(@class, 'ui-author')] | 8 | author: //a[contains(@class, 'ui-author')] |
9 | 9 | ||
10 | single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')] | 10 | single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')] |
11 | 11 | ||
12 | strip_id_or_class: storyToolbar | 12 | strip_id_or_class: storyToolbar |
13 | strip_id_or_class: promotion-tag | 13 | strip_id_or_class: promotion-tag |
14 | 14 | ||
15 | tidy: no | 15 | tidy: no |
16 | prune: no | 16 | prune: no |
17 | 17 | ||
18 | test_url: www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84 \ No newline at end of file | 18 | test_url: www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/csnbayarea.com.txt b/inc/3rdparty/site_config/standard/csnbayarea.com.txt index 131a923b..1da60b4e 100644..100755 --- a/inc/3rdparty/site_config/standard/csnbayarea.com.txt +++ b/inc/3rdparty/site_config/standard/csnbayarea.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@id='csn_blogST_headline']/h1 | 1 | title: //div[@id='csn_blogST_headline']/h1 |
2 | 2 | ||
3 | body: //div[@id='csn_blogST_main'] | 3 | body: //div[@id='csn_blogST_main'] |
4 | strip_id_or_class: ipfootnotes | 4 | strip_id_or_class: ipfootnotes |
5 | strip: //div[@id='csn_blogST_main']/p[1]/img | 5 | strip: //div[@id='csn_blogST_main']/p[1]/img |
6 | strip: //div[@id='csn_blogST_sidebar'] | 6 | strip: //div[@id='csn_blogST_sidebar'] |
7 | test_url: http://www.csnbayarea.com/blog/giants-talk/post/-?blog%2Fgiants-talk%2Fpost%2F-=&blockID=578902&feedID=5987 \ No newline at end of file | 7 | test_url: http://www.csnbayarea.com/blog/giants-talk/post/-?blog%2Fgiants-talk%2Fpost%2F-=&blockID=578902&feedID=5987 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/csnphilly.com.txt b/inc/3rdparty/site_config/standard/csnphilly.com.txt index 0df72c32..c14a934a 100644..100755 --- a/inc/3rdparty/site_config/standard/csnphilly.com.txt +++ b/inc/3rdparty/site_config/standard/csnphilly.com.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | # author's name is not isolated as a tag.... ugh | 1 | # author's name is not isolated as a tag.... ugh |
2 | convert_double_br_tags: yes | 2 | convert_double_br_tags: yes |
3 | body: //csn_blogST_main | 3 | body: //csn_blogST_main |
4 | 4 | ||
5 | #junk above and around the article | 5 | #junk above and around the article |
6 | strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div | 6 | strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div |
7 | strip: /html/body/div[4]/header | 7 | strip: /html/body/div[4]/header |
8 | strip_id_or_class: article-right-sidebar | 8 | strip_id_or_class: article-right-sidebar |
9 | strip_id_or_class: rsn-gigya-sharebar-container | 9 | strip_id_or_class: rsn-gigya-sharebar-container |
10 | strip_id_or_class: article-bottom | 10 | strip_id_or_class: article-bottom |
11 | strip_id_or_class: hider | 11 | strip_id_or_class: hider |
12 | strip_id_or_class: footer | 12 | strip_id_or_class: footer |
13 | strip_id_or_class: masthead | 13 | strip_id_or_class: masthead |
14 | strip_id_or_class: block-menu-menu-rsn-login-or-register | 14 | strip_id_or_class: block-menu-menu-rsn-login-or-register |
15 | strip_id_or_class: block-menu-menu-header-links | 15 | strip_id_or_class: block-menu-menu-header-links |
16 | strip_id_or_class: block-rsn-follow-bar-follow-bar | 16 | strip_id_or_class: block-rsn-follow-bar-follow-bar |
17 | strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard | 17 | strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard |
18 | strip_id_or_class: logo | 18 | strip_id_or_class: logo |
19 | strip_id_or_class: element-invisible | 19 | strip_id_or_class: element-invisible |
20 | strip_id_or_class: site-name | 20 | strip_id_or_class: site-name |
21 | strip: //div[contains(@style, 'none')] | 21 | strip: //div[contains(@style, 'none')] |
22 | test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career \ No newline at end of file | 22 | test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/css-tricks.com.txt b/inc/3rdparty/site_config/standard/css-tricks.com.txt new file mode 100755 index 00000000..3d8174aa --- /dev/null +++ b/inc/3rdparty/site_config/standard/css-tricks.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title://article[contains(@id, "post-")]/h1 | ||
2 | date://article[contains(@id, "post-")]/p[@class="time"]/time | ||
3 | body://article[contains(@id, "post-")] | ||
4 | strip://article[contains(@id, "post-")]/p[@class="time"]/time | ||
5 | prune:yes | ||
6 | test_url: http://css-tricks.com/off-canvas-menu-with-css-target/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cucharasonica.com.txt b/inc/3rdparty/site_config/standard/cucharasonica.com.txt index e691fe83..e691fe83 100644..100755 --- a/inc/3rdparty/site_config/standard/cucharasonica.com.txt +++ b/inc/3rdparty/site_config/standard/cucharasonica.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/cw.com.tw.txt b/inc/3rdparty/site_config/standard/cw.com.tw.txt new file mode 100755 index 00000000..6e3a91ee --- /dev/null +++ b/inc/3rdparty/site_config/standard/cw.com.tw.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | author://span[contains(@class,'reporter')] | ||
2 | |||
3 | date://span[contains(@class,'date')] | ||
4 | |||
5 | body://div[contains(@class,'mainContaner')] | ||
6 | |||
7 | strip://div[contains(@class,'mainHeaer')] | ||
8 | strip://div[contains(@class,'keyW')] | ||
9 | strip://div[contains(@class,'wonderful')] | ||
10 | strip://div[contains(@class,'pages')] | ||
11 | strip://div[contains(@class,'Topics TopicsW3')] | ||
12 | |||
13 | next_page_link://li[@class='pageNext']/a[contains(.,'下一頁')] | ||
14 | test_url: http://www.cw.com.tw/article/article.action?id=5032848 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/da.feedsportal.com.txt b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt index 4a00ef44..381446e5 100644..100755 --- a/inc/3rdparty/site_config/standard/da.feedsportal.com.txt +++ b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | single_page_link: //a | 1 | single_page_link: //a |
2 | tidy: no | 2 | tidy: no |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm \ No newline at end of file | 5 | test_url: da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dagogtid.no.txt b/inc/3rdparty/site_config/standard/dagogtid.no.txt new file mode 100755 index 00000000..1531472c --- /dev/null +++ b/inc/3rdparty/site_config/standard/dagogtid.no.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //span[@class = 'overskriftEkstrastor'] | ||
2 | author: //em/a | ||
3 | |||
4 | test_url: http://dagogtid.no/nyhet.cfm?nyhetid=2414 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dailydot.com.txt b/inc/3rdparty/site_config/standard/dailydot.com.txt index 61013993..978ed1ce 100644..100755 --- a/inc/3rdparty/site_config/standard/dailydot.com.txt +++ b/inc/3rdparty/site_config/standard/dailydot.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | body: //article | 2 | body: //article |
3 | 3 | ||
4 | test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/ \ No newline at end of file | 4 | test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dailykos.com.txt b/inc/3rdparty/site_config/standard/dailykos.com.txt index 124675cb..6d4cb82a 100644..100755 --- a/inc/3rdparty/site_config/standard/dailykos.com.txt +++ b/inc/3rdparty/site_config/standard/dailykos.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@id='article-1']//div[contains(@class, 'article-body')] | 1 | body: //div[@id='article-1']//div[contains(@class, 'article-body')] |
2 | title: //div[@class='meta']//a[@id='titleHref'] | 2 | title: //div[@class='meta']//a[@id='titleHref'] |
3 | date: //div[@class='meta']//p[@class='date'] | 3 | date: //div[@class='meta']//p[@class='date'] |
4 | 4 | ||
5 | strip_id_or_class: invisible | 5 | strip_id_or_class: invisible |
6 | strip_id_or_class: divider-doodle | 6 | strip_id_or_class: divider-doodle |
7 | 7 | ||
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrichs-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his ex-wife \ No newline at end of file | 10 | test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrich-s-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his-ex-wife |
diff --git a/inc/3rdparty/site_config/standard/dailymail.co.uk.txt b/inc/3rdparty/site_config/standard/dailymail.co.uk.txt index c83dbdb0..cd29a4d4 100644..100755 --- a/inc/3rdparty/site_config/standard/dailymail.co.uk.txt +++ b/inc/3rdparty/site_config/standard/dailymail.co.uk.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@id='js-article-text'] | 1 | body: //div[@id='js-article-text'] |
2 | strip: //div[@class='explore-links'] | 2 | strip: //div[@class='explore-links'] |
3 | strip: //div[@id='js-article-text']/br[position()=1] | 3 | strip: //div[@id='js-article-text']/br[position()=1] |
4 | strip_id_or_class: print-or-mail-links | 4 | strip_id_or_class: print-or-mail-links |
5 | strip_id_or_class: shareArticles | 5 | strip_id_or_class: shareArticles |
6 | strip_id_or_class: googleAds | 6 | strip_id_or_class: googleAds |
7 | strip_id_or_class: digg-button | 7 | strip_id_or_class: digg-button |
8 | strip_id_or_class: article-icon-links-container | 8 | strip_id_or_class: article-icon-links-container |
9 | strip_id_or_class: clickToEnlarge | 9 | strip_id_or_class: clickToEnlarge |
10 | tidy: no | 10 | tidy: no |
11 | 11 | ||
12 | test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html \ No newline at end of file | 12 | test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dailystar.com.lb.txt b/inc/3rdparty/site_config/standard/dailystar.com.lb.txt new file mode 100755 index 00000000..3b153042 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dailystar.com.lb.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='ec-blog-headline'] | ||
2 | body: //*[@id="divDetails"] | ||
3 | date: //*[@id="ctl00_ContentPlaceHolder1_tdDate"] | ||
4 | author: //*[@id="ctl00_ContentPlaceHolder1_anchorAuthor"]/a | ||
5 | autodetect_next_page: no | ||
6 | test_url: http://dailystar.com.lb/Opinion/Columnist/2012/Oct-10/190803-americas-new-modesty-in-the-mideast.ashx#axzz2928JP5xE \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/danleech.com.txt b/inc/3rdparty/site_config/standard/danleech.com.txt new file mode 100755 index 00000000..1d4cec77 --- /dev/null +++ b/inc/3rdparty/site_config/standard/danleech.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | tidy: no | ||
2 | prune: no | ||
3 | date: //article//time[@pubdate] | ||
4 | title: //article/h1//span[contains(@class, 'entry-title')] | ||
5 | body: //article/div[contains(@class, 'entry-content')] | ||
6 | test_url: http://danleech.com/post/36822126876/simple-icons \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dansdata.com.txt b/inc/3rdparty/site_config/standard/dansdata.com.txt index 96a2bc41..60669480 100644..100755 --- a/inc/3rdparty/site_config/standard/dansdata.com.txt +++ b/inc/3rdparty/site_config/standard/dansdata.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | autodetect_next_page: no | 1 | autodetect_next_page: no |
2 | tidy: no | 2 | tidy: no |
3 | prune: no | 3 | prune: no |
4 | body: //div[@class='NoOverflow'] | 4 | body: //div[@class='NoOverflow'] |
5 | test_url: http://www.dansdata.com/gz129.htm \ No newline at end of file | 5 | test_url: http://www.dansdata.com/gz129.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dantri.com.vn.txt b/inc/3rdparty/site_config/standard/dantri.com.vn.txt new file mode 100755 index 00000000..f19fee7c --- /dev/null +++ b/inc/3rdparty/site_config/standard/dantri.com.vn.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[contains(@class, 'fon31 mt2')] | ||
2 | body: //h2[contains(@class, 'fon33 mt1')] | //div[contains(@class, 'fon34 mt3')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://dantri.com.vn/su-kien/chang-trai-mot-minh-dap-xe-vuot-450km-de-vieng-mo-dai-tuong-869763.htm | ||
7 | test_url: http://dantri.com.vn/trangchu.rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/daringfireball.net.txt b/inc/3rdparty/site_config/standard/daringfireball.net.txt index dca8ade7..251cc670 100644..100755 --- a/inc/3rdparty/site_config/standard/daringfireball.net.txt +++ b/inc/3rdparty/site_config/standard/daringfireball.net.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class="article"]/h1 | 1 | title: //div[@class="article"]/h1 |
2 | author: //div[@id="Sidebar"]/p/strong | 2 | author: //div[@id="Sidebar"]/p/strong |
3 | date: //h6[@class="dateline"] | 3 | date: //h6[@class="dateline"] |
4 | body: //div[@class="article"] | 4 | body: //div[@class="article"] |
5 | strip: //h6[@class="dateline"] | 5 | strip: //h6[@class="dateline"] |
6 | strip: //div[@class="article"]/h1 | 6 | strip: //div[@class="article"]/h1 |
7 | test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels \ No newline at end of file | 7 | test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/datanami.com.txt b/inc/3rdparty/site_config/standard/datanami.com.txt index 3534002a..e9111a48 100644..100755 --- a/inc/3rdparty/site_config/standard/datanami.com.txt +++ b/inc/3rdparty/site_config/standard/datanami.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id="article"] | 1 | body: //div[@id="article"] |
2 | date: //p[@class="date"] | 2 | date: //p[@class="date"] |
3 | author: //p[@class="byline"] | 3 | author: //p[@class="byline"] |
4 | test_url: http://www.datanami.com/datanami/2011-12-07/new_path_for_sap:_in_memory_computing,_predictive_analysis_converge.html?featured=top \ No newline at end of file | 4 | test_url: http://www.datanami.com/datanami/2011-12-07/new_path_for_sap:_in_memory_computing,_predictive_analysis_converge.html?featured=top \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dcurt.is.txt b/inc/3rdparty/site_config/standard/dcurt.is.txt index 7d11c6e1..524c4bf1 100644..100755 --- a/inc/3rdparty/site_config/standard/dcurt.is.txt +++ b/inc/3rdparty/site_config/standard/dcurt.is.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: (//article//h2)[1] | 1 | title: (//article//h2)[1] |
2 | body: //article[contains(@class, 'post')] | 2 | body: //article[contains(@class, 'post')] |
3 | date: //time[@id='top_time']/@datetime | 3 | date: //time[@id='top_time']/@datetime |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | 7 | ||
8 | test_url: http://dcurt.is/predictions-txt \ No newline at end of file | 8 | test_url: http://dcurt.is/predictions-txt \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/defomicron.net.txt b/inc/3rdparty/site_config/standard/defomicron.net.txt new file mode 100755 index 00000000..9f11258c --- /dev/null +++ b/inc/3rdparty/site_config/standard/defomicron.net.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //article/h1 | ||
2 | author: //hgroup/h3/a | ||
3 | date: //time | ||
4 | body: //article | ||
5 | strip: //aside | ||
6 | footnotes: yes | ||
7 | prune: no | ||
8 | tidy: no | ||
9 | test_url: https://defomicron.net/2012/09/ios-6/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/delong.typepad.com.txt b/inc/3rdparty/site_config/standard/delong.typepad.com.txt index 84fd4f79..c4b922e4 100644..100755 --- a/inc/3rdparty/site_config/standard/delong.typepad.com.txt +++ b/inc/3rdparty/site_config/standard/delong.typepad.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | strip_id_or_class: banner | 1 | strip_id_or_class: banner |
2 | strip_id_or_class: gamma | 2 | strip_id_or_class: gamma |
3 | strip_id_or_class: module-list | 3 | strip_id_or_class: module-list |
4 | test_url: http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html \ No newline at end of file | 4 | test_url: http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/democracynow.org.txt b/inc/3rdparty/site_config/standard/democracynow.org.txt new file mode 100755 index 00000000..b0050b4f --- /dev/null +++ b/inc/3rdparty/site_config/standard/democracynow.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[contains(@class, 'blog_body')] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.democracynow.org/blog/2014/1/9/the_fbi_the_nsa_and_a \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/derstandard.at.txt b/inc/3rdparty/site_config/standard/derstandard.at.txt index 48722ebd..07db3521 100644..100755 --- a/inc/3rdparty/site_config/standard/derstandard.at.txt +++ b/inc/3rdparty/site_config/standard/derstandard.at.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //div[@id='artikelHeader']/h1 | 1 | title: //div[@id='artikelHeader']/h1 |
2 | author: //span[@class='author'] | 2 | author: //span[@class='author'] |
3 | date: //span[@class='date'] | 3 | date: //span[@class='date'] |
4 | body: //div[@class='copytext'] | 4 | body: //div[@class='copytext'] |
5 | strip: //ul[@class='lookupLinksArtikel'] | 5 | strip: //ul[@class='lookupLinksArtikel'] |
6 | 6 | ||
7 | strip: //div[@id='pageTop'] | 7 | strip: //div[@id='pageTop'] |
8 | strip: //div[@id='toolbar'] | 8 | strip: //div[@id='toolbar'] |
9 | strip: //div[@id='articleTools'] | 9 | strip: //div[@id='articleTools'] |
10 | strip: //div[@id='weiterlesen'] | 10 | strip: //div[@id='weiterlesen'] |
11 | strip: //div[@id='communityCanvas'] | 11 | strip: //div[@id='communityCanvas'] |
12 | 12 | ||
13 | test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation \ No newline at end of file | 13 | test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/designtagebuch.de.txt b/inc/3rdparty/site_config/standard/designtagebuch.de.txt index 6096db0b..9020847f 100644..100755 --- a/inc/3rdparty/site_config/standard/designtagebuch.de.txt +++ b/inc/3rdparty/site_config/standard/designtagebuch.de.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | body: //div[@class='main'] | 2 | body: //div[@class='main'] |
3 | 3 | ||
4 | author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am') | 4 | author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am') |
5 | date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ') | 5 | date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ') |
6 | 6 | ||
7 | strip_id_or_class: pagelink | 7 | strip_id_or_class: pagelink |
8 | strip_id_or_class: wp-polls | 8 | strip_id_or_class: wp-polls |
9 | 9 | ||
10 | next_page_link: //div[@class='post-page-next']/a | 10 | next_page_link: //div[@class='post-page-next']/a |
11 | test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/ \ No newline at end of file | 11 | test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/desitvforum.net.txt b/inc/3rdparty/site_config/standard/desitvforum.net.txt index a6dac5fd..efa85f76 100644..100755 --- a/inc/3rdparty/site_config/standard/desitvforum.net.txt +++ b/inc/3rdparty/site_config/standard/desitvforum.net.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: (//blockquote[contains(@class, 'postcontent')])[1] | 1 | body: (//blockquote[contains(@class, 'postcontent')])[1] |
2 | body: (//div[starts-with(@id, 'post_message')])[1] | 2 | body: (//div[starts-with(@id, 'post_message')])[1] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | tidy: no \ No newline at end of file | 5 | tidy: no \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/details.com.txt b/inc/3rdparty/site_config/standard/details.com.txt index 548cabad..d1d8a29a 100644..100755 --- a/inc/3rdparty/site_config/standard/details.com.txt +++ b/inc/3rdparty/site_config/standard/details.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@class="content-headline"] | 1 | title: //h1[@class="content-headline"] |
2 | body: //div[@class="headers-container"] | //div[@class="content-container"] | 2 | body: //div[@class="headers-container"] | //div[@class="content-container"] |
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | single_page_link: //li[@class='utility-print']/a | 6 | single_page_link: //li[@class='utility-print']/a |
7 | 7 | ||
8 | test_url: http://www.details.com/culture-trends/critical-eye/201108/best-new-designers-innovations \ No newline at end of file | 8 | test_url: http://www.details.com/culture-trends/critical-eye/201108/best-new-designers-innovations \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/developers.facebook.com.txt b/inc/3rdparty/site_config/standard/developers.facebook.com.txt index 43a8f0a0..7609b72f 100644..100755 --- a/inc/3rdparty/site_config/standard/developers.facebook.com.txt +++ b/inc/3rdparty/site_config/standard/developers.facebook.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //div[@class="bodyText"]/h1 | 1 | title: //div[@class="bodyText"]/h1 |
2 | author: //div[@class="picture"]/a/img/@alt | 2 | author: //div[@class="picture"]/a/img/@alt |
3 | test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/ \ No newline at end of file | 3 | test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt b/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt index b960b37e..6f1d4e27 100644..100755 --- a/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt +++ b/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | date: //h2[@class='date-header'] | 1 | date: //h2[@class='date-header'] |
2 | body: //div[@class='post hentry'] | 2 | body: //div[@class='post hentry'] |
3 | title: //h3 | 3 | title: //h3 |
4 | strip: //div[@class='post-footer'] | 4 | strip: //div[@class='post-footer'] |
5 | 5 | ||
6 | test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html \ No newline at end of file | 6 | test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dictionary.reference.com.txt b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt index a1172024..f8b79c80 100644..100755 --- a/inc/3rdparty/site_config/standard/dictionary.reference.com.txt +++ b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@id='query_h1'] | 1 | title: //h1[@id='query_h1'] |
2 | body: //div[contains(@class, 'lunatext results_content')] | 2 | body: //div[contains(@class, 'lunatext results_content')] |
3 | strip_id_or_class: spl_unshd | 3 | strip_id_or_class: spl_unshd |
4 | #replace_string(<div class="dicTl">): <div class="dicTl">------------------<br /> | 4 | #replace_string(<div class="dicTl">): <div class="dicTl">------------------<br /> |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | 7 | ||
8 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ \ No newline at end of file | 8 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/diepresse.com.txt b/inc/3rdparty/site_config/standard/diepresse.com.txt index 7e825a91..ced189cc 100644..100755 --- a/inc/3rdparty/site_config/standard/diepresse.com.txt +++ b/inc/3rdparty/site_config/standard/diepresse.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='article']/h1 | 1 | title: //div[@class='article']/h1 |
2 | date: substring-before(//p[@class='articletime'],'|') | 2 | date: substring-before(//p[@class='articletime'],'|') |
3 | body: //div[@id='articletext'] | 3 | body: //div[@id='articletext'] |
4 | strip: //div[@class='inlineDiashow'] | 4 | strip: //div[@class='inlineDiashow'] |
5 | 5 | ||
6 | test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do \ No newline at end of file | 6 | test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt b/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt index 2d2ae2c2..80ce5ff3 100644..100755 --- a/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt +++ b/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | # default parser works great | 1 | # default parser works great |
2 | # only add "author" and "next page link" reference | 2 | # only add "author" and "next page link" reference |
3 | # 2012-04-13 | 3 | # 2012-04-13 |
4 | 4 | ||
5 | next_page_link: //div[@class = 'pagination']/a[@class = 'next_page'] | 5 | next_page_link: //div[@class = 'pagination']/a[@class = 'next_page'] |
6 | 6 | ||
7 | author: //*[@class = 'author metadata']/a | 7 | author: //*[@class = 'author metadata']/a |
8 | test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work \ No newline at end of file | 8 | test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/digital-photography-school.com.txt b/inc/3rdparty/site_config/standard/digital-photography-school.com.txt index 37192ac0..18ce370e 100644..100755 --- a/inc/3rdparty/site_config/standard/digital-photography-school.com.txt +++ b/inc/3rdparty/site_config/standard/digital-photography-school.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='post-title']/h1 | 1 | title: //div[@class='post-title']/h1 |
2 | author: //a[@href='#author'] | 2 | author: //a[@href='#author'] |
3 | body: //div[@class='post-content'] | 3 | body: //div[@class='post-content'] |
4 | strip: //div[@class='post-meta'] | 4 | strip: //div[@class='post-meta'] |
5 | 5 | ||
6 | test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically \ No newline at end of file | 6 | test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt b/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt index b21431d7..f48bdfdb 100644..100755 --- a/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt +++ b/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class="article_header"]/h1 | 1 | title: //div[@class="article_header"]/h1 |
2 | date: //div[@class="article_pub"]/span[@class="time"] | 2 | date: //div[@class="article_pub"]/span[@class="time"] |
3 | author: //div[@class="article_pub"]/span[@class="editors"]/a/text() | 3 | author: //div[@class="article_pub"]/span[@class="editors"]/a/text() |
4 | body: //div[@class="article_body clear_left"] | 4 | body: //div[@class="article_body clear_left"] |
5 | test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html \ No newline at end of file | 5 | test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dilbert.com.txt b/inc/3rdparty/site_config/standard/dilbert.com.txt index 413e5506..85cc78e5 100644..100755 --- a/inc/3rdparty/site_config/standard/dilbert.com.txt +++ b/inc/3rdparty/site_config/standard/dilbert.com.txt | |||
@@ -1,8 +1,11 @@ | |||
1 | convert_double_br_tags: yes | 1 | #title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10) |
2 | 2 | title: //div[contains(@class, 'SB_Title')]//a | |
3 | title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10) | 3 | body: //div[contains(@class, 'STR_Image')] |
4 | body: //*[contains(@class, 'SB_Content')] | 4 | body: //*[contains(@class, 'SB_Content')] |
5 | author: string('Scott Adams') | 5 | author: string('Scott Adams') |
6 | date: //*[contains(@class, 'SB_Detail')]/text()[1] | 6 | date: //*[contains(@class, 'SB_Detail')]/text()[1] |
7 | 7 | ||
8 | test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/ \ No newline at end of file | 8 | |
9 | test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/ | ||
10 | test_url: http://dilbert.com/strips/comic/2013-10-22 | ||
11 | test_url: http://feed.dilbert.com/dilbert/daily_strip \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dinamalar.com.txt b/inc/3rdparty/site_config/standard/dinamalar.com.txt index 9ef198c9..bc315cf1 100644..100755 --- a/inc/3rdparty/site_config/standard/dinamalar.com.txt +++ b/inc/3rdparty/site_config/standard/dinamalar.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //div[@class='newsdetbd'] | 1 | title: //div[@class='newsdetbd'] |
2 | body: //div[@id='innerleft'] | 2 | body: //div[@id='innerleft'] |
3 | #//p[@class = 'plnht'] | 3 | #//p[@class = 'plnht'] |
4 | strip_image_src: /albums/ | 4 | strip_image_src: /albums/ |
5 | strip: //div[@class='mrrt'] | 5 | strip: //div[@class='mrrt'] |
6 | prune: yes | 6 | prune: yes |
7 | strip_id_or_class: 'fdpd' | 7 | strip_id_or_class: 'fdpd' |
8 | strip_id_or_class: 'epapt' | 8 | strip_id_or_class: 'epapt' |
9 | strip_id_or_class: 'newsrtwd' | 9 | strip_id_or_class: 'newsrtwd' |
10 | strip_id_or_class: 'padtp' | 10 | strip_id_or_class: 'padtp' |
11 | strip_id_or_class: 'newdt' | 11 | strip_id_or_class: 'newdt' |
12 | strip_id_or_class: 'newdlt' | 12 | strip_id_or_class: 'newdlt' |
13 | strip: //div[@id='selNotes'] | 13 | strip: //div[@id='selNotes'] |
14 | strip_id_or_class: 'clsNotes' | 14 | strip_id_or_class: 'clsNotes' |
15 | strip_id_or_class: 'clear' | 15 | strip_id_or_class: 'clear' |
16 | strip_id_or_class: 'cmtwrap' | 16 | strip_id_or_class: 'cmtwrap' |
17 | strip_id_or_class: 'sess' | 17 | strip_id_or_class: 'sess' |
18 | strip_id_or_class: 'parents' | 18 | strip_id_or_class: 'parents' |
19 | test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725 \ No newline at end of file | 19 | test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dn.se.txt b/inc/3rdparty/site_config/standard/dn.se.txt index 86bb3b8d..5283a0cd 100644..100755 --- a/inc/3rdparty/site_config/standard/dn.se.txt +++ b/inc/3rdparty/site_config/standard/dn.se.txt | |||
@@ -1,26 +1,28 @@ | |||
1 | # Since this element has class="clear", the Instapaper stylesheets (at least this text parser preview), will render it unreadable, with a 1px font size and line height. | 1 | # Since this element has class="clear", the Instapaper stylesheets (at least this text parser preview), will render it unreadable, with a 1px font size and line height. |
2 | 2 | ||
3 | body: //div[@id="article-content"] | 3 | body: //div[@id="article-content"] |
4 | 4 | ||
5 | 5 | ||
6 | # Ads | 6 | # Ads |
7 | strip_id_or_class: advert-space | 7 | strip_id_or_class: advert-space |
8 | 8 | ||
9 | # Read more, recommend, comments etc | 9 | # Read more, recommend, comments etc |
10 | strip_id_or_class: fbc-recommend | 10 | strip_id_or_class: fbc-recommend |
11 | strip_id_or_class: recommend | 11 | strip_id_or_class: recommend |
12 | strip_id_or_class: article-readers | 12 | strip_id_or_class: article-readers |
13 | strip_id_or_class: article-addons | 13 | strip_id_or_class: article-addons |
14 | strip_id_or_class: hook | 14 | strip_id_or_class: hook |
15 | strip_id_or_class: right | 15 | strip_id_or_class: right |
16 | strip_id_or_class: footer | 16 | strip_id_or_class: footer |
17 | 17 | ||
18 | # Other news | 18 | # Other news |
19 | strip: //div[@id="mirrors"] | 19 | strip: //div[@id="mirrors"] |
20 | 20 | ||
21 | # Author | 21 | # Author |
22 | author: //div[@id="byline"]/div/p/strong | 22 | author: //div[@id="byline"]/div/p/strong |
23 | 23 | ||
24 | # Date | 24 | # Date |
25 | date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11) | 25 | date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11) |
26 | test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade \ No newline at end of file | 26 | |
27 | test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade | ||
28 | test_url: http://www.dn.se/m/rss/senaste-nytt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dobreprogramy.pl.txt b/inc/3rdparty/site_config/standard/dobreprogramy.pl.txt new file mode 100755 index 00000000..972293bc --- /dev/null +++ b/inc/3rdparty/site_config/standard/dobreprogramy.pl.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //*[@class="news"]//h1[@class="title"] | ||
2 | author: //*[@class="news"]//*[@class="newsInfo"]/a | ||
3 | date: substring-before(//*[@class="news"]//*[@class="newsInfo"]/text(), ',') | ||
4 | body: //*[@class="news"]//*[@class="newsContent"] | ||
5 | footnotes: no | ||
6 | test_url: http://www.dobreprogramy.pl/Sony-konczy-z-Foldinghome-na-PS3,Aktualnosc,36899.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/doctac.com.txt b/inc/3rdparty/site_config/standard/doctac.com.txt index 9f65ea9b..1c518a9b 100644..100755 --- a/inc/3rdparty/site_config/standard/doctac.com.txt +++ b/inc/3rdparty/site_config/standard/doctac.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | strip: //*[(@id = "featured")] | 1 | strip: //*[(@id = "featured")] |
2 | 2 | ||
3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') | 3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') |
4 | 4 | ||
5 | date: concat(//div[@class='month'],' ',//div[@class='day']) | 5 | date: concat(//div[@class='month'],' ',//div[@class='day']) |
6 | 6 | ||
7 | #doctac doesn't provide a year, but month/day is better than nothing | 7 | #doctac doesn't provide a year, but month/day is better than nothing |
8 | test_url: http://www.doctac.com/mac/iphone/instapaper-update-app/ \ No newline at end of file | 8 | test_url: http://www.doctac.com/mac/iphone/instapaper-update-app/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/domusweb.it.txt b/inc/3rdparty/site_config/standard/domusweb.it.txt index 81683f02..20566ee3 100644..100755 --- a/inc/3rdparty/site_config/standard/domusweb.it.txt +++ b/inc/3rdparty/site_config/standard/domusweb.it.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | # TODO: clean up the extra junk at the end of articles | 1 | # TODO: clean up the extra junk at the end of articles |
2 | 2 | ||
3 | # general text formatting | 3 | # general text formatting |
4 | prune: no | 4 | prune: no |
5 | convert_double_br_tags:yes | 5 | convert_double_br_tags:yes |
6 | 6 | ||
7 | # where to find the basic metadata | 7 | # where to find the basic metadata |
8 | author://a[@class='articleauthor'] | 8 | author://a[@class='articleauthor'] |
9 | date://a[starts-with(@href,'/en/search/published/')] | 9 | date://a[starts-with(@href,'/en/search/published/')] |
10 | title:substring-before(//h2[@class='title'],'—') | 10 | title:substring-before(//h2[@class='title'],'—') |
11 | body://div[@id='maincontainer'] | 11 | body://div[@id='maincontainer'] |
12 | 12 | ||
13 | dissolve://div[starts-with(@id,'commentableblock')] | 13 | dissolve://div[starts-with(@id,'commentableblock')] |
14 | 14 | ||
15 | # clean up the crap | 15 | # clean up the crap |
16 | strip://div[contains(@class,'domusnetwork')] | 16 | strip://div[contains(@class,'domusnetwork')] |
17 | strip://div[contains(@class,'relative_wrapper')] | 17 | strip://div[contains(@class,'relative_wrapper')] |
18 | 18 | ||
19 | strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] | 19 | strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] |
20 | wrap_in(em): //div[contains(@class,'captionsubimage')]/span | 20 | wrap_in(em): //div[contains(@class,'captionsubimage')]/span |
21 | test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/ \ No newline at end of file | 21 | test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dou.ua.txt b/inc/3rdparty/site_config/standard/dou.ua.txt index 22907c22..0f983112 100644..100755 --- a/inc/3rdparty/site_config/standard/dou.ua.txt +++ b/inc/3rdparty/site_config/standard/dou.ua.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@itemprop="name"] | 1 | title: //h1[@itemprop="name"] |
2 | 2 | ||
3 | author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a | 3 | author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a |
4 | 4 | ||
5 | date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')] | 5 | date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')] |
6 | 6 | ||
7 | body: //div[contains(@class, 'b-typo')] | 7 | body: //div[contains(@class, 'b-typo')] |
8 | test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm \ No newline at end of file | 8 | test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/douban.com.txt b/inc/3rdparty/site_config/standard/douban.com.txt index 99d7e5dc..d72a2223 100644..100755 --- a/inc/3rdparty/site_config/standard/douban.com.txt +++ b/inc/3rdparty/site_config/standard/douban.com.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | # This filter is tested on: | 1 | # This filter is tested on: |
2 | # http://www.douban.com/note/215003067/ | 2 | # http://www.douban.com/note/215003067/ |
3 | # http://www.douban.com/note/213540049/ | 3 | # http://www.douban.com/note/213540049/ |
4 | # http://www.douban.com/group/topic/31140104/ | 4 | # http://www.douban.com/group/topic/31140104/ |
5 | 5 | ||
6 | title: //div[@class='note-header']/h1 | 6 | title: //div[@class='note-header']/h1 |
7 | title: //div[@id='content']/h1 | 7 | title: //div[@id='content']/h1 |
8 | 8 | ||
9 | author: //div[@class='info']/ul/li/a | 9 | author: //div[@class='info']/ul/li/a |
10 | author: //h3/span/a | 10 | author: //h3/span/a |
11 | 11 | ||
12 | date://div[@class='note-header']/div/span | 12 | date://div[@class='note-header']/div/span |
13 | date://h3/span[contains(@class, 'color-green')] | 13 | date://h3/span[contains(@class, 'color-green')] |
14 | 14 | ||
15 | body://div[contains(@class, 'note')] | 15 | body://div[contains(@class, 'note')] |
16 | body://div[contains(@class, 'topic-content')] | 16 | body://div[contains(@class, 'topic-content')] |
17 | 17 | ||
18 | strip://h3 | 18 | strip://h3 |
19 | 19 | ||
20 | convert_double_br_tags: yes | 20 | convert_double_br_tags: yes |
21 | test_url: http://www.douban.com/group/topic/31140104/ \ No newline at end of file | 21 | test_url: http://www.douban.com/group/topic/31140104/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dpreview.com.txt b/inc/3rdparty/site_config/standard/dpreview.com.txt index 30179a3b..001c810f 100644..100755 --- a/inc/3rdparty/site_config/standard/dpreview.com.txt +++ b/inc/3rdparty/site_config/standard/dpreview.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | # next_page_link for product review | 1 | # next_page_link for product review |
2 | # example: http://www.dpreview.com/reviews/lytro/ | 2 | # example: http://www.dpreview.com/reviews/lytro/ |
3 | next_page_link: //img[@alt = 'Next page']/../@href | 3 | next_page_link: //img[@alt = 'Next page']/../@href |
4 | 4 | ||
5 | # next_page_link for other articles | 5 | # next_page_link for other articles |
6 | # example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 | 6 | # example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 |
7 | next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a | 7 | next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a |
8 | single_page_link: //a[contains(.,'Print view')] | 8 | single_page_link: //a[contains(.,'Print view')] |
9 | test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 \ No newline at end of file | 9 | test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dr.dk.txt b/inc/3rdparty/site_config/standard/dr.dk.txt index 7e46b0d6..d8ec1acf 100644..100755 --- a/inc/3rdparty/site_config/standard/dr.dk.txt +++ b/inc/3rdparty/site_config/standard/dr.dk.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | author: //div[@class='articleFunctions']//a | 2 | author: //div[@class='articleFunctions']//a |
3 | date: //meta[@name='pubdate']/@content | 3 | date: //meta[@name='pubdate']/@content |
4 | 4 | ||
5 | # Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason) | 5 | # Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason) |
6 | body: //div[@class='articleContent'] | 6 | body: //div[@class='articleContent'] |
7 | 7 | ||
8 | tidy: no | 8 | tidy: no |
9 | test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm \ No newline at end of file | 9 | test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dramasonline.com.txt b/inc/3rdparty/site_config/standard/dramasonline.com.txt index 659d0443..4898353b 100644..100755 --- a/inc/3rdparty/site_config/standard/dramasonline.com.txt +++ b/inc/3rdparty/site_config/standard/dramasonline.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@class='postext'] | 1 | body: //div[@class='postext'] |
2 | 2 | ||
3 | strip_id_or_class: ratingblock | 3 | strip_id_or_class: ratingblock |
4 | strip_id_or_class: hreview-aggregate | 4 | strip_id_or_class: hreview-aggregate |
5 | strip: //div[contains(@style, 'display: none;')] | 5 | strip: //div[contains(@style, 'display: none;')] |
6 | 6 | ||
7 | tidy: no | 7 | tidy: no |
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.dramasonline.com/jago-pakistan-jago-7th-december-2012-ali-gul-pir/ \ No newline at end of file | 10 | test_url: http://www.dramasonline.com/jago-pakistan-jago-7th-december-2012-ali-gul-pir/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/drdobbs.com.txt b/inc/3rdparty/site_config/standard/drdobbs.com.txt index b1a9db6f..b1a9db6f 100644..100755 --- a/inc/3rdparty/site_config/standard/drdobbs.com.txt +++ b/inc/3rdparty/site_config/standard/drdobbs.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/drive2.ru.txt b/inc/3rdparty/site_config/standard/drive2.ru.txt index 6125ce79..d500cb81 100644..100755 --- a/inc/3rdparty/site_config/standard/drive2.ru.txt +++ b/inc/3rdparty/site_config/standard/drive2.ru.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@class = "description"] | 1 | body: //div[@class = "description"] |
2 | body: //div[@id = "post"] | 2 | body: //div[@id = "post"] |
3 | 3 | ||
4 | strip_id_or_class: vcard | 4 | strip_id_or_class: vcard |
5 | strip_id_or_class: journallist | 5 | strip_id_or_class: journallist |
6 | strip_id_or_class: infobox | 6 | strip_id_or_class: infobox |
7 | strip_id_or_class: terms | 7 | strip_id_or_class: terms |
8 | strip_id_or_class: replieslist | 8 | strip_id_or_class: replieslist |
9 | strip_id_or_class: communityside | 9 | strip_id_or_class: communityside |
10 | 10 | ||
11 | 11 | ||
12 | test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/ \ No newline at end of file | 12 | test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dropbox.com.txt b/inc/3rdparty/site_config/standard/dropbox.com.txt new file mode 100755 index 00000000..92ae31b2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dropbox.com.txt | |||
@@ -0,0 +1 @@ | |||
single_page_link: //a[@id='download_button_link'] \ No newline at end of file | |||
diff --git a/inc/3rdparty/site_config/standard/drupal.org.txt b/inc/3rdparty/site_config/standard/drupal.org.txt index ffb77e4d..2da3eb1c 100644..100755 --- a/inc/3rdparty/site_config/standard/drupal.org.txt +++ b/inc/3rdparty/site_config/standard/drupal.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | author://div[@class="submitted"]/a | 2 | author://div[@class="submitted"]/a |
3 | date:substring-after(//div[@class="meta"],'modified: ') | 3 | date:substring-after(//div[@class="meta"],'modified: ') |
4 | date:substring-after(//div[@class="submitted"],'on ') | 4 | date:substring-after(//div[@class="submitted"],'on ') |
5 | body://div[@class="node-content"] | 5 | body://div[@class="node-content"] |
6 | strip://div[@class="meta"] | 6 | strip://div[@class="meta"] |
7 | strip_id_or_class:book-navigation | 7 | strip_id_or_class:book-navigation |
8 | test_url: http://drupal.org/node/1327354 \ No newline at end of file | 8 | test_url: http://drupal.org/node/1327354 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt b/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt index 418c9f62..2978797e 100644..100755 --- a/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt +++ b/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h2/a | 1 | title: //h2/a |
2 | author: substring-before(substring-after(//span[@class='byline'], 'by'), ',') | 2 | author: substring-before(substring-after(//span[@class='byline'], 'by'), ',') |
3 | date: substring-before(substring-after(//span[@class='byline'], ','), '|') | 3 | date: substring-before(substring-after(//span[@class='byline'], ','), '|') |
4 | body: //div[@class='entry'] | 4 | body: //div[@class='entry'] |
5 | 5 | ||
6 | 6 | ||
7 | # strip out auction stuff at the end of posts | 7 | # strip out auction stuff at the end of posts |
8 | # tidy kills the center tag, so disable it | 8 | # tidy kills the center tag, so disable it |
9 | tidy: no | 9 | tidy: no |
10 | strip: //center//table | 10 | strip: //center//table |
11 | test_url: http://www.dukebasketballreport.com/articles/?p=42660 \ No newline at end of file | 11 | test_url: http://www.dukebasketballreport.com/articles/?p=42660 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/dushumashang.com.txt b/inc/3rdparty/site_config/standard/dushumashang.com.txt new file mode 100755 index 00000000..6a50a77e --- /dev/null +++ b/inc/3rdparty/site_config/standard/dushumashang.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.dushumashang.com/2389 | ||
3 | # http://www.dushumashang.com/2415 | ||
4 | # http://www.dushumashang.com/2355 | ||
5 | |||
6 | body://div[@class='main_content'] | ||
7 | #body://section[@class='entry_content fl'] | ||
8 | title://h2 | ||
9 | author://span[@class='article_author']/a | ||
10 | date://span[@class='pub_date']/time | ||
11 | |||
12 | strip://span[@class='article_author'] | ||
13 | strip://span[@class='pub_date'] | ||
14 | strip://div[@class='page_turn'] | ||
15 | strip://span[@class='source_link']/em | ||
16 | wrap_in(strong)://span[@class='source_link']/a | ||
17 | test_url: http://www.dushumashang.com/2355 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dvice.com.txt b/inc/3rdparty/site_config/standard/dvice.com.txt index c8163680..1a1990ee 100644..100755 --- a/inc/3rdparty/site_config/standard/dvice.com.txt +++ b/inc/3rdparty/site_config/standard/dvice.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | strip://*[@id = 'blog_top_stories'] | 1 | strip://*[@id = 'blog_top_stories'] |
2 | strip://*[@id = 'takeover_off'] | 2 | strip://*[@id = 'takeover_off'] |
3 | strip://*[@id = 'right_gray_box'] | 3 | strip://*[@id = 'right_gray_box'] |
4 | strip://*[@class = 'blog_topics'] | 4 | strip://*[@class = 'blog_topics'] |
5 | strip://*[@class = 'section_titles'] | 5 | strip://*[@class = 'section_titles'] |
6 | 6 | ||
7 | author://div[@class = 'post_author_info']/a | 7 | author://div[@class = 'post_author_info']/a |
8 | date://div[@class = 'post_date_info'] | 8 | date://div[@class = 'post_date_info'] |
9 | test_url: http://dvice.com/archives/2012/05/is-nfc-and-smar.php \ No newline at end of file | 9 | test_url: http://dvice.com/archives/2012/05/is-nfc-and-smar.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/eamesinerudition.com.txt b/inc/3rdparty/site_config/standard/eamesinerudition.com.txt index 908a1b51..89a68bcd 100644..100755 --- a/inc/3rdparty/site_config/standard/eamesinerudition.com.txt +++ b/inc/3rdparty/site_config/standard/eamesinerudition.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div [@class="post contain"]/h1 | 1 | title: //div [@class="post contain"]/h1 |
2 | strip: //div [@class="post contain"]/h1 | 2 | strip: //div [@class="post contain"]/h1 |
3 | body: //div [@class="post contain"] | 3 | body: //div [@class="post contain"] |
4 | author: substring-before(//title, ':') | 4 | author: substring-before(//title, ':') |
5 | author: substring-before(//title, ' ') | 5 | author: substring-before(//title, ' ') |
6 | 6 | ||
7 | 7 | ||
8 | test_url: http://eamesinerudition.com/2012/03/hospital-numbers-are-bad-for-you \ No newline at end of file | 8 | test_url: http://eamesinerudition.com/2012/03/hospital-numbers-are-bad-for-you \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/eandt.theiet.org.txt b/inc/3rdparty/site_config/standard/eandt.theiet.org.txt index c4c38f25..ba9d312d 100644..100755 --- a/inc/3rdparty/site_config/standard/eandt.theiet.org.txt +++ b/inc/3rdparty/site_config/standard/eandt.theiet.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | date: //div[@class="et_dateUnderTitle"] | 2 | date: //div[@class="et_dateUnderTitle"] |
3 | author: substring-after(//div[@class="et_authorUnderTitle"], 'By ') | 3 | author: substring-after(//div[@class="et_authorUnderTitle"], 'By ') |
4 | body: //div[@id="et_leftCol640split"] | 4 | body: //div[@id="et_leftCol640split"] |
5 | 5 | ||
6 | strip: //div[@id="et_leftCol640splitRight"] | 6 | strip: //div[@id="et_leftCol640splitRight"] |
7 | strip: //div[@class="et_light_greybgboxlower"] | 7 | strip: //div[@class="et_light_greybgboxlower"] |
8 | test_url: http://eandt.theiet.org/magazine/2011/12/this-festive-waste.cfm \ No newline at end of file | 8 | test_url: http://eandt.theiet.org/magazine/2011/12/this-festive-waste.cfm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/eastoftheweb.com.txt b/inc/3rdparty/site_config/standard/eastoftheweb.com.txt index d762091c..36708da3 100644..100755 --- a/inc/3rdparty/site_config/standard/eastoftheweb.com.txt +++ b/inc/3rdparty/site_config/standard/eastoftheweb.com.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | title: //div[@class='title_text'] | 1 | title: //div[@class='title_text'] |
2 | 2 | ||
3 | author: //div[@class='author_text'] | 3 | author: //div[@class='author_text'] |
4 | 4 | ||
5 | body: //div[@class='story_text']/.. | 5 | body: //div[@class='story_text']/.. |
6 | 6 | ||
7 | strip: //b | 7 | strip: //b |
8 | 8 | ||
9 | strip_id_or_class: back_to_top | 9 | strip_id_or_class: back_to_top |
10 | strip_id_or_class: author_text | 10 | strip_id_or_class: author_text |
11 | strip_id_or_class: title_text | 11 | strip_id_or_class: title_text |
12 | 12 | ||
13 | wrap_in(center): //a | 13 | wrap_in(center): //a |
14 | 14 | ||
15 | dissolve: //a | 15 | dissolve: //a |
16 | 16 | ||
17 | footnotes: no | 17 | footnotes: no |
18 | test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml \ No newline at end of file | 18 | test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ebay.com.txt b/inc/3rdparty/site_config/standard/ebay.com.txt index 5fa18ff3..f17e1f72 100644..100755 --- a/inc/3rdparty/site_config/standard/ebay.com.txt +++ b/inc/3rdparty/site_config/standard/ebay.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum'] | 1 | body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum'] |
2 | 2 | ||
3 | strip_image_src: imgLoading_30x30.gif | 3 | strip_image_src: imgLoading_30x30.gif |
4 | 4 | ||
5 | test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204 \ No newline at end of file | 5 | test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ecetia.com.txt b/inc/3rdparty/site_config/standard/ecetia.com.txt index d67e9103..d67e9103 100644..100755 --- a/inc/3rdparty/site_config/standard/ecetia.com.txt +++ b/inc/3rdparty/site_config/standard/ecetia.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/econlog.econlib.org.txt b/inc/3rdparty/site_config/standard/econlog.econlib.org.txt index ebafc197..729affd4 100644..100755 --- a/inc/3rdparty/site_config/standard/econlog.econlib.org.txt +++ b/inc/3rdparty/site_config/standard/econlog.econlib.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class="title"] | 1 | title: //h1[@class="title"] |
2 | author: //div[@class="hosted"]/a | 2 | author: //div[@class="hosted"]/a |
3 | date: substring-after(//div[@class="dateline"]/text(), '|') | 3 | date: substring-after(//div[@class="dateline"]/text(), '|') |
4 | 4 | ||
5 | strip: //a[@class="top" and @href="#"] | 5 | strip: //a[@class="top" and @href="#"] |
6 | test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html \ No newline at end of file | 6 | test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt b/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt index b59f554e..936a191d 100644..100755 --- a/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt +++ b/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | date: //div[@class="bb-md-noticia-fecha"] | 1 | date: //div[@class="bb-md-noticia-fecha"] |
2 | body: //div[@class="corpo"] | 2 | body: //div[@class="corpo"] |
3 | dissolve: //div[@class="bb-md-noticia-extras"] | 3 | dissolve: //div[@class="bb-md-noticia-extras"] |
4 | strip: //strong | 4 | strip: //strong |
5 | strip_id_or_class: bb-md-noticia-foto-autor | 5 | strip_id_or_class: bb-md-noticia-foto-autor |
6 | strip_id_or_class: bb-md-noticia-foto-bajada | 6 | strip_id_or_class: bb-md-noticia-foto-bajada |
7 | test_url: http://economia.estadao.com.br/noticias/economia,cmn-aprova-r-67-bi-em-credito-para-20-setores-da-economia,118501,0.htm \ No newline at end of file | 7 | test_url: http://economia.estadao.com.br/noticias/economia,cmn-aprova-r-67-bi-em-credito-para-20-setores-da-economia,118501,0.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/economist.com.txt b/inc/3rdparty/site_config/standard/economist.com.txt index 71dd62f5..16c9ed64 100644..100755 --- a/inc/3rdparty/site_config/standard/economist.com.txt +++ b/inc/3rdparty/site_config/standard/economist.com.txt | |||
@@ -1,10 +1,8 @@ | |||
1 | title: //div[@class='ec-blog-headline'] | 1 | body: //div[@class='main-content'] |
2 | body: //div[@class='ec-blog-body'] | 2 | date: //time[@class='date-created'] |
3 | body: //div[@class='ec-article-content clear'] | 3 | strip: //aside |
4 | strip: //div[@class='related-items'] | 4 | prune: no |
5 | date: substring-before(//p[@class='ec-article-info'], '|') | 5 | |
6 | prune: no | 6 | autodetect_next_page: no |
7 | 7 | ||
8 | autodetect_next_page: no | ||
9 | |||
10 | test_url: http://www.economist.com/node/21528429 \ No newline at end of file | 8 | test_url: http://www.economist.com/node/21528429 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/edge-online.com.txt b/inc/3rdparty/site_config/standard/edge-online.com.txt index 461d909c..cf585815 100644..100755 --- a/inc/3rdparty/site_config/standard/edge-online.com.txt +++ b/inc/3rdparty/site_config/standard/edge-online.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')] | 2 | body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')] |
3 | date: //time[@pubdate]/@datetime | 3 | date: //time[@pubdate]/@datetime |
4 | author: //span[@class='author-name'] | 4 | author: //span[@class='author-name'] |
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | strip: //footer | 7 | strip: //footer |
8 | 8 | ||
9 | replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak --> | 9 | replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak --> |
10 | 10 | ||
11 | single_page_link: //a[contains(@href, '?page=show')] | 11 | single_page_link: //a[contains(@href, '?page=show')] |
12 | 12 | ||
13 | test_url: http://www.edge-online.com/features/telling-modern-warfares-story \ No newline at end of file | 13 | test_url: http://www.edge-online.com/features/telling-modern-warfares-story \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/edge.org.txt b/inc/3rdparty/site_config/standard/edge.org.txt index 9980000d..95805f6e 100644..100755 --- a/inc/3rdparty/site_config/standard/edge.org.txt +++ b/inc/3rdparty/site_config/standard/edge.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class='HomeLeftPannel IMGCTRL']/h2 | 1 | title: //div[@class='HomeLeftPannel IMGCTRL']/h2 |
2 | body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc'] | 2 | body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc'] |
3 | tidy: no | 3 | tidy: no |
4 | 4 | ||
5 | test_url: http://edge.org/print/conversation.php?cid=the-argumentative-theory \ No newline at end of file | 5 | test_url: http://edge.org/print/conversation.php?cid=the-argumentative-theory \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/edition.channel5belize.com.txt b/inc/3rdparty/site_config/standard/edition.channel5belize.com.txt new file mode 100755 index 00000000..6d5f170a --- /dev/null +++ b/inc/3rdparty/site_config/standard/edition.channel5belize.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='singlePage']//h2 | ||
2 | body: //div[@id='singlePage']//div[contains(@class, 'post')] | ||
3 | strip: //a[@title='Email This Story'] | ||
4 | strip_id_or_class: sociable | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://edition.channel5belize.com/archives/86016 | ||
9 | test_url: http://edition.channel5belize.com/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/edition.cnn.com.txt b/inc/3rdparty/site_config/standard/edition.cnn.com.txt index dc8ebe14..6fc82d24 100644..100755 --- a/inc/3rdparty/site_config/standard/edition.cnn.com.txt +++ b/inc/3rdparty/site_config/standard/edition.cnn.com.txt | |||
@@ -1,9 +1,18 @@ | |||
1 | body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')] | 1 | body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')] |
2 | strip: //div[@id='cnnCVP2'] | 2 | strip: //a[starts-with(@name, 'em')] |
3 | strip_id_or_class: cnn_strylftcexpbx | 3 | strip: //div[@id='cnnCVP2'] |
4 | strip_id_or_class: cnn_strylctcqrelt | 4 | strip_id_or_class: cnn_strylftcexpbx |
5 | strip_id_or_class: cnn_strybtntoolsbttm | 5 | strip_id_or_class: cnn_strylctcqrelt |
6 | strip_id_or_class: cnn_stryftsbttm | 6 | strip_id_or_class: cnn_strybtntoolsbttm |
7 | strip_id_or_class: cnn_strybtmcntnt | 7 | strip_id_or_class: cnn_stryftsbttm |
8 | strip_id_or_class: cnn_strybtmcntnt | ||
9 | strip_id_or_class: cnn_stryshrwdgtbtm | ||
10 | strip_id_or_class: cnnGalleryContainer | ||
11 | strip_id_or_class: cnn_strycrcntr | ||
12 | strip_id_or_class: cnn_html_slideshow | ||
8 | prune: no | 13 | prune: no |
9 | test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html \ No newline at end of file | 14 | |
15 | test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html | ||
16 | test_url: http://edition.cnn.com/2013/08/15/world/africa/nigeria-boko-haram-commander-killed/index.html?eref=edition | ||
17 | test_url: http://rss.cnn.com/rss/edition.rss | ||
18 | test_url: http://rss.cnn.com/rss/edition_technology.rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eetimes.com.txt b/inc/3rdparty/site_config/standard/eetimes.com.txt new file mode 100755 index 00000000..300db307 --- /dev/null +++ b/inc/3rdparty/site_config/standard/eetimes.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[contains(@class, 'grayshowlinks')] | ||
2 | |||
3 | next_page_link: //div[@id='sitecontentcol']//a[.='Next >'] | ||
4 | # Doesn't work (site doesn't always load full content in print view) | ||
5 | #single_page_link: //div[@id='sitecontentcol']//a[contains(@href, 'print=yes')] | ||
6 | |||
7 | test_url: http://www.eetimes.com/document.asp?doc_id=1319966& | ||
8 | test_url: http://www.eetimes.com/rss_simple.asp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ekultura.hu.txt b/inc/3rdparty/site_config/standard/ekultura.hu.txt index 59f6a711..3756027c 100644..100755 --- a/inc/3rdparty/site_config/standard/ekultura.hu.txt +++ b/inc/3rdparty/site_config/standard/ekultura.hu.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1[@class='style6 nevek'] | 1 | title: //h1[@class='style6 nevek'] |
2 | 2 | ||
3 | body: //div[@class='bal3'] | 3 | body: //div[@class='bal3'] |
4 | 4 | ||
5 | 5 | ||
6 | prune: yes | 6 | prune: yes |
7 | 7 | ||
8 | tidy: yes | 8 | tidy: yes |
9 | convert_double_br_tags: yes | 9 | convert_double_br_tags: yes |
10 | 10 | ||
11 | test_url: http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december \ No newline at end of file | 11 | test_url: http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/elance.com.txt b/inc/3rdparty/site_config/standard/elance.com.txt index 52ffe2d0..d4b0a9b8 100644..100755 --- a/inc/3rdparty/site_config/standard/elance.com.txt +++ b/inc/3rdparty/site_config/standard/elance.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='jobDesc-bd']/p | 1 | body: //div[@id='jobDesc-bd']/p |
2 | 2 | ||
3 | test_url: http://www.elance.com/j/xml-technical-intergration/23687172/ \ No newline at end of file | 3 | test_url: http://www.elance.com/j/xml-technical-intergration/23687172/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/elderscrollsonline.com.txt b/inc/3rdparty/site_config/standard/elderscrollsonline.com.txt new file mode 100755 index 00000000..fa3892c6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/elderscrollsonline.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | date: //time | ||
2 | title: //h1[contains(@class, "alpha")] | ||
3 | body: //article[contains(@class, "news-post")] | ||
4 | |||
5 | # fix dates - dates as they are won't work as strtotime doesn't understand format (03.28.2013) | ||
6 | replace_string(<time class="gamma">01.): <time class="gamma">January. | ||
7 | replace_string(<time class="gamma">02.): <time class="gamma">February. | ||
8 | replace_string(<time class="gamma">03.): <time class="gamma">March. | ||
9 | replace_string(<time class="gamma">04.): <time class="gamma">April. | ||
10 | replace_string(<time class="gamma">05.): <time class="gamma">May. | ||
11 | replace_string(<time class="gamma">06.): <time class="gamma">June. | ||
12 | replace_string(<time class="gamma">07.): <time class="gamma">July. | ||
13 | replace_string(<time class="gamma">08.): <time class="gamma">August. | ||
14 | replace_string(<time class="gamma">09.): <time class="gamma">September. | ||
15 | replace_string(<time class="gamma">10.): <time class="gamma">October. | ||
16 | replace_string(<time class="gamma">11.): <time class="gamma">November. | ||
17 | replace_string(<time class="gamma">12.): <time class="gamma">December. | ||
18 | |||
19 | prune: no | ||
20 | |||
21 | test_url: http://elderscrollsonline.com/en/rss | ||
22 | test_url: http://elderscrollsonline.com/en/news/post/2013/03/27/developer-question-of-the-week-17 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/elektroniknet.de.txt b/inc/3rdparty/site_config/standard/elektroniknet.de.txt index 07664719..56fba5ff 100644..100755 --- a/inc/3rdparty/site_config/standard/elektroniknet.de.txt +++ b/inc/3rdparty/site_config/standard/elektroniknet.de.txt | |||
@@ -1,27 +1,27 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | date: //div[@class='datum'] | 2 | date: //div[@class='datum'] |
3 | single_page_link: //a[contains(@href, '?type=99')] | 3 | single_page_link: //a[contains(@href, '?type=99')] |
4 | 4 | ||
5 | # this hack preserves the intro text, because it would be striped otherwise if the title is set to //h1 | 5 | # this hack preserves the intro text, because it would be striped otherwise if the title is set to //h1 |
6 | dissolve: //div[@class='artikelMeldung'] | 6 | dissolve: //div[@class='artikelMeldung'] |
7 | 7 | ||
8 | 8 | ||
9 | strip_id_or_class: anzeige | 9 | strip_id_or_class: anzeige |
10 | strip_id_or_class: top_page_navigation | 10 | strip_id_or_class: top_page_navigation |
11 | strip_id_or_class: cr_image_container | 11 | strip_id_or_class: cr_image_container |
12 | strip_id_or_class: cr_image_reference | 12 | strip_id_or_class: cr_image_reference |
13 | strip_id_or_class: cr_image_icon | 13 | strip_id_or_class: cr_image_icon |
14 | strip_id_or_class: _close_txt | 14 | strip_id_or_class: _close_txt |
15 | strip_id_or_class: _close_ico | 15 | strip_id_or_class: _close_ico |
16 | strip_id_or_class: clearer | 16 | strip_id_or_class: clearer |
17 | 17 | ||
18 | strip://h1 | 18 | strip://h1 |
19 | strip://h6 | 19 | strip://h6 |
20 | strip://div[contains(@id, 'plista')] | 20 | strip://div[contains(@id, 'plista')] |
21 | strip://img[contains(@id,'tiny')] | 21 | strip://img[contains(@id,'tiny')] |
22 | strip://img[@class='cr_image'] | 22 | strip://img[@class='cr_image'] |
23 | 23 | ||
24 | # strip url at the top | 24 | # strip url at the top |
25 | strip: //p[@style='font-size: 10px;'] | 25 | strip: //p[@style='font-size: 10px;'] |
26 | 26 | ||
27 | test_url: http://www.elektroniknet.de/automotive/technik-know-how/sicherheitselektronik/article/87717/0/Besser_als_die_Wirklichkeit/ \ No newline at end of file | 27 | test_url: http://www.elektroniknet.de/automotive/technik-know-how/sicherheitselektronik/article/87717/0/Besser_als_die_Wirklichkeit/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/elmalpensante.com.txt b/inc/3rdparty/site_config/standard/elmalpensante.com.txt index 9fecd663..435c6c20 100644..100755 --- a/inc/3rdparty/site_config/standard/elmalpensante.com.txt +++ b/inc/3rdparty/site_config/standard/elmalpensante.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | single_page_link: //a[contains(@href, 'print_contenido')] | 1 | single_page_link: //a[contains(@href, 'print_contenido')] |
2 | title: //h2 | 2 | title: //h2 |
3 | author: //div[@class="autor"] | 3 | author: //div[@class="autor"] |
4 | test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668 \ No newline at end of file | 4 | test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/elpais.com.txt b/inc/3rdparty/site_config/standard/elpais.com.txt index 32f9fc3f..c6f9787b 100644..100755 --- a/inc/3rdparty/site_config/standard/elpais.com.txt +++ b/inc/3rdparty/site_config/standard/elpais.com.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | title: //meta[@name='DC.title']/@content | 1 | title: //meta[@name='DC.title']/@content |
2 | title: //div[contains(@class, 'cabecera_noticia')]//h1 | 2 | title: //div[contains(@class, 'cabecera_noticia')]//h1 |
3 | date: //meta[@name='DC.date']/@content | 3 | date: //meta[@name='DC.date']/@content |
4 | date: //meta[@name='date']/@content | 4 | date: //meta[@name='date']/@content |
5 | body: //div[@class='columna_texto'] | 5 | body: //div[@class='columna_texto'] |
6 | body: //div[@id='cuerpo_noticia'] | 6 | body: //div[@id='cuerpo_noticia'] |
7 | body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] | 7 | body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] |
8 | |||
9 | prune: no | ||
10 | |||
11 | strip_id_or_class: disposicion_vertical | ||
12 | strip_id_or_class: ampliar_foto | ||
13 | strip_id_or_class: utilidades | ||
14 | strip_id_or_class: info_relacionada | ||
15 | strip_id_or_class: m-kiosko | ||
16 | strip_id_or_class: info_complementa | ||
17 | |||
18 | strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] | ||
19 | strip: //div[@id='coment' or @id='foros_not'] | ||
20 | 8 | ||
21 | test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html | 9 | prune: no |
10 | |||
11 | strip_id_or_class: disposicion_vertical | ||
12 | strip_id_or_class: ampliar_foto | ||
13 | strip_id_or_class: utilidades | ||
14 | strip_id_or_class: info_relacionada | ||
15 | strip_id_or_class: m-kiosko | ||
16 | strip_id_or_class: info_complementa | ||
17 | |||
18 | strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] | ||
19 | strip: //div[@id='coment' or @id='foros_not'] | ||
20 | |||
21 | test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html | ||
22 | test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes \ No newline at end of file | 22 | test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/emaratalyoum.com.txt b/inc/3rdparty/site_config/standard/emaratalyoum.com.txt new file mode 100755 index 00000000..3d1313e2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/emaratalyoum.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id='main-column']//div[@class='content'] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601844 | ||
6 | test_url: http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601842 | ||
7 | test_url: http://www.emaratalyoum.com/public-sports-1.533088?ot=ot.AjaxPageLayout \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/en.espnf1.com.txt b/inc/3rdparty/site_config/standard/en.espnf1.com.txt index c1a91063..2ca0216b 100644..100755 --- a/inc/3rdparty/site_config/standard/en.espnf1.com.txt +++ b/inc/3rdparty/site_config/standard/en.espnf1.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | strip: //div[@class='rl'] | 2 | strip: //div[@class='rl'] |
3 | strip: //p[@class='authdesc'] | 3 | strip: //p[@class='authdesc'] |
4 | strip: //p[@class='strybtm'] | 4 | strip: //p[@class='strybtm'] |
5 | strip: //div[@id='stryFtrLft'] | 5 | strip: //div[@id='stryFtrLft'] |
6 | strip: //div[@id='f1Conversation'] | 6 | strip: //div[@id='f1Conversation'] |
7 | strip: //div[@id='cmtSpncrRuler'] | 7 | strip: //div[@id='cmtSpncrRuler'] |
8 | strip: //div[@id='stryComments'] | 8 | strip: //div[@id='stryComments'] |
9 | strip: //div[@id='athrData'] | 9 | strip: //div[@id='athrData'] |
10 | test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html \ No newline at end of file | 10 | test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/engadget.com.txt b/inc/3rdparty/site_config/standard/engadget.com.txt index 6cc6b14e..52acddb0 100644..100755 --- a/inc/3rdparty/site_config/standard/engadget.com.txt +++ b/inc/3rdparty/site_config/standard/engadget.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //div[@class='post_body'] | 2 | body: //div[@class='post_body'] |
3 | date: //*[@class='post_time'] | 3 | date: //*[@class='post_time'] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/ \ No newline at end of file | 7 | test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt b/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt index 35ace467..48f301fe 100644..100755 --- a/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt +++ b/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | body: //div[@class="post_content"] | 2 | body: //div[@class="post_content"] |
3 | author: //p[@class="author"]/a | 3 | author: //p[@class="author"]/a |
4 | date: //p[@class="date"] | 4 | date: //p[@class="date"] |
5 | strip: //h2 | 5 | strip: //h2 |
6 | strip: //header | 6 | strip: //header |
7 | test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose \ No newline at end of file | 7 | test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/english.aljazeera.net.txt b/inc/3rdparty/site_config/standard/english.aljazeera.net.txt index aed3a5f9..97365994 100644..100755 --- a/inc/3rdparty/site_config/standard/english.aljazeera.net.txt +++ b/inc/3rdparty/site_config/standard/english.aljazeera.net.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //span[@id='DetailedTitle'] | 1 | title: //span[@id='DetailedTitle'] |
2 | body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary'] | 2 | body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary'] |
3 | strip_id_or_class: sidebar | 3 | strip_id_or_class: sidebar |
4 | strip_id_or_class: Skyscrapper_Body | 4 | strip_id_or_class: Skyscrapper_Body |
5 | strip: //td[@class='DetailedSummary']/table[position() != 1] | 5 | strip: //td[@class='DetailedSummary']/table[position() != 1] |
6 | prune: no | 6 | prune: no |
7 | test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html \ No newline at end of file | 7 | test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/enikos.gr.txt b/inc/3rdparty/site_config/standard/enikos.gr.txt index e2b99bfc..ddd51c4b 100644..100755 --- a/inc/3rdparty/site_config/standard/enikos.gr.txt +++ b/inc/3rdparty/site_config/standard/enikos.gr.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@id='article']//div[contains(@class, 'inside')] | 1 | body: //div[@id='article']//div[contains(@class, 'inside')] |
2 | 2 | ||
3 | strip_id_or_class: tags | 3 | strip_id_or_class: tags |
4 | strip_id_or_class: actions | 4 | strip_id_or_class: actions |
5 | strip_id_or_class: google-ads | 5 | strip_id_or_class: google-ads |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html \ No newline at end of file | 9 | test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt b/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt index 3e7fba09..a756c457 100644..100755 --- a/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt +++ b/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | author://div[@class = 'article-author']/span[@class = 'byline'] | 1 | author://div[@class = 'article-author']/span[@class = 'byline'] |
2 | title://h1[@class = 'heading'] | 2 | title://h1[@class = 'heading'] |
3 | body://div[@id = 'related-article-links'] | 3 | body://div[@id = 'related-article-links'] |
4 | strip://div[@id = 'comment-sort-order'] | 4 | strip://div[@id = 'comment-sort-order'] |
5 | strip://div[@id = 'my-profile'] | 5 | strip://div[@id = 'my-profile'] |
6 | strip://div[@class = 'article-author'] | 6 | strip://div[@class = 'article-author'] |
7 | strip://div[@class = 'bg-f8f1d8 width-385 text-left'] | 7 | strip://div[@class = 'bg-f8f1d8 width-385 text-left'] |
8 | strip://div[@id = 'login-status'] | 8 | strip://div[@id = 'login-status'] |
9 | strip://div[@class = 'puff-padding'] | 9 | strip://div[@class = 'puff-padding'] |
10 | test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece \ No newline at end of file | 10 | test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ericsuh.com.txt b/inc/3rdparty/site_config/standard/ericsuh.com.txt new file mode 100755 index 00000000..d25140c5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ericsuh.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | date: //h6[@class='datetime']/child::text() | ||
2 | author: string("Eric J. Suh") | ||
3 | footnotes: yes | ||
4 | test_url: http://www.ericsuh.com/blog/posts/2012/8/strange-numbers.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/es.hu.txt b/inc/3rdparty/site_config/standard/es.hu.txt index 19a1e9dd..21691a56 100644..100755 --- a/inc/3rdparty/site_config/standard/es.hu.txt +++ b/inc/3rdparty/site_config/standard/es.hu.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: concat(//div[@class='doc_author'], ' - ', upper-case(//div[@class='doc_title'])) | 1 | title: concat(//div[@class='doc_author'], ' - ', upper-case(//div[@class='doc_title'])) |
2 | 2 | ||
3 | body: //div[@class='doc'] | 3 | body: //div[@class='doc'] |
4 | 4 | ||
5 | prune: yes | 5 | prune: yes |
6 | 6 | ||
7 | tidy: yes | 7 | tidy: yes |
8 | convert_double_br_tags: yes | 8 | convert_double_br_tags: yes |
9 | 9 | ||
10 | strip: //a[contains(@href, 'www.facebook.com/pages/Elet-es-Irodalom/')] | 10 | strip: //a[contains(@href, 'www.facebook.com/pages/Elet-es-Irodalom/')] |
11 | test_url: http://www.es.hu/2010-12-08_vissza-a-partpenzt \ No newline at end of file | 11 | test_url: http://www.es.hu/2010-12-08_vissza-a-partpenzt \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/escapistmagazine.com.txt b/inc/3rdparty/site_config/standard/escapistmagazine.com.txt index 7e17a04d..fd453a19 100644..100755 --- a/inc/3rdparty/site_config/standard/escapistmagazine.com.txt +++ b/inc/3rdparty/site_config/standard/escapistmagazine.com.txt | |||
@@ -1,2 +1,8 @@ | |||
1 | title: //h1[@class='headline']/div[@class='name'] | ||
2 | |||
3 | strip_image_src: 'http://cdn.themis-media.com/media/global/images/library/deriv/115/115825.png' | ||
4 | |||
5 | next_page_link: //a[@class='next_page'] | ||
6 | |||
1 | strip_comments: no | 7 | strip_comments: no |
2 | test_url: http://www.escapistmagazine.com/articles/view/columns/extraconsideration/8717-Extra-Consideration-The-Story \ No newline at end of file | 8 | test_url: http://www.escapistmagazine.com/articles/view/columns/criticalintel/10302-I-Hate-Magic \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/espn.go.com.txt b/inc/3rdparty/site_config/standard/espn.go.com.txt index 319d352b..06476296 100644..100755 --- a/inc/3rdparty/site_config/standard/espn.go.com.txt +++ b/inc/3rdparty/site_config/standard/espn.go.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 | 1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 |
2 | body: //div[contains(@class, 'article')] | 2 | body: //div[contains(@class, 'article')] |
3 | strip: //div[contains(@class, 'mod-inline')] | 3 | strip: //div[contains(@class, 'mod-inline')] |
4 | strip: //*/span[@class='page-actions'] | 4 | strip: //*/span[@class='page-actions'] |
5 | strip: //div[@class='page-actions']/* | 5 | strip: //div[@class='page-actions']/* |
6 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 | 6 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 |
7 | strip: //div[@class='mod-blog-navigation'] | 7 | strip: //div[@class='mod-blog-navigation'] |
8 | strip: //div[@class='monthday'] | 8 | strip: //div[@class='monthday'] |
9 | strip: //div[@class='time'] | 9 | strip: //div[@class='time'] |
10 | strip: //div[@class='timeofday'] | 10 | strip: //div[@class='timeofday'] |
11 | strip: //div[contains(@class, 'mod-conversations')] | 11 | strip: //div[contains(@class, 'mod-conversations')] |
12 | test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign \ No newline at end of file | 12 | test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/esquire.com.txt b/inc/3rdparty/site_config/standard/esquire.com.txt index 7566e8cc..b9cb1e55 100644..100755 --- a/inc/3rdparty/site_config/standard/esquire.com.txt +++ b/inc/3rdparty/site_config/standard/esquire.com.txt | |||
@@ -1,10 +1,11 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //div[@id='byline'] | 2 | author: //div[@id='byline'] |
3 | 3 | ||
4 | body: //div[@id='printBody'] | 4 | body: //div[@id='printBody'] |
5 | 5 | ||
6 | single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/')) | 6 | single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/')) |
7 | 7 | ||
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810 \ No newline at end of file | 10 | test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810 |
11 | test_url: http://www.esquire.com/blogs/politics/police-getting-leftover-armoured-iraq-trucks-112513 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt b/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt index 88c8c560..9a922392 100644..100755 --- a/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt +++ b/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //*[@itemprop='headline'] | 1 | title: //*[@itemprop='headline'] |
2 | author: //*[@itemprop='author'] | 2 | author: //*[@itemprop='author'] |
3 | date: //*[@itemprop='datePublished'] | 3 | date: //*[@itemprop='datePublished'] |
4 | body: //*[@itemprop='articleBody'] | 4 | body: //*[@itemprop='articleBody'] |
5 | strip: //*[contains(@class, 'instapaper_ignore')] | 5 | strip: //*[contains(@class, 'instapaper_ignore')] |
6 | test_url: http://www.essentialpublicradio.org/story/2011-11-14/volunteers-sought-federal-tax-assistance-program-pennsylvania-9421 \ No newline at end of file | 6 | test_url: http://www.essentialpublicradio.org/story/2011-11-14/volunteers-sought-federal-tax-assistance-program-pennsylvania-9421 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/etc.se.txt b/inc/3rdparty/site_config/standard/etc.se.txt index 58da5ef7..95f8cf78 100644..100755 --- a/inc/3rdparty/site_config/standard/etc.se.txt +++ b/inc/3rdparty/site_config/standard/etc.se.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | strip_id_or_class: 'left' | 1 | strip_id_or_class: 'left' |
2 | strip_id_or_class: 'right' | 2 | strip_id_or_class: 'right' |
3 | strip_id_or_class: 'block-belowcontent' | 3 | strip_id_or_class: 'block-belowcontent' |
4 | author: //span[@class = 'name']/a | 4 | author: //span[@class = 'name']/a |
5 | date: //div[@class= 'datum'] | 5 | date: //div[@class= 'datum'] |
6 | test_url: http://www.etc.se/intervju/lonsamt-att-radda-jorden \ No newline at end of file | 6 | test_url: http://www.etc.se/intervju/lonsamt-att-radda-jorden \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt b/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt index bfa2c5dc..bfa2c5dc 100644..100755 --- a/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt +++ b/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/eurogamer.net.txt b/inc/3rdparty/site_config/standard/eurogamer.net.txt index 6ecdf6bd..8a351667 100644..100755 --- a/inc/3rdparty/site_config/standard/eurogamer.net.txt +++ b/inc/3rdparty/site_config/standard/eurogamer.net.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | body: //div[ @class='content' ] | //div[ @class='blog-entry' ] | 1 | body: //div[ @class='content' ] | //div[ @class='blog-entry' ] |
2 | 2 | ||
3 | strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')] | 3 | strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')] |
4 | 4 | ||
5 | date://p[ @class='timestamp' ] | 5 | date://p[ @class='timestamp' ] |
6 | 6 | ||
7 | author://a[ @class='eurogamer-author' ] | 7 | author://a[ @class='eurogamer-author' ] |
8 | test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4 \ No newline at end of file | 8 | test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/evo.co.uk.txt b/inc/3rdparty/site_config/standard/evo.co.uk.txt index 07162513..ccb4f879 100644..100755 --- a/inc/3rdparty/site_config/standard/evo.co.uk.txt +++ b/inc/3rdparty/site_config/standard/evo.co.uk.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | author: substring-after(//div[@class='articleauthor'],'By ') | 1 | author: substring-after(//div[@class='articleauthor'],'By ') |
2 | 2 | ||
3 | # Blog posts | 3 | # Blog posts |
4 | date: //div[@class='articledate'] | 4 | date: //div[@class='articledate'] |
5 | # News | 5 | # News |
6 | date: //div[@class='articledate_b'] | 6 | date: //div[@class='articledate_b'] |
7 | 7 | ||
8 | body: //div[@class='articletext'] | 8 | body: //div[@class='articletext'] |
9 | 9 | ||
10 | convert_double_br_tags: yes | 10 | convert_double_br_tags: yes |
11 | test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html \ No newline at end of file | 11 | test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/expressen.se.txt b/inc/3rdparty/site_config/standard/expressen.se.txt index d0cb283e..d81d3251 100644..100755 --- a/inc/3rdparty/site_config/standard/expressen.se.txt +++ b/inc/3rdparty/site_config/standard/expressen.se.txt | |||
@@ -1,9 +1,10 @@ | |||
1 | title: //div[@id='article']/div[contains(@class, 'content')]/h1 | 1 | title: //h1[contains(@class, 'b-headline_article')] |
2 | body: //div[@id='article']/div[contains(@class, 'content')] | 2 | body: //div[contains(@class, 'b-article_print')] |
3 | date: //div[contains(@class, 'article-slot')]/descendant::div[contains(@id, 'articledates')] | 3 | |
4 | 4 | single_page_link: //div[contains(@class, 'b-page__footer__actions')]//a[contains(@href, 'print=true')] | |
5 | strip: //img[contains(@src, 'img/px.gif')] | 5 | |
6 | prune: no | 6 | prune: no |
7 | # remove Facebook banner and obtrusive ad | 7 | |
8 | strip: //div[@id='article']/div[contains(@class, 'content')]/div[contains(@class, 'art-right')] | 8 | test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at |
9 | test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at \ No newline at end of file | 9 | test_url: http://www.expressen.se/gt/polis-om-styckmordet-extremt-markligt-fall/ |
10 | test_url: http://www.expressen.se/Pages/OutboundFeedsPage.aspx?id=3642159&viewstyle=rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/extracine.com.txt b/inc/3rdparty/site_config/standard/extracine.com.txt index 52b598da..52b598da 100644..100755 --- a/inc/3rdparty/site_config/standard/extracine.com.txt +++ b/inc/3rdparty/site_config/standard/extracine.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/f1actual.com.txt b/inc/3rdparty/site_config/standard/f1actual.com.txt index 6ef2738a..6ef2738a 100644..100755 --- a/inc/3rdparty/site_config/standard/f1actual.com.txt +++ b/inc/3rdparty/site_config/standard/f1actual.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/facebook.com.txt b/inc/3rdparty/site_config/standard/facebook.com.txt new file mode 100755 index 00000000..6a492767 --- /dev/null +++ b/inc/3rdparty/site_config/standard/facebook.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id='imagestage'] | ||
2 | prune: no | ||
3 | tidy: no | ||
4 | |||
5 | test_url: https://www.facebook.com/feeds/page.php?id=338077742912613&format=rss20 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/facta.co.jp.txt b/inc/3rdparty/site_config/standard/facta.co.jp.txt index c17e0b8c..4c96a1a4 100644..100755 --- a/inc/3rdparty/site_config/standard/facta.co.jp.txt +++ b/inc/3rdparty/site_config/standard/facta.co.jp.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | bosdy: //div[@class='content'] | 1 | bosdy: //div[@class='content'] |
2 | 2 | ||
3 | test_url: http://facta.co.jp/blog/archives/20111026001026.html \ No newline at end of file | 3 | test_url: http://facta.co.jp/blog/archives/20111026001026.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/falter.at.txt b/inc/3rdparty/site_config/standard/falter.at.txt index b941b740..2bfcc9b4 100644..100755 --- a/inc/3rdparty/site_config/standard/falter.at.txt +++ b/inc/3rdparty/site_config/standard/falter.at.txt | |||
@@ -1,18 +1,14 @@ | |||
1 | title: //h2[@class='related relatedTitle'] | 1 | title: //h1 |
2 | author: //a[contains(@href, 'liste.php?author_id')] | 2 | author: //a[contains(@href, '/kategorie/autoren')] |
3 | 3 | date: //a[contains(@href, '/falter/ausgabe')] | |
4 | # can't think of a better way unfortunately, really bad markup on this site | 4 | body: //article[@class='spanMain'] |
5 | date: substring-after(//td[@style='width:85%;'], 'vom') | 5 | |
6 | 6 | # cleanup | |
7 | # not sure why, but instapaper seems to suck up the teaser paragraph | 7 | strip_id_or_class: 'respond' |
8 | # not solved! | 8 | strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif'] |
9 | body: //div[contains(@class, 'teaser')] | 9 | strip_id_or_class: 'meta' |
10 | body: //div[@id='content'] | 10 | strip_id_or_class: 'servicebox' |
11 | 11 | strip_id_or_class: 'related' | |
12 | # cleanup | 12 | strip_id_or_class: 'twitter-share-button' |
13 | strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif'] | 13 | strip: //br |
14 | strip: //div[@class='servicebox'] | 14 | test_url: http://www.falter.at/falter/2013/03/26/der-dandy-auf-der-sinkenden-galeere/ \ No newline at end of file |
15 | strip: //h1 | ||
16 | strip: //br | ||
17 | strip: //td[@id='adcol'] | ||
18 | test_url: http://www.falter.at/web/print/detail.php?id=1634 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fanfiction.net.txt b/inc/3rdparty/site_config/standard/fanfiction.net.txt index 8d0c4daf..e7cab4d4 100644..100755 --- a/inc/3rdparty/site_config/standard/fanfiction.net.txt +++ b/inc/3rdparty/site_config/standard/fanfiction.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //*[@id = 'story text'] | 1 | body: //*[@id = 'story text'] |
2 | author: //a[starts-with(@href, '/u/')] | 2 | author: //a[starts-with(@href, '/u/')] |
3 | next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") | 3 | next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") |
4 | autodetect_next_page:yes | 4 | autodetect_next_page:yes |
5 | strip_id_or_class: 'a2a_kit' | 5 | strip_id_or_class: 'a2a_kit' |
6 | test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love \ No newline at end of file | 6 | test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/fastcompany.com.txt b/inc/3rdparty/site_config/standard/fastcompany.com.txt index 5547a76c..a6417237 100644..100755 --- a/inc/3rdparty/site_config/standard/fastcompany.com.txt +++ b/inc/3rdparty/site_config/standard/fastcompany.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //h5[@class='byline']//a | 2 | author: //h5[@class='byline']//a |
3 | date: //h5[@class='date'] | 3 | date: //h5[@class='date'] |
4 | body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")] | 4 | body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")] |
5 | strip_id_or_class: article-top-wrapper | 5 | strip_id_or_class: article-top-wrapper |
6 | strip_id_or_class: footer-message | 6 | strip_id_or_class: footer-message |
7 | strip_id_or_class: print-logo | 7 | strip_id_or_class: print-logo |
8 | strip: //cite | 8 | strip: //cite |
9 | strip://*[@class='timestamp'] | 9 | strip://*[@class='timestamp'] |
10 | strip://div[@id='page_right'] | 10 | strip://div[@id='page_right'] |
11 | strip://section[@id='header_region'] | 11 | strip://section[@id='header_region'] |
12 | strip://h1[@class='node-title'] | 12 | strip://h1[@class='node-title'] |
13 | strip://div[@class='node-submitted'] | 13 | strip://div[@class='node-submitted'] |
14 | strip_id_or_class: skipnav | 14 | strip_id_or_class: skipnav |
15 | test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity | 15 | test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity |
16 | test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day \ No newline at end of file | 16 | test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt index 4fe5968b..d087d2aa 100644..100755 --- a/inc/3rdparty/site_config/standard/faz.net.txt +++ b/inc/3rdparty/site_config/standard/faz.net.txt | |||
@@ -1,30 +1,36 @@ | |||
1 | # Title | 1 | # Title |
2 | title: //p[@class='Content HeadlineShort'] | 2 | title: //p[@class='Content HeadlineShort'] |
3 | 3 | ||
4 | # Authors | 4 | # Authors |
5 | # some are known and have a link, others don't | 5 | # some are known and have a link, others don't |
6 | author: substring-after(//span[@class='Autor'], 'Von') | 6 | author: substring-after(//span[@class='Autor'], 'Von') |
7 | 7 | ||
8 | # Date | 8 | # Date |
9 | date: //span[@class='Datum'] | 9 | date: //span[@class='Datum'] |
10 | 10 | ||
11 | # Body | 11 | # Body |
12 | body: //div[@class='Artikel'] | 12 | body: //div[@class='Artikel'] |
13 | 13 | ||
14 | # Removements before body text | 14 | # Removements before body text |
15 | strip: //div[@class='Breadcrumbs'] | 15 | strip: //div[@class='Breadcrumbs'] |
16 | strip: //div[@class='QuickSearchBox'] | 16 | strip: //div[@class='QuickSearchBox'] |
17 | strip: //div[@class='FAZArtikelEinleitung'] | 17 | strip: //div[@class='FAZArtikelEinleitung'] |
18 | strip: //div[@class='FAZArtikelReiter'] | 18 | strip: //div[@class='FAZArtikelReiter'] |
19 | strip: //div[@class='clear'] | 19 | strip: //div[@class='clear'] |
20 | 20 | ||
21 | # General removements | 21 | # General removements |
22 | strip: //span[@class='Bildnachweis'] | 22 | strip: //span[@class='Bildnachweis'] |
23 | 23 | strip: //img[@class='MediaIcon'] | |
24 | # Removements after body text | 24 | strip: //div[@class='ArtikelMediaLink'] |
25 | strip: //div[@class='ArtikelAbbinder'] | 25 | dissolve: //a[img] |
26 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] | 26 | |
27 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] | 27 | # Removements after body text |
28 | strip: //div[@class='FAZArtikelFunktionen'] | 28 | strip: //div[@class='ArtikelAbbinder'] |
29 | strip: //div[@id='FAZContentRight'] | 29 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] |
30 | test_url: http://www.faz.net/aktuell/gesellschaft/ehe-haltbarkeitsformel-verliebe-dich-oft-verlobe-dich-selten-heirate-vielleicht-11685306.html \ No newline at end of file | 30 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] |
31 | strip: //div[@class='FAZArtikelFunktionen'] | ||
32 | strip: //div[@id='FAZContentRight'] | ||
33 | |||
34 | # Fix picture captions | ||
35 | wrap_in(small): //span[@class='Bildunterschrift']/text() | ||
36 | test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fertigung.de.txt b/inc/3rdparty/site_config/standard/fertigung.de.txt new file mode 100755 index 00000000..90145e58 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fertigung.de.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | title: //title | ||
2 | |||
3 | body: //div[@id='content'] | ||
4 | |||
5 | strip: (//div[@id='content']/h2)[1] | ||
6 | |||
7 | strip: //h2[contains(., 'mehr News')]/following::* | ||
8 | strip: //h2[contains(., 'mehr News')] | ||
9 | |||
10 | strip: //div[contains(@class, 'indizar')]/following::* | ||
11 | strip: //div[contains(@class, 'indizar')] | ||
12 | |||
13 | strip: //h1[contains(@class, 'single')]/preceding::* | ||
14 | strip: //h1[contains(@class, 'single')] | ||
15 | |||
16 | strip_id_or_class: plista_widget | ||
17 | |||
18 | prune: no | ||
19 | |||
20 | next_page_link: //a[contains(., 'Weiter')] | ||
21 | |||
22 | test_url: http://www.fertigung.de/2013/04/igus-neuer-energiekettenkatalog/ | ||
23 | test_url: http://www.fertigung.de/2013/04/dynamisch-und-hochpraezise/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fictionpress.com.txt b/inc/3rdparty/site_config/standard/fictionpress.com.txt index 4a04e832..19ec16b0 100644..100755 --- a/inc/3rdparty/site_config/standard/fictionpress.com.txt +++ b/inc/3rdparty/site_config/standard/fictionpress.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: id('storytext') | 1 | body: id('storytext') |
2 | author: //a[starts-with(@href, '/u/')] | 2 | author: //a[starts-with(@href, '/u/')] |
3 | #next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") | 3 | #next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") |
4 | strip_id_or_class: 'a2a_kit' | 4 | strip_id_or_class: 'a2a_kit' |
5 | test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew \ No newline at end of file | 5 | test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ficwad.com.txt b/inc/3rdparty/site_config/standard/ficwad.com.txt index 3dbfe76f..081f0bb0 100644..100755 --- a/inc/3rdparty/site_config/standard/ficwad.com.txt +++ b/inc/3rdparty/site_config/standard/ficwad.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h4 | 1 | title: //h4 |
2 | author: //span[@class="author"] | 2 | author: //span[@class="author"] |
3 | body: //div[@id="story"] | 3 | body: //div[@id="story"] |
4 | strip_id_or_class: summary | 4 | strip_id_or_class: summary |
5 | strip_id_or_class: meta | 5 | strip_id_or_class: meta |
6 | strip_id_or_class: storyfoot | 6 | strip_id_or_class: storyfoot |
7 | convert_double_br_tags: yes | 7 | convert_double_br_tags: yes |
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | # Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface. | 10 | # Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface. |
11 | 11 | ||
12 | test_url: http://www.ficwad.com/story/158977 \ No newline at end of file | 12 | test_url: http://www.ficwad.com/story/158977 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/finance.yahoo.com.txt b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt index 81c18fd3..248522cb 100644..100755 --- a/inc/3rdparty/site_config/standard/finance.yahoo.com.txt +++ b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | body: //div[@id='y-article-bd'] | 2 | body: //div[@id='y-article-bd'] |
3 | body: //div[contains(@class, 'yom-art-content')] | 3 | body: //div[contains(@class, 'yom-art-content')] |
4 | strip: //div[contains(@class, 'related-companies')] | 4 | strip: //div[contains(@class, 'related-companies')] |
5 | strip: //div[@id='y-article-related'] | 5 | strip: //div[@id='y-article-related'] |
6 | strip: //div[@id='ypf-article-related'] | 6 | strip: //div[@id='ypf-article-related'] |
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] | 9 | single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] |
10 | 10 | ||
11 | test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1 | 11 | test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1 |
12 | test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html \ No newline at end of file | 12 | test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt b/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt index 1a5cd2e1..43aef750 100644..100755 --- a/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt +++ b/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | date: //div[@class='notes']/a | 1 | date: //div[@class='notes']/a |
2 | body: //div[@id='content'] | 2 | body: //div[@id='content'] |
3 | 3 | ||
4 | strip_id_or_class: tags | 4 | strip_id_or_class: tags |
5 | strip_id_or_class: permalink | 5 | strip_id_or_class: permalink |
6 | strip_id_or_class: notes | 6 | strip_id_or_class: notes |
7 | strip_id_or_class: post_nav | 7 | strip_id_or_class: post_nav |
8 | strip: //div[@id='content']//h2 | 8 | strip: //div[@id='content']//h2 |
9 | strip_id_or_class: right_column | 9 | strip_id_or_class: right_column |
10 | test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final \ No newline at end of file | 10 | test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/firstthings.com.txt b/inc/3rdparty/site_config/standard/firstthings.com.txt index dd56da22..ce972bac 100644..100755 --- a/inc/3rdparty/site_config/standard/firstthings.com.txt +++ b/inc/3rdparty/site_config/standard/firstthings.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='articleTitle'] | 1 | title: //div[@class='articleTitle'] |
2 | author: //div[@class='articleAuthor'] | 2 | author: //div[@class='articleAuthor'] |
3 | body: //div[@class='articleContent'] | 3 | body: //div[@class='articleContent'] |
4 | prune: no | 4 | prune: no |
5 | convert_double_br_tags: yes | 5 | convert_double_br_tags: yes |
6 | 6 | ||
7 | test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand \ No newline at end of file | 7 | test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/fivechapters.com.txt b/inc/3rdparty/site_config/standard/fivechapters.com.txt index d9c5e42e..d9c5e42e 100644..100755 --- a/inc/3rdparty/site_config/standard/fivechapters.com.txt +++ b/inc/3rdparty/site_config/standard/fivechapters.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/fivefilters.org.txt b/inc/3rdparty/site_config/standard/fivefilters.org.txt index dc1db432..dc1db432 100644..100755 --- a/inc/3rdparty/site_config/standard/fivefilters.org.txt +++ b/inc/3rdparty/site_config/standard/fivefilters.org.txt | |||
diff --git a/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt b/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt index 3d7b45a8..d0a0a772 100644..100755 --- a/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt +++ b/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: substring-after(//title, 'Right:') | 1 | title: substring-after(//title, 'Right:') |
2 | body: //div[@class = 'post-body'] | 2 | body: //div[@class = 'post-body'] |
3 | author: substring-after(//*[@class='post-author'], 'by') | 3 | author: substring-after(//*[@class='post-author'], 'by') |
4 | date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a) | 4 | date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a) |
5 | convert_double_br_tags: yes | 5 | convert_double_br_tags: yes |
6 | 6 | ||
7 | test_url: http://www.fivethirtyeight.com/2010/07/does-rnc-have-structural-problems.html \ No newline at end of file | 7 | test_url: http://www.fivethirtyeight.com/2010/07/does-rnc-have-structural-problems.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/flyingmachinestudios.com.txt b/inc/3rdparty/site_config/standard/flyingmachinestudios.com.txt new file mode 100755 index 00000000..2053f801 --- /dev/null +++ b/inc/3rdparty/site_config/standard/flyingmachinestudios.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_id_or_class: linenos | ||
2 | test_url: http://www.flyingmachinestudios.com/programming/whoops-dci-refactoring/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fm4.orf.at.txt b/inc/3rdparty/site_config/standard/fm4.orf.at.txt index 32d44c87..5db3e58c 100644..100755 --- a/inc/3rdparty/site_config/standard/fm4.orf.at.txt +++ b/inc/3rdparty/site_config/standard/fm4.orf.at.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | author: //div[@class='authorDescription']/h2 | 1 | author: //div[@class='authorDescription']/h2 |
2 | body: //div[@id='story'] | 2 | body: //div[@id='story'] |
3 | date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-') | 3 | date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-') |
4 | title: //h1[@class='detail'] | 4 | title: //h1[@class='detail'] |
5 | strip: //div[@class='fact'] | 5 | strip: //div[@class='fact'] |
6 | 6 | ||
7 | test_url: http://fm4.orf.at/stories/1689156/ \ No newline at end of file | 7 | test_url: http://fm4.orf.at/stories/1689156/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/fnal.gov.txt b/inc/3rdparty/site_config/standard/fnal.gov.txt index 7faa6bfc..e404ccb8 100644..100755 --- a/inc/3rdparty/site_config/standard/fnal.gov.txt +++ b/inc/3rdparty/site_config/standard/fnal.gov.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: normalize(//h1) | 1 | title: normalize(//h1) |
2 | 2 | ||
3 | author: //td/p[position()=last()]/em | 3 | author: //td/p[position()=last()]/em |
4 | 4 | ||
5 | # I swear, this is really the best way to do this | 5 | # I swear, this is really the best way to do this |
6 | date: normalize(//td[contains(@style, "color: #ffffff")]) | 6 | date: normalize(//td[contains(@style, "color: #ffffff")]) |
7 | 7 | ||
8 | # my god, it's full of tables | 8 | # my god, it's full of tables |
9 | body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td | 9 | body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td |
10 | strip: //h1 | 10 | strip: //h1 |
11 | 11 | ||
12 | # the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output. | 12 | # the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output. |
13 | strip: //p[position()=last()]/em | 13 | strip: //p[position()=last()]/em |
14 | strip: //p[position()=last()]/child::text() | 14 | strip: //p[position()=last()]/child::text() |
15 | test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html \ No newline at end of file | 15 | test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/focus.de.txt b/inc/3rdparty/site_config/standard/focus.de.txt index 3ad5cabf..6da3687e 100644..100755 --- a/inc/3rdparty/site_config/standard/focus.de.txt +++ b/inc/3rdparty/site_config/standard/focus.de.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] | 3 | author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] |
4 | 4 | ||
5 | date: //div[@class='articleHead']/span[@class='created'] | 5 | date: //div[@class='articleHead']/span[@class='created'] |
6 | 6 | ||
7 | body: //div[@id='article'] | 7 | body: //div[@id='article'] |
8 | 8 | ||
9 | strip: //span[@class='markerText'] | 9 | strip: //span[@class='markerText'] |
10 | strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] | 10 | strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] |
11 | strip: //div[@class='sidebar'] | 11 | strip: //div[@class='sidebar'] |
12 | strip: //div[@class='starbar'] | 12 | strip: //div[@class='starbar'] |
13 | strip: //div[@class='actions clearfix'] | 13 | strip: //div[@class='actions clearfix'] |
14 | strip: //div[@id='commentForm'] | 14 | strip: //div[@id='commentForm'] |
15 | strip: //div[@id='commentSent'] | 15 | strip: //div[@id='commentSent'] |
16 | strip: //div[@id='comments'] | 16 | strip: //div[@id='comments'] |
17 | strip: //div[@class='similarityBlock'] | 17 | strip: //div[@class='similarityBlock'] |
18 | 18 | ||
19 | test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html \ No newline at end of file | 19 | test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/folklore.org.txt b/inc/3rdparty/site_config/standard/folklore.org.txt new file mode 100755 index 00000000..ed23a0b6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/folklore.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[1]/td[2] | ||
2 | date: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[2]/td[2] | ||
3 | body: //div[@class='main'] | ||
4 | test_url: http://www.folklore.org/StoryView.py?story=Calculator_Construction_Set.txt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/food.com.txt b/inc/3rdparty/site_config/standard/food.com.txt new file mode 100755 index 00000000..a70da766 --- /dev/null +++ b/inc/3rdparty/site_config/standard/food.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@id='print-area'] | ||
2 | title: //h1[contains(@class, 'section-title')] | ||
3 | single_page_link: //a[@id='prntrec'] | ||
4 | strip_image_src: food-logo-small | ||
5 | strip_id_or_class: timer | ||
6 | strip_id_or_class: photo-sm | ||
7 | strip_id_or_class: page-header | ||
8 | |||
9 | prune: no | ||
10 | |||
11 | test_url: http://www.food.com/recipe/couldnt-be-easier-bbq-pork-tenderloin-crock-pot-317152 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fool.com.txt b/inc/3rdparty/site_config/standard/fool.com.txt index 69867ccb..89cb8b9a 100644..100755 --- a/inc/3rdparty/site_config/standard/fool.com.txt +++ b/inc/3rdparty/site_config/standard/fool.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class='entry-content'] | 1 | body: //div[@class='entry-content'] |
2 | date: //meta[@name="date"]/@content | 2 | date: //meta[@name="date"]/@content |
3 | author: //meta[@name="author"]/@content | 3 | author: //meta[@name="author"]/@content |
4 | 4 | ||
5 | strip_id_or_class: ecapShell | 5 | strip_id_or_class: ecapShell |
6 | strip_id_or_class: noindent | 6 | strip_id_or_class: noindent |
7 | strip_id_or_class: targetedPromotion | 7 | strip_id_or_class: targetedPromotion |
8 | 8 | ||
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx \ No newline at end of file | 11 | test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/forbes.com.txt b/inc/3rdparty/site_config/standard/forbes.com.txt index 2381b56a..9e1d04c1 100644..100755 --- a/inc/3rdparty/site_config/standard/forbes.com.txt +++ b/inc/3rdparty/site_config/standard/forbes.com.txt | |||
@@ -1,16 +1,27 @@ | |||
1 | title: //hgroup//h1 | 1 | title: //hgroup//h1 |
2 | title: //span[@class='mainarttitle'] | 2 | title: //span[@class='mainarttitle'] |
3 | 3 | ||
4 | body: //div[@id='leftRail']//div[contains(@class, 'body')] | 4 | body: //div[@id='leftRail']//div[contains(@class, 'body')] |
5 | 5 | ||
6 | author: //meta[@name="author"]/@content | 6 | author: //meta[@name="author"]/@content |
7 | author: //span[@class='mainartauthor'] | 7 | author: //span[@class='mainartauthor'] |
8 | 8 | ||
9 | date: substring-before(//hgroup//h6, '@') | 9 | date: substring-before(//hgroup//h6, '@') |
10 | date: //span[@class='mainartdate'] | 10 | date: //span[@class='mainartdate'] |
11 | 11 | ||
12 | prune: no | 12 | prune: no |
13 | 13 | strip: //aside | |
14 | single_page_link: //a[contains(@href, '/print/')] | 14 | strip_id_or_class: sticky_sharing |
15 | 15 | strip_id_or_class: pagination | |
16 | test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html \ No newline at end of file | 16 | strip_id_or_class: controlsbox |
17 | strip_id_or_class: storyboxes | ||
18 | strip_id_or_class: sponsoredlinks | ||
19 | strip_id_or_class: nextpage | ||
20 | strip_id_or_class: contextuallinks | ||
21 | strip_id_or_class: article_actions | ||
22 | strip_id_or_class: engagement_block | ||
23 | |||
24 | single_page_link: //a[contains(@href, '/print/')] | ||
25 | |||
26 | test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html | ||
27 | test_url: http://www.forbes.com/sites/bruceupbin/2012/09/11/the-iphone-5-winners-and-losers/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/foreignaffairs.com.txt b/inc/3rdparty/site_config/standard/foreignaffairs.com.txt new file mode 100755 index 00000000..cf8b742f --- /dev/null +++ b/inc/3rdparty/site_config/standard/foreignaffairs.com.txt | |||
@@ -0,0 +1,34 @@ | |||
1 | # TIDY | ||
2 | #tidy: no | ||
3 | # PRUNE | ||
4 | #prune: no | ||
5 | |||
6 | # SINGLE PAGE | ||
7 | single_page_link: //div[@class='showlinks']/a | ||
8 | |||
9 | # TITLE | ||
10 | title: //h1[@class="title"] | ||
11 | |||
12 | # AUTHOR | ||
13 | author: //div[contains(@class,"field-field-article-display-authors")]/div/div/a/text() | ||
14 | |||
15 | # DATE | ||
16 | date: //div[contains(@class,"field-field-article-issue")]/div/div/a/text() | //span[@class="date-display-single"] | ||
17 | |||
18 | # BODY | ||
19 | body: //div[contains(@class,"content-resize")] | ||
20 | |||
21 | # Remove clutter | ||
22 | strip: //div[@class="article-sidebar"] | ||
23 | strip: //div[@class="showlinks"] | ||
24 | strip: //div[contains(@class,"premium-box")] | ||
25 | strip: //div[contains(@class,"premium-box")] | ||
26 | strip: //table[contains(@border,"2")] | ||
27 | |||
28 | # Fix picture captions | ||
29 | wrap_in(small): //p/img/following-sibling::em | ||
30 | wrap_in(small): //p[img]/text() | ||
31 | |||
32 | # Fix sub-headlines | ||
33 | wrap_in(h3): //div[contains(@class,"field-field-article-subtitle")]/div/div/text() | ||
34 | test_url: http://www.foreignaffairs.com/articles/138810/pierre-n-leval/the-long-arm-of-international-law \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/foreignpolicy.com.txt b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt index 6ab7a091..4e84b989 100644..100755 --- a/inc/3rdparty/site_config/standard/foreignpolicy.com.txt +++ b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt | |||
@@ -1,11 +1,15 @@ | |||
1 | title: //div[@id='art-mast']//h1 | 1 | title: //div[@class='translateHead']//h1 | //div[@id='art-mast']//h1 |
2 | author: substring-after(//span[@id='by-line'], 'BY ') | 2 | author: substring-after(//span[@id='by-line'], 'BY ') |
3 | date: //span[@id='pub-date'] | 3 | date: //span[@id='pub-date'] |
4 | body: //div[@id='art-mast']//h2 | //div[@id='art-mast']/h3 | //div[@id='art-body']//div[@class='translateBody'] | 4 | body: //div[@id='art-mast']/h2 | //div[@class='translateBody'] | //div[@id='art-body'] |
5 | strip: //div[@id='share-box'] | 5 | #Strip inside article content |
6 | prune: no | 6 | strip: //div[@id='share-box'] |
7 | 7 | strip: //div[@id='special-box'] | |
8 | single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')] | 8 | |
9 | 9 | prune: no | |
10 | test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me | 10 | |
11 | test_url: test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus \ No newline at end of file | 11 | single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')] |
12 | single_page_link: //a[text()='SINGLE PAGE'] | ||
13 | |||
14 | test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me | ||
15 | test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/forsvaret.no.txt b/inc/3rdparty/site_config/standard/forsvaret.no.txt index 3085c8f2..c1bd2bac 100644..100755 --- a/inc/3rdparty/site_config/standard/forsvaret.no.txt +++ b/inc/3rdparty/site_config/standard/forsvaret.no.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@class="articleHeader"]/h1 | 1 | title: //div[@class="articleHeader"]/h1 |
2 | author: //p[@class="byline"] | 2 | author: //p[@class="byline"] |
3 | date: //p[contains(@class,"publishedDate")]/span | 3 | date: //p[contains(@class,"publishedDate")]/span |
4 | # remove the right menu | 4 | # remove the right menu |
5 | strip: //div[contains(@class,"aside")] | 5 | strip: //div[contains(@class,"aside")] |
6 | # remove some SharePoint webpart label junk | 6 | # remove some SharePoint webpart label junk |
7 | strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"] | 7 | strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"] |
8 | strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"] | 8 | strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"] |
9 | test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx \ No newline at end of file | 9 | test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/foxnews.com.txt b/inc/3rdparty/site_config/standard/foxnews.com.txt index f1ee4851..e19c77db 100644..100755 --- a/inc/3rdparty/site_config/standard/foxnews.com.txt +++ b/inc/3rdparty/site_config/standard/foxnews.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | prune: no | 1 | prune: no |
2 | 2 | ||
3 | author: //meta[@name="dc.publisher"]/@content | 3 | author: //meta[@name="dc.publisher"]/@content |
4 | date: //meta[@name="dc.date"]/@content | 4 | date: //meta[@name="dc.date"]/@content |
5 | strip: //p[contains(@class, 'contributor vcard')] | 5 | strip: //p[contains(@class, 'contributor vcard')] |
6 | replace_string(<ul><li><div class="photo">): <div class="photo"> | 6 | replace_string(<ul><li><div class="photo">): <div class="photo"> |
7 | strip: //p[a[contains(., 'Click here to read more on this story ')]] | 7 | strip: //p[a[contains(., 'Click here to read more on this story ')]] |
8 | 8 | ||
9 | test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/ \ No newline at end of file | 9 | test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/freelancer.com.txt b/inc/3rdparty/site_config/standard/freelancer.com.txt index f3d5425c..78d37729 100644..100755 --- a/inc/3rdparty/site_config/standard/freelancer.com.txt +++ b/inc/3rdparty/site_config/standard/freelancer.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id="projectDetailsContent"]//td | 1 | body: //div[@id="projectDetailsContent"]//td |
2 | 2 | ||
3 | test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html \ No newline at end of file | 3 | test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/freytag-film.com.txt b/inc/3rdparty/site_config/standard/freytag-film.com.txt index 8dc0dabc..c83f8303 100644..100755 --- a/inc/3rdparty/site_config/standard/freytag-film.com.txt +++ b/inc/3rdparty/site_config/standard/freytag-film.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class = 'instapaperbody'] | 1 | body: //div[@class = 'instapaperbody'] |
2 | convert_double_br_tags: no | 2 | convert_double_br_tags: no |
3 | date: //div[@class='instadate'] | 3 | date: //div[@class='instadate'] |
4 | title: //h2[@class = 'instatitle'] | 4 | title: //h2[@class = 'instatitle'] |
5 | test_url: http://freytag-film.com/blog/artikel/shooting_a_feature_film_in_10_days \ No newline at end of file | 5 | test_url: http://freytag-film.com/blog/artikel/shooting_a_feature_film_in_10_days \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/fria.nu.txt b/inc/3rdparty/site_config/standard/fria.nu.txt new file mode 100755 index 00000000..9d8eff97 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fria.nu.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.fria.nu/artikel/112079 | ||
8 | test_url: http://www.fria.nu/taxonomy/term/1928/all/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/friatidningen.se.txt b/inc/3rdparty/site_config/standard/friatidningen.se.txt new file mode 100755 index 00000000..1e4abc5a --- /dev/null +++ b/inc/3rdparty/site_config/standard/friatidningen.se.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.friatidningen.se/artikel/112074 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/friendskorner.com.txt b/inc/3rdparty/site_config/standard/friendskorner.com.txt index 39a9973f..b067d88a 100644..100755 --- a/inc/3rdparty/site_config/standard/friendskorner.com.txt +++ b/inc/3rdparty/site_config/standard/friendskorner.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | 1 | #body: (//div[@class='ftr-yt-vid'])[1] |
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | 2 | body: (//blockquote[contains(@class, 'postcontent')])[1] |
3 | body: (//div[starts-with(@id, 'post_message')])[1] | 3 | body: (//div[starts-with(@id, 'post_message')])[1] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | 7 | ||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | 8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" |
9 | #replace_string(</iframe>): </iframe> </div> | 9 | #replace_string(</iframe>): </iframe> </div> |
10 | 10 | ||
11 | test_url: http://www.friendskorner.com/forum/f137/debate-personal-lives-leaders-west-vs-pakistan-must-read-297989/ \ No newline at end of file | 11 | test_url: http://www.friendskorner.com/forum/f137/debate-personal-lives-leaders-west-vs-pakistan-must-read-297989/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ft.com.txt b/inc/3rdparty/site_config/standard/ft.com.txt index 38d9d326..e66b9603 100644..100755 --- a/inc/3rdparty/site_config/standard/ft.com.txt +++ b/inc/3rdparty/site_config/standard/ft.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[contains(@class, 'ft-story-body')] | 1 | body: //div[contains(@class, 'ft-story-body')] |
2 | 2 | ||
3 | author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ') | 3 | author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ') |
4 | date: substring-before(substring-after(//div[contains(@class, 'ft-story-header')]/p[2], 'Published:'), '|') | 4 | date: substring-before(substring-after(//div[contains(@class, 'ft-story-header')]/p[2], 'Published:'), '|') |
5 | test_url: http://www.ft.com/cms/s/2/e1be4b5a-620c-11e0-8ee4-00144feab49a.html \ No newline at end of file | 5 | test_url: http://www.ft.com/cms/s/2/e1be4b5a-620c-11e0-8ee4-00144feab49a.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ftchinese.com.txt b/inc/3rdparty/site_config/standard/ftchinese.com.txt new file mode 100755 index 00000000..5c94d9b0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ftchinese.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | # Modified to define the single_page_link | ||
2 | # This filter is tested on: | ||
3 | # http://www.ftchinese.com/story/001047373 | ||
4 | # http://www.ftchinese.com/story/001047631 | ||
5 | # http://www.ftchinese.com/story/001047622/?print=y | ||
6 | # http://www.ftchinese.com/story/001049052 | ||
7 | # http://www.ftchinese.com/story/001049088 | ||
8 | |||
9 | title:substring-before(//title, '-') | ||
10 | author: //div[@class='byline']/a | ||
11 | date: //a[@class='storytime'] | ||
12 | #Set date in print view | ||
13 | #date: //div[@class='byline']/a/following-sibling::a | ||
14 | body: //div[@id="bodytext"] | ||
15 | strip://div[@class='pagination'] | ||
16 | single_page_link://div[@class='pagination']/a[.='全文'] | ||
17 | #next_page_link: //div[@class='pagination']//a[.='下一页'] | ||
18 | test_url: http://www.ftchinese.com/story/001049088 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ftd.de.txt b/inc/3rdparty/site_config/standard/ftd.de.txt index a58765b0..7d76af00 100644..100755 --- a/inc/3rdparty/site_config/standard/ftd.de.txt +++ b/inc/3rdparty/site_config/standard/ftd.de.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft'] | 1 | body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft'] |
2 | single_page_link: //a[@class='icon print'] | 2 | single_page_link: //a[@class='icon print'] |
3 | 3 | ||
4 | test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html | 4 | test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html |
5 | test_url: http://www.ftd.de/it-medien/medien-internet/:verkauf-von-warner-music-musikbranche-auf-dem-sprung/60048185.html \ No newline at end of file | 5 | test_url: http://www.ftd.de/it-medien/medien-internet/:verkauf-von-warner-music-musikbranche-auf-dem-sprung/60048185.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/fubiz.net.txt b/inc/3rdparty/site_config/standard/fubiz.net.txt index 8e6356bf..0dc30475 100644..100755 --- a/inc/3rdparty/site_config/standard/fubiz.net.txt +++ b/inc/3rdparty/site_config/standard/fubiz.net.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class = 'entry'] | 1 | body: //div[@class = 'entry'] |
2 | 2 | ||
3 | test_url: http://www.fubiz.net/2011/05/31/world-press-photo-2011/ \ No newline at end of file | 3 | test_url: http://www.fubiz.net/2011/05/31/world-press-photo-2011/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/futurezone.at.txt b/inc/3rdparty/site_config/standard/futurezone.at.txt index 50fc144a..808c1f1b 100644..100755 --- a/inc/3rdparty/site_config/standard/futurezone.at.txt +++ b/inc/3rdparty/site_config/standard/futurezone.at.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | date: //span[@class='date'] | 1 | date: //span[@class='date'] |
2 | strip: //div[@class='postsidebar'] | 2 | strip: //div[@class='postsidebar'] |
3 | body: //div[@class='singlepost'] | 3 | body: //div[@class='singlepost'] |
4 | title: //div[@class='singlepost']/h1 | 4 | title: //div[@class='singlepost']/h1 |
5 | move_into(//div[@class='singlepost']): //div[@class='info'] | 5 | move_into(//div[@class='singlepost']): //div[@class='info'] |
6 | strip: //div[@class='gallery'] | 6 | strip: //div[@class='gallery'] |
7 | strip: //div[@class='biggallery'] | 7 | strip: //div[@class='biggallery'] |
8 | strip: //ul[@class='social'] | 8 | strip: //ul[@class='social'] |
9 | strip: //ul[@class='social_mail'] | 9 | strip: //ul[@class='social_mail'] |
10 | 10 | ||
11 | test_url: http://futurezone.at/future/5502-erste-galileo-satelliten-starten-ins-all.php \ No newline at end of file | 11 | test_url: http://futurezone.at/future/5502-erste-galileo-satelliten-starten-ins-all.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gamasutra.com.txt b/inc/3rdparty/site_config/standard/gamasutra.com.txt index 35a8762a..7c808cfd 100644..100755 --- a/inc/3rdparty/site_config/standard/gamasutra.com.txt +++ b/inc/3rdparty/site_config/standard/gamasutra.com.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | # default view title | 1 | # default view title |
2 | title: //span[@class='newsTitle'] | 2 | title: //span[@class='newsTitle'] |
3 | # print view title | 3 | # print view title |
4 | title: //h3[@class='title'] | 4 | title: //h3[@class='title'] |
5 | 5 | ||
6 | # default view author | 6 | # default view author |
7 | author: //span[@class='newsAuth']/a | 7 | author: //span[@class='newsAuth']/a |
8 | author: substring-after(//span[@class='newsAuth'], 'by ') | 8 | author: substring-after(//span[@class='newsAuth'], 'by ') |
9 | 9 | ||
10 | # default view date | 10 | # default view date |
11 | date: //td[@class='newsDate'] | 11 | date: //td[@class='newsDate'] |
12 | 12 | ||
13 | # default view body | 13 | # default view body |
14 | body: //td[@class='featureText'] | 14 | body: //td[@class='featureText'] |
15 | body: //td[@class='newsText'] | 15 | body: //td[@class='newsText'] |
16 | 16 | ||
17 | strip: //h3[@class='title'] | 17 | strip: //h3[@class='title'] |
18 | 18 | ||
19 | single_page_link: //a[contains(@href, '?print=1')] | 19 | single_page_link: //a[contains(@href, '?print=1')] |
20 | test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php \ No newline at end of file | 20 | test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gameblog.fr.txt b/inc/3rdparty/site_config/standard/gameblog.fr.txt index 2cc4b378..73f8342f 100644..100755 --- a/inc/3rdparty/site_config/standard/gameblog.fr.txt +++ b/inc/3rdparty/site_config/standard/gameblog.fr.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')] | 2 | body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | strip_id_or_class: noprint | 6 | strip_id_or_class: noprint |
7 | strip: //div[@id='gbNewsTextContent']/following-sibling::* | 7 | strip: //div[@id='gbNewsTextContent']/following-sibling::* |
8 | 8 | ||
9 | test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video | 9 | test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video |
10 | test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible \ No newline at end of file | 10 | test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gamechurch.com.txt b/inc/3rdparty/site_config/standard/gamechurch.com.txt new file mode 100755 index 00000000..c9eea5f8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gamechurch.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1[@class='title'] | ||
2 | |||
3 | date: substring-before(substring-after(//div[@class='comment-bubble']/.., 'Posted'), 'by') | ||
4 | |||
5 | body: //div[@class='the-content'] | ||
6 | |||
7 | strip: //div[@class='article-image responsive'] | ||
8 | |||
9 | strip_id_or_class: 'pullquote' | ||
10 | test_url: http://gamechurch.com/virtual-gun-control-the-best-amendment/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gamer.no.txt b/inc/3rdparty/site_config/standard/gamer.no.txt new file mode 100755 index 00000000..e76a59d9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gamer.no.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class='pageContent description'] | ||
2 | date: //div[@class='authorsAndDateTime']/span[@title] | ||
3 | single_page_link: //div[@class='pages']/a[last()-1] | ||
4 | |||
5 | # fix images and captions | ||
6 | wrap_in(figure): //div[contains(concat(' ', @class, ' '), ' image')] | ||
7 | wrap_in(figcaption): //div[contains(concat(' ', @class, ' '), ' image')]/div[@class='text']/text() | ||
8 | |||
9 | # get rid of videos | ||
10 | strip_id_or_class: 'video full' | ||
11 | test_url: http://www.gamer.no/artikler/142455/slik-blei-ambisiose-dragons-dogma-skapt/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gamereactor.no.txt b/inc/3rdparty/site_config/standard/gamereactor.no.txt new file mode 100755 index 00000000..6f7c1b9b --- /dev/null +++ b/inc/3rdparty/site_config/standard/gamereactor.no.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@id='content']/div/h1 | ||
2 | |||
3 | author: //a[@itemprop='reviewer'] | ||
4 | |||
5 | date: //time[@itemprop='dtreviewed']/@datetime | ||
6 | |||
7 | body: //div[@id='breadtext'] | ||
8 | |||
9 | # fix for NOT magically removing anchors with text identical to title | ||
10 | dissolve: //a[text()=//div[@id='content']/div/h1/text()] | ||
11 | test_url: http://www.gamereactor.no/previews/177481/The+Evil+Within/?sid=38b5bd30f56f1b7214de4ff5bed4b76f \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/garythink.com.txt b/inc/3rdparty/site_config/standard/garythink.com.txt index 1791e816..327ac55b 100644..100755 --- a/inc/3rdparty/site_config/standard/garythink.com.txt +++ b/inc/3rdparty/site_config/standard/garythink.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | 2 | ||
3 | test_url: http://www.garythink.com/eft/testing.html \ No newline at end of file | 3 | test_url: http://www.garythink.com/eft/testing.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gasteroprod.com.txt b/inc/3rdparty/site_config/standard/gasteroprod.com.txt index ef68082a..8eda0c36 100644..100755 --- a/inc/3rdparty/site_config/standard/gasteroprod.com.txt +++ b/inc/3rdparty/site_config/standard/gasteroprod.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | # These should work, but don't. They were given by Firefox XPather extension | 1 | # These should work, but don't. They were given by Firefox XPather extension |
2 | title: //article//header//a//h1 | 2 | title: //article//header//a//h1 |
3 | body: //article//section | 3 | body: //article//section |
4 | test_url: http://gasteroprod.com/blog/faut-il-continuer-a-supporter-internet-explorer-6.html \ No newline at end of file | 4 | test_url: http://gasteroprod.com/blog/faut-il-continuer-a-supporter-internet-explorer-6.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gatopardo.com.txt b/inc/3rdparty/site_config/standard/gatopardo.com.txt index 74346328..2ab144f5 100644..100755 --- a/inc/3rdparty/site_config/standard/gatopardo.com.txt +++ b/inc/3rdparty/site_config/standard/gatopardo.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | body: //div[@class='panel'] | 1 | body: //div[@class='panel'] |
2 | strip: //div[@style='float:right'] | 2 | strip: //div[@style='float:right'] |
3 | strip: //span[@class='titulosHomePublicidad'] | 3 | strip: //span[@class='titulosHomePublicidad'] |
4 | strip: //div[@id='TitTop5Der'] | 4 | strip: //div[@id='TitTop5Der'] |
5 | strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png'] | 5 | strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png'] |
6 | 6 | ||
7 | prune: yes | 7 | prune: yes |
8 | test_url: http://www.gatopardo.com/ReportajesGP.php?R=95 \ No newline at end of file | 8 | test_url: http://www.gatopardo.com/ReportajesGP.php?R=95 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gawker.com.txt b/inc/3rdparty/site_config/standard/gawker.com.txt index 6531d81a..9bc5613a 100644..100755 --- a/inc/3rdparty/site_config/standard/gawker.com.txt +++ b/inc/3rdparty/site_config/standard/gawker.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@class="post-body"] | 1 | body: //div[@class="post-body"] |
2 | 2 | ||
3 | # Remove 'content is restricted' | 3 | # Remove 'content is restricted' |
4 | strip: //div[@id='agegate_IDHERE'] | 4 | strip: //div[@id='agegate_IDHERE'] |
5 | 5 | ||
6 | test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy \ No newline at end of file | 6 | test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/geeksofdoom.com.txt b/inc/3rdparty/site_config/standard/geeksofdoom.com.txt index 55586e1c..89eb402f 100644..100755 --- a/inc/3rdparty/site_config/standard/geeksofdoom.com.txt +++ b/inc/3rdparty/site_config/standard/geeksofdoom.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | author: substring-after(//span[@class='storyauthor'],'Posted by') | 1 | author: substring-after(//span[@class='storyauthor'],'Posted by') |
2 | date: //span[@class='storydate'] | 2 | date: //span[@class='storydate'] |
3 | test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/ \ No newline at end of file | 3 | test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/geenstijl.nl.txt b/inc/3rdparty/site_config/standard/geenstijl.nl.txt index f6dccf48..a664b4d9 100644..100755 --- a/inc/3rdparty/site_config/standard/geenstijl.nl.txt +++ b/inc/3rdparty/site_config/standard/geenstijl.nl.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id = 'article'] | 1 | body: //div[@id = 'article'] |
2 | strip: //div[@id = 'klasbox'] | 2 | strip: //div[@id = 'klasbox'] |
3 | test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html \ No newline at end of file | 3 | test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/getnews.jp.txt b/inc/3rdparty/site_config/standard/getnews.jp.txt index 537b4c2e..e28d4b8b 100644..100755 --- a/inc/3rdparty/site_config/standard/getnews.jp.txt +++ b/inc/3rdparty/site_config/standard/getnews.jp.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='post'] | 1 | body: //div[@class='post'] |
2 | strip: //ul[@id='bookmark_single'] | 2 | strip: //ul[@id='bookmark_single'] |
3 | test_url: http://getnews.jp/archives/117312 \ No newline at end of file | 3 | test_url: http://getnews.jp/archives/117312 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/giantbomb.com.txt b/inc/3rdparty/site_config/standard/giantbomb.com.txt index 8a54bc07..61de51b2 100644..100755 --- a/inc/3rdparty/site_config/standard/giantbomb.com.txt +++ b/inc/3rdparty/site_config/standard/giantbomb.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | # 2011-11-19 - carlo@... - Initial setup. | 1 | # 2011-11-19 - carlo@... - Initial setup. |
2 | 2 | ||
3 | strip_id_or_class: user-review-detail | 3 | strip_id_or_class: user-review-detail |
4 | strip: //h1 | 4 | strip: //h1 |
5 | 5 | ||
6 | body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"] | 6 | body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"] |
7 | 7 | ||
8 | author: //span[@class="reviewer"] | //p[@class="byline"]/a/text() | 8 | author: //span[@class="reviewer"] | //p[@class="byline"]/a/text() |
9 | date: //span[@class="dtreviewed"] | 9 | date: //span[@class="dtreviewed"] |
10 | 10 | ||
11 | test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/ \ No newline at end of file | 11 | test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/giga.de.txt b/inc/3rdparty/site_config/standard/giga.de.txt index f60199ad..e2689eae 100644..100755 --- a/inc/3rdparty/site_config/standard/giga.de.txt +++ b/inc/3rdparty/site_config/standard/giga.de.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | tidy:no | 1 | tidy:no |
2 | title://h2[@class="title"] | 2 | title://h2[@class="title"] |
3 | # author:"Ben Miller" | 3 | # author:"Ben Miller" |
4 | date://div[@id="stats"]/span | 4 | date://div[@id="stats"]/span |
5 | strip_id_or_class:stats | 5 | strip_id_or_class:stats |
6 | strip_id_or_class:breadcrumbs | 6 | strip_id_or_class:breadcrumbs |
7 | strip_id_or_class:gn-why-content | 7 | strip_id_or_class:gn-why-content |
8 | strip_id_or_class:single-social | 8 | strip_id_or_class:single-social |
9 | strip_id_or_class:sidebar-ads | 9 | strip_id_or_class:sidebar-ads |
10 | strip_id_or_class:sidebar-top | 10 | strip_id_or_class:sidebar-top |
11 | strip_id_or_class:footer | 11 | strip_id_or_class:footer |
12 | strip_id_or_class:post_meta | 12 | strip_id_or_class:post_meta |
13 | # strip_id_or_class: | 13 | # strip_id_or_class: |
14 | # strip_id_or_class: | 14 | # strip_id_or_class: |
15 | # strip_id_or_class: | 15 | # strip_id_or_class: |
16 | # strip_id_or_class: | 16 | # strip_id_or_class: |
17 | # strip_id_or_class: | 17 | # strip_id_or_class: |
18 | # strip_id_or_class: | 18 | # strip_id_or_class: |
19 | 19 | ||
20 | test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033 \ No newline at end of file | 20 | test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gigaom.com.txt b/inc/3rdparty/site_config/standard/gigaom.com.txt index 348bdf23..cc8fdfa0 100644..100755 --- a/inc/3rdparty/site_config/standard/gigaom.com.txt +++ b/inc/3rdparty/site_config/standard/gigaom.com.txt | |||
@@ -1,17 +1,12 @@ | |||
1 | date: //meta[@name='DC.date.issued']/@content | 1 | date: //meta[@name='dcterms.created']/@content |
2 | date: //span[@class='post-meta the-date'] | 2 | title: //meta[@property='og:title']/@content |
3 | 3 | author: //section[@class="post-meta"]//a[@rel="author"] | |
4 | title: //meta[@property='og:title']/@content | 4 | |
5 | 5 | body: //div[starts-with(@id, 'post-content-')] | |
6 | author: //meta[@name='DC.creator']/@content | 6 | |
7 | 7 | strip_id_or_class: sharedaddy | |
8 | body: //div[contains(@class, 'post-sub-head') or starts-with(@id, 'post-content-')] | 8 | |
9 | 9 | prune: no | |
10 | find_string: id="content" | 10 | |
11 | replace_string: id="content-ignore" | 11 | test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/ |
12 | 12 | test_url: http://gigaom.com/2012/12/26/snapchat-rises-why-pokes-decline-shows-facebooks-inability-to-invent/ \ No newline at end of file | |
13 | strip_id_or_class: sharedaddy | ||
14 | |||
15 | prune: no | ||
16 | |||
17 | test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gihyo.jp.txt b/inc/3rdparty/site_config/standard/gihyo.jp.txt index 478b23a3..d3534b29 100644..100755 --- a/inc/3rdparty/site_config/standard/gihyo.jp.txt +++ b/inc/3rdparty/site_config/standard/gihyo.jp.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //p[@id='skip']//a[contains(@href, 'skip')] | 1 | single_page_link: //p[@id='skip']//a[contains(@href, 'skip')] |
2 | 2 | ||
3 | test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010 \ No newline at end of file | 3 | test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gist.github.com.txt b/inc/3rdparty/site_config/standard/gist.github.com.txt index 53095b34..90207862 100644..100755 --- a/inc/3rdparty/site_config/standard/gist.github.com.txt +++ b/inc/3rdparty/site_config/standard/gist.github.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@class="highlight"]/pre | 1 | body: //div[@class="highlight"]/pre |
2 | 2 | ||
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | test_url: https://gist.github.com/1258908 \ No newline at end of file | 6 | test_url: https://gist.github.com/1258908 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt b/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt index 144ce045..0de0750b 100644..100755 --- a/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt +++ b/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //div[@id="content"]//h2/a | 1 | single_page_link: //div[@id="content"]//h2/a |
2 | 2 | ||
3 | test_url: http://givemesomethingtoread.com/post/6285838917/the-baddest-lawyer-in-the-history-of-jersey \ No newline at end of file | 3 | test_url: http://givemesomethingtoread.com/post/6285838917/the-baddest-lawyer-in-the-history-of-jersey \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt b/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt index 285e76c0..2eb82a6d 100644..100755 --- a/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt +++ b/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id="leadimage" or @class="postcontent"] | 1 | body: //div[@id="leadimage" or @class="postcontent"] |
2 | author: //div[@class="contentauthor"] | 2 | author: //div[@class="contentauthor"] |
3 | date: //div[@class="timestamp"] | 3 | date: //div[@class="timestamp"] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/ \ No newline at end of file | 7 | test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gizmodo.com.txt b/inc/3rdparty/site_config/standard/gizmodo.com.txt index c9536255..e73ec9d2 100644..100755 --- a/inc/3rdparty/site_config/standard/gizmodo.com.txt +++ b/inc/3rdparty/site_config/standard/gizmodo.com.txt | |||
@@ -1,7 +1,11 @@ | |||
1 | body: //div[@class="post-body" or contains(@class, 'illustration top')] | 1 | #body: //div[@class="post-body" or contains(@class, 'illustration top')] |
2 | author: (//cite//span[@class="plus-icon"])[1] | 2 | body: //div[contains(@class, 'image-annotation-box') or contains(@class, 'post-content')] |
3 | date: //span[@class="date"] | 3 | #author: (//cite//span[@class="plus-icon"])[1] |
4 | 4 | author: //span[contains(@class, 'display-name')] | |
5 | prune: no | 5 | date: //span[@class="date"] |
6 | 6 | ||
7 | test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science \ No newline at end of file | 7 | prune: no |
8 | |||
9 | test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science | ||
10 | test_url: http://gizmodo.com/what-van-goghs-paintings-would-look-like-if-they-came-874035680 | ||
11 | test_url: http://gizmodo.com/vip.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gizmodo.uol.com.br.txt b/inc/3rdparty/site_config/standard/gizmodo.uol.com.br.txt new file mode 100755 index 00000000..d963d684 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gizmodo.uol.com.br.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | body: //div[@id='destaques']//div[contains(@class, 'img')] | //div[@id='maincontent']//p | ||
4 | |||
5 | test_url: http://gizmodo.uol.com.br/nvidia-gtx-titan-z/ | ||
6 | test_url: http://gizmodo.uol.com.br/perfil-mark-zuckerberg-hackeado/ | ||
diff --git a/inc/3rdparty/site_config/standard/gizmologia.com.txt b/inc/3rdparty/site_config/standard/gizmologia.com.txt index d2c7c9f9..d2c7c9f9 100644..100755 --- a/inc/3rdparty/site_config/standard/gizmologia.com.txt +++ b/inc/3rdparty/site_config/standard/gizmologia.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/gizmovil.com.txt b/inc/3rdparty/site_config/standard/gizmovil.com.txt index 5fc204b8..5fc204b8 100644..100755 --- a/inc/3rdparty/site_config/standard/gizmovil.com.txt +++ b/inc/3rdparty/site_config/standard/gizmovil.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/global.txt b/inc/3rdparty/site_config/standard/global.txt index 135ed500..71fbc934 100644..100755 --- a/inc/3rdparty/site_config/standard/global.txt +++ b/inc/3rdparty/site_config/standard/global.txt | |||
@@ -1,4 +1,18 @@ | |||
1 | # Look for Open Graph data - http://ogp.me | 1 | # Look for Open Graph data - http://ogp.me |
2 | title: //meta[@property="og:title"]/@content | 2 | title: //meta[@property="og:title"]/@content |
3 | date: //meta[@property="article:published_time"]/@content | 3 | date: //meta[@property="article:published_time"]/@content |
4 | # article:author is sometimes a URL, e.g. on guardian.co.uk \ No newline at end of file | 4 | # article:author is sometimes a URL, e.g. on guardian.co.uk |
5 | |||
6 | # Remove Google Publisher Tags: https://support.google.com/dfp_sb/answer/1649768?hl=en | ||
7 | #strip_id_or_class: div-gpt-ad | ||
8 | |||
9 | # Strip doubleclick image ads | ||
10 | strip_image_src: doubleclick.net | ||
11 | |||
12 | # If you get chunks of Javascript code appearing in the extracted output, try uncommenting the lines below. | ||
13 | # This tries to convert script tags to hidden div elements (which Full-Text RSS removes). | ||
14 | # If you notice issues with this approach, please let us know. | ||
15 | #find_string: <script | ||
16 | #replace_string: <div style="display:none" | ||
17 | #find_string: </script> | ||
18 | #replace_string: </div> \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/globalissues.org.txt b/inc/3rdparty/site_config/standard/globalissues.org.txt index 95d4becf..ee50f68f 100644..100755 --- a/inc/3rdparty/site_config/standard/globalissues.org.txt +++ b/inc/3rdparty/site_config/standard/globalissues.org.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | 2 | ||
3 | strip: //p[@class='top'] | 3 | strip: //p[@class='top'] |
4 | strip: //h2[.='Where next?'] | 4 | strip: //h2[.='Where next?'] |
5 | strip_id_or_class: where-next | 5 | strip_id_or_class: where-next |
6 | strip_id_or_class: social-bookmarks | 6 | strip_id_or_class: social-bookmarks |
7 | strip_id_or_class: link-to-here | 7 | strip_id_or_class: link-to-here |
8 | strip_id_or_class: options-heading | 8 | strip_id_or_class: options-heading |
9 | strip_id_or_class: page-options-content | 9 | strip_id_or_class: page-options-content |
10 | strip_id_or_class: page-info-bottom | 10 | strip_id_or_class: page-info-bottom |
11 | 11 | ||
12 | tidy: no | 12 | tidy: no |
13 | prune: no | 13 | prune: no |
14 | 14 | ||
15 | test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism \ No newline at end of file | 15 | test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/globoesporte.globo.com.txt b/inc/3rdparty/site_config/standard/globoesporte.globo.com.txt new file mode 100755 index 00000000..fd8e70ff --- /dev/null +++ b/inc/3rdparty/site_config/standard/globoesporte.globo.com.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | |||
3 | body: //div[@class='materia-titulo']/h2 | //*[@id="materia-letra"] | ||
4 | |||
5 | date: //abbr[@class="published"] | ||
6 | date: //abbr[@class="updated"] | ||
7 | |||
8 | author: //*[@class="author"]/strong | ||
9 | |||
10 | strip: //div[contains(@class,'foto')]/strong | ||
11 | strip: //div[contains(@class,'frase-materia')]/div[@class='autor'] | ||
12 | strip: //div[contains(@class,'saibamais')] | ||
13 | strip: //*[contains(text(),'Clique aqui e veja mais')]/ancestor::p | ||
14 | strip: //ul[@class="toolbar"] | ||
15 | |||
16 | # quotes | ||
17 | wrap_in(blockquote): //div[@id='materia-letra']//div[contains(@class,'frase-materia')]/div[@class='frase'] | ||
18 | |||
19 | prune: no | ||
20 | |||
21 | replace_string([Clique aqui e veja mais vÃdeos do Fluminense]): [] | ||
22 | |||
23 | test_url: http://globoesporte.globo.com/atletismo/noticia/2013/08/michael-johnson-diz-que-bolt-e-melhor-da-historia-nao-ha-duvidas.html | ||
24 | test_url: http://globoesporte.globo.com/futebol/futebol-internacional/futebol-espanhol/noticia/2013/08/barca-atropela-levante-e-neymar-passa-em-branco-em-estreia-oficial.html | ||
25 | test_url: http://globoesporte.globo.com/futebol/times/fluminense/noticia/2013/08/poupado-no-sabado-felipe-se-diz-pronto-para-ser-titular-contra-o-goias.html | ||
diff --git a/inc/3rdparty/site_config/standard/gloswielkopolski.pl.txt b/inc/3rdparty/site_config/standard/gloswielkopolski.pl.txt new file mode 100755 index 00000000..16487955 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gloswielkopolski.pl.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //article[@id='material']/header/h1 | ||
2 | author: //article[@id='material']/header/div[2]/p | ||
3 | date: //article[@id='material']/header/p/time[1] | ||
4 | body: //section[@id='tresc'] | ||
5 | next_page_link: .//section[@id='tresc']/div[@class='stronicowanie']/a[@rel='next'] | ||
6 | strip://div[@class='podobneSonda'] | ||
7 | |||
8 | test_url: http://www.gloswielkopolski.pl/artykul/803547,abc-telemarketingu-praca-ktora-zwalnia-z-myslenia,id,t.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/goal.com.txt b/inc/3rdparty/site_config/standard/goal.com.txt index 075c4d2b..e25e9a00 100644..100755 --- a/inc/3rdparty/site_config/standard/goal.com.txt +++ b/inc/3rdparty/site_config/standard/goal.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title: //div[@id='article_headline']//h1 | 1 | title: //div[@id='article_headline']//h1 |
2 | date: //div[contains(@class, 'articleDate')]//h4 | 2 | date: //div[contains(@class, 'articleDate')]//h4 |
3 | body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content'] | 3 | body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content'] |
4 | 4 | ||
5 | strip_id_or_class: relatedLinksBox | 5 | strip_id_or_class: relatedLinksBox |
6 | strip_id_or_class: betting-widget | 6 | strip_id_or_class: betting-widget |
7 | strip_image_src: install_flash.gif | 7 | strip_image_src: install_flash.gif |
8 | 8 | ||
9 | strip: //table[contains(@style, 'float: right; width: 285px;')] | 9 | strip: //table[contains(@style, 'float: right; width: 285px;')] |
10 | strip: //div[@class='caption'] | 10 | strip: //div[@class='caption'] |
11 | 11 | ||
12 | tidy: no | 12 | tidy: no |
13 | prune: no | 13 | prune: no |
14 | 14 | ||
15 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and- | 15 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and- |
16 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to# \ No newline at end of file | 16 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to# \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/golem.de.txt b/inc/3rdparty/site_config/standard/golem.de.txt index 6c5d1c4f..6afdebe8 100644..100755 --- a/inc/3rdparty/site_config/standard/golem.de.txt +++ b/inc/3rdparty/site_config/standard/golem.de.txt | |||
@@ -1,25 +1,25 @@ | |||
1 | # Jens Kohl, jens.kohl@... | 1 | # Jens Kohl, jens.kohl@... |
2 | # - Added publication date | 2 | # - Added publication date |
3 | # - Stripped pagination block | 3 | # - Stripped pagination block |
4 | # - Added single page link | 4 | # - Added single page link |
5 | # - Added XPath queries for the printer friendly version | 5 | # - Added XPath queries for the printer friendly version |
6 | 6 | ||
7 | title: //h1 | 7 | title: //h1 |
8 | body: //div[@class='formatted'] | 8 | body: //div[@class='formatted'] |
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | date: substring-after(//li[2][@class="text1"], 'Datum:') | 11 | date: substring-after(//li[2][@class="text1"], 'Datum:') |
12 | strip: //ol[@class="list-chapters"] | 12 | strip: //ol[@class="list-chapters"] |
13 | strip_comments: yes | 13 | strip_comments: yes |
14 | 14 | ||
15 | # next: commands for printer friendly pages | 15 | # next: commands for printer friendly pages |
16 | single_page_link: //a[contains(@href, 'print.php?a=')]/@href | 16 | single_page_link: //a[contains(@href, 'print.php?a=')]/@href |
17 | title: //body/h3 | 17 | title: //body/h3 |
18 | strip_image_src: staticrl/images/logo.jpg | 18 | strip_image_src: staticrl/images/logo.jpg |
19 | strip_image_src: http://cpx.golem.de/cpx.php?class=7 | 19 | strip_image_src: http://cpx.golem.de/cpx.php?class=7 |
20 | strip: //body/h3 | 20 | strip: //body/h3 |
21 | strip: //body/b[1] | 21 | strip: //body/b[1] |
22 | strip: //body/b[2] | 22 | strip: //body/b[2] |
23 | strip: //body/b[3] | 23 | strip: //body/b[3] |
24 | strip: //div[1] | 24 | strip: //div[1] |
25 | test_url: http://www.golem.de/1112/88696.html \ No newline at end of file | 25 | test_url: http://www.golem.de/1112/88696.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/good.is.txt b/inc/3rdparty/site_config/standard/good.is.txt index 5cf67011..94159fbf 100644..100755 --- a/inc/3rdparty/site_config/standard/good.is.txt +++ b/inc/3rdparty/site_config/standard/good.is.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //div[@class="title"]/div/h1 | 1 | title: //div[@class="title"]/div/h1 |
2 | body: //div[@class="body"] | 2 | body: //div[@class="body"] |
3 | date: //li[@class="date-time"] | 3 | date: //li[@class="date-time"] |
4 | test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/ \ No newline at end of file | 4 | test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/goodfil.ms.txt b/inc/3rdparty/site_config/standard/goodfil.ms.txt new file mode 100755 index 00000000..f8bbbc6a --- /dev/null +++ b/inc/3rdparty/site_config/standard/goodfil.ms.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_id_or_class: gutter | ||
2 | test_url: http://goodfil.ms/blog/posts/2012/08/13/angularjs-and-the-goodfilms-mobile-site-part-1/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gossip-tv.gr.txt b/inc/3rdparty/site_config/standard/gossip-tv.gr.txt index c2fe4e40..e2d2d0b2 100644..100755 --- a/inc/3rdparty/site_config/standard/gossip-tv.gr.txt +++ b/inc/3rdparty/site_config/standard/gossip-tv.gr.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | date: //meta[@name='og:article:published_time']/@value | 1 | date: //meta[@name='og:article:published_time']/@value |
2 | 2 | ||
3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] | 3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] |
4 | 4 | ||
5 | strip_id_or_class: itemImageGallery | 5 | strip_id_or_class: itemImageGallery |
6 | 6 | ||
7 | # remove extras at end of post content | 7 | # remove extras at end of post content |
8 | find_string: <div style="margin:5px 0 10px;"> | 8 | find_string: <div style="margin:5px 0 10px;"> |
9 | replace_string: </div></body></html><!-- | 9 | replace_string: </div></body></html><!-- |
10 | 10 | ||
11 | prune: no | 11 | prune: no |
12 | 12 | ||
13 | test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous | 13 | test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous |
14 | test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas \ No newline at end of file | 14 | test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/goteborgsfria.se.txt b/inc/3rdparty/site_config/standard/goteborgsfria.se.txt new file mode 100755 index 00000000..c90aed0b --- /dev/null +++ b/inc/3rdparty/site_config/standard/goteborgsfria.se.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.goteborgsfria.se/artikel/112079 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gothamist.com.txt b/inc/3rdparty/site_config/standard/gothamist.com.txt index 5179fc12..36453878 100644..100755 --- a/inc/3rdparty/site_config/standard/gothamist.com.txt +++ b/inc/3rdparty/site_config/standard/gothamist.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='entry-header'] | 1 | title: //div[@class='entry-header'] |
2 | author: //span[@class='vcard author'] | 2 | author: //span[@class='vcard author'] |
3 | date: //abbr[@class='published'] | 3 | date: //abbr[@class='published'] |
4 | #move_into(//div[@class='entry-body']): //img[@id='photo_1'] | 4 | #move_into(//div[@class='entry-body']): //img[@id='photo_1'] |
5 | body: //div[@class='entry-body'] | 5 | body: //div[@class='entry-body'] |
6 | strip: //div[@class='galleryEaseThumbs'] | 6 | strip: //div[@class='galleryEaseThumbs'] |
7 | test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php \ No newline at end of file | 7 | test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gotomanager.com.txt b/inc/3rdparty/site_config/standard/gotomanager.com.txt index 7fb0ee03..f8af7324 100644..100755 --- a/inc/3rdparty/site_config/standard/gotomanager.com.txt +++ b/inc/3rdparty/site_config/standard/gotomanager.com.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | title: //span[@id="showTitle"] | 1 | title: //span[@id="showTitle"] |
2 | author: //span[@id="showAuthor"] | 2 | author: //span[@id="showAuthor"] |
3 | date: //span[@id="showRefDate"] | 3 | date: //span[@id="showRefDate"] |
4 | 4 | ||
5 | strip: //span[@class="black_bold"] | 5 | strip: //span[@class="black_bold"] |
6 | strip: //div[@id="sectionName"] | 6 | strip: //div[@id="sectionName"] |
7 | strip: //div[@id="storyHeader"] | 7 | strip: //div[@id="storyHeader"] |
8 | 8 | ||
9 | body: //div[@id="newsBodyText"] | 9 | body: //div[@id="newsBodyText"] |
10 | 10 | ||
11 | strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif" | 11 | strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif" |
12 | strip_image_src: "http://www.gotomanager.com/images/separator.gif" | 12 | strip_image_src: "http://www.gotomanager.com/images/separator.gif" |
13 | strip_image_src: "http://www.gotomanager.com/images/spaces.gif" | 13 | strip_image_src: "http://www.gotomanager.com/images/spaces.gif" |
14 | 14 | ||
15 | convert_double_br_tags: yes | 15 | convert_double_br_tags: yes |
16 | tidy: yes | 16 | tidy: yes |
17 | 17 | ||
18 | strip: //div[@id="smallLeadImage"] | 18 | strip: //div[@id="smallLeadImage"] |
19 | strip: //div[@id="truehitsSurvey"] | 19 | strip: //div[@id="truehitsSurvey"] |
20 | strip: //table[@id="relatedInfoTable"] | 20 | strip: //table[@id="relatedInfoTable"] |
21 | test_url: http://www.gotomanager.com/news/details.aspx?id=86759 \ No newline at end of file | 21 | test_url: http://www.gotomanager.com/news/details.aspx?id=86759 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gov.ky.txt b/inc/3rdparty/site_config/standard/gov.ky.txt new file mode 100755 index 00000000..294ece3a --- /dev/null +++ b/inc/3rdparty/site_config/standard/gov.ky.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | strip: //body//title | ||
2 | |||
3 | test_url: http://www.gov.ky/pls/portal/PORTAL.wwv_media.show?p_id=7593947&p_settingssetid=1&p_settingssiteid=0&p_siteid=2425&p_type=basetext&p_textid=7593948 | ||
4 | test_url: http://www.rcips.ky/pls/portal/wlacomp.wlafeed.show_cignewsfeed_agency?p_sitecode=POL&p_agency=Police \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gp.se.txt b/inc/3rdparty/site_config/standard/gp.se.txt new file mode 100755 index 00000000..158ae4ed --- /dev/null +++ b/inc/3rdparty/site_config/standard/gp.se.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@id='articleContainer'] | ||
2 | author: //div[@id='articleContent']//div[contains(@class, 'byline')]//span[contains(@class, 'name fn')] | ||
3 | strip_id_or_class: toolbar | ||
4 | strip_id_or_class: ADad | ||
5 | strip_id_or_class: articleSerieWrapper | ||
6 | strip_id_or_class: articleFloatContainer | ||
7 | strip: //div[contains(@class, 'byline')]//img | ||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.gp.se/nyheter/bohuslan/1.2045564-styckade-mannen-hade-mordat-hustrun | ||
11 | test_url: http://www.gp.se/1.16560 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gq.com.txt b/inc/3rdparty/site_config/standard/gq.com.txt index 233c4a7f..8ad8a14e 100644..100755 --- a/inc/3rdparty/site_config/standard/gq.com.txt +++ b/inc/3rdparty/site_config/standard/gq.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a | 1 | next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a |
2 | strip_id_or_class: utility | 2 | strip_id_or_class: utility |
3 | strip_id_or_class: keywords | 3 | strip_id_or_class: keywords |
4 | strip_id_or_class: pagination | 4 | strip_id_or_class: pagination |
5 | strip_id_or_class: position2_content | 5 | strip_id_or_class: position2_content |
6 | body: //div[@class='article'] | 6 | body: //div[@class='article'] |
7 | title: //h1[@class='content-headline'] | 7 | title: //h1[@class='content-headline'] |
8 | author: //span[@class='contributor']//a | 8 | author: //span[@class='contributor']//a |
9 | test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012 \ No newline at end of file | 9 | test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/grantland.com.txt b/inc/3rdparty/site_config/standard/grantland.com.txt index 3269e086..b8d419f4 100644..100755 --- a/inc/3rdparty/site_config/standard/grantland.com.txt +++ b/inc/3rdparty/site_config/standard/grantland.com.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | # this is fragile with footnotes -- leave it for now | 1 | # this is fragile with footnotes -- leave it for now |
2 | 2 | ||
3 | #tidy: no | 3 | #tidy: no |
4 | #prune: no | 4 | #prune: no |
5 | #move_into(//article): //aside[@id='footnotes'] | 5 | #move_into(//article): //aside[@id='footnotes'] |
6 | author: //cite/a | 6 | author: //cite/a |
7 | date: //time | 7 | date: //time |
8 | 8 | ||
9 | strip: //a[text()='Grantland'] | 9 | strip: //a[text()='Grantland'] |
10 | strip_id_or_class: ad-wrapper | 10 | strip_id_or_class: ad-wrapper |
11 | strip_id_or_class: fb-connect-link | 11 | strip_id_or_class: fb-connect-link |
12 | strip_id_or_class: fb-status | 12 | strip_id_or_class: fb-status |
13 | strip: //li[@class='print'] | 13 | strip: //li[@class='print'] |
14 | strip: //cite | 14 | strip: //cite |
15 | strip: //a[contains(text(), '[+]')] | 15 | strip: //a[contains(text(), '[+]')] |
16 | strip: //a[@id='jump-nav-link'] | 16 | strip: //a[@id='jump-nav-link'] |
17 | strip: //h1[text()='Share This'] | 17 | strip: //h1[text()='Share This'] |
18 | strip: //h1[text()='Top Stories'] | 18 | strip: //h1[text()='Top Stories'] |
19 | strip: //div[@id="update-text-size"] | 19 | strip: //div[@id="update-text-size"] |
20 | test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day \ No newline at end of file | 20 | test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt b/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt index a5258030..31a41075 100644..100755 --- a/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt +++ b/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //div[@class="blogpost"]/h2 | 1 | title: //div[@class="blogpost"]/h2 |
2 | author: //div[@class="blogpost"]/p[@class="byline"]/a | 2 | author: //div[@class="blogpost"]/p[@class="byline"]/a |
3 | date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"] | 3 | date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"] |
4 | body: //div[@class="blogpost"] | 4 | body: //div[@class="blogpost"] |
5 | strip_id_or_class: flag | 5 | strip_id_or_class: flag |
6 | strip_id_or_class: byline | 6 | strip_id_or_class: byline |
7 | strip_id_or_class: post_footer | 7 | strip_id_or_class: post_footer |
8 | strip_id_or_class: related_posts | 8 | strip_id_or_class: related_posts |
9 | strip_id_or_class: post_author_bios | 9 | strip_id_or_class: post_author_bios |
10 | strip: //h2 | 10 | strip: //h2 |
11 | test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/ \ No newline at end of file | 11 | test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/groups.drupal.org.txt b/inc/3rdparty/site_config/standard/groups.drupal.org.txt index 7e15a5c1..0fe30ef5 100644..100755 --- a/inc/3rdparty/site_config/standard/groups.drupal.org.txt +++ b/inc/3rdparty/site_config/standard/groups.drupal.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | author://span[@class="submitted"]/a | 2 | author://span[@class="submitted"]/a |
3 | date:substring-after(//span[@class="submitted"],'on ') | 3 | date:substring-after(//span[@class="submitted"],'on ') |
4 | body://div[@class="content"] | 4 | body://div[@class="content"] |
5 | test_url: http://groups.drupal.org/node/36816 \ No newline at end of file | 5 | test_url: http://groups.drupal.org/node/36816 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/gulfnews.com.txt b/inc/3rdparty/site_config/standard/gulfnews.com.txt index e69044b3..97b620de 100644..100755 --- a/inc/3rdparty/site_config/standard/gulfnews.com.txt +++ b/inc/3rdparty/site_config/standard/gulfnews.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article'] | 1 | body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article'] |
2 | strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1] | 2 | strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1] |
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084 \ No newline at end of file | 5 | test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/guokr.com.txt b/inc/3rdparty/site_config/standard/guokr.com.txt index 00255eb8..f8327bea 100644..100755 --- a/inc/3rdparty/site_config/standard/guokr.com.txt +++ b/inc/3rdparty/site_config/standard/guokr.com.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | # To administrator: | 1 | # To administrator: |
2 | # Please change the hostname to "www.guokr.com/article/*" | 2 | # Please change the hostname to "www.guokr.com/article/*" |
3 | # Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com | 3 | # Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com |
4 | 4 | ||
5 | # This filter is tested on: | 5 | # This filter is tested on: |
6 | # http://www.guokr.com/article/274325/ | 6 | # http://www.guokr.com/article/274325/ |
7 | # http://www.guokr.com/article/275013/ | 7 | # http://www.guokr.com/article/275013/ |
8 | 8 | ||
9 | title://h1 | 9 | title://h1 |
10 | author://div[contains(@class, 'content-th-info')]/a | 10 | author://div[contains(@class, 'content-th-info')]/a |
11 | date://div[contains(@class, 'content-th-info')]/span | 11 | date://div[contains(@class, 'content-th-info')]/span |
12 | body://div[contains(@class, 'Content')] | 12 | body://div[contains(@class, 'Content')] |
13 | 13 | ||
14 | strip://div[contains(@class, 'bottom-i')] | 14 | strip://div[contains(@class, 'bottom-i')] |
15 | strip://div[contains(@class, 'copyright')] | 15 | strip://div[contains(@class, 'copyright')] |
16 | strip://div[contains(@class, 'fr')] | 16 | strip://div[contains(@class, 'fr')] |
17 | strip://div[contains(@class, 'content-th-info')] | 17 | strip://div[contains(@class, 'content-th-info')] |
18 | strip://h1[contains(@id, 'articleTitle')] | 18 | strip://h1[contains(@id, 'articleTitle')] |
19 | strip://div[contains(@class, 'side')] | 19 | strip://div[contains(@class, 'side')] |
20 | strip://div[contains(@class, 'top-wp')] | 20 | strip://div[contains(@class, 'top-wp')] |
21 | test_url: http://www.guokr.com/article/275013/ | 21 | test_url: http://www.guokr.com/article/275013/ |
22 | test_url: http://www.guokr.com/article/338387/ \ No newline at end of file | 22 | test_url: http://www.guokr.com/article/338387/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/haberler.com.txt b/inc/3rdparty/site_config/standard/haberler.com.txt index bc1ce689..1bb2bc7d 100644..100755 --- a/inc/3rdparty/site_config/standard/haberler.com.txt +++ b/inc/3rdparty/site_config/standard/haberler.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@id="habermetni"]/h1[@id="haber_baslik"] | 1 | title: //div[@id="habermetni"]/h1[@id="haber_baslik"] |
2 | body: //div[@id="habermetni"]/p | 2 | body: //div[@id="habermetni"]/p |
3 | strip: //img[@class='newsDetailLeft'] | 3 | strip: //img[@class='newsDetailLeft'] |
4 | strip_image_src: /haber-resimleri/ | 4 | strip_image_src: /haber-resimleri/ |
5 | test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/ \ No newline at end of file | 5 | test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hackmake.org.txt b/inc/3rdparty/site_config/standard/hackmake.org.txt new file mode 100755 index 00000000..98140117 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hackmake.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | date: //article//time[@pubdate] | ||
2 | body: //article/div[@id="post-wide"] | ||
3 | title: //article/header/h2 | ||
4 | strip: /div[@id="comment"] | ||
5 | strip: //footer | ||
6 | author: substring-after(//footer/p[@class='byline'] , 'By') | ||
7 | test_url: http://hackmake.org/2012/12/21/mindfulness-of-concentration \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/halo.bungie.org.txt b/inc/3rdparty/site_config/standard/halo.bungie.org.txt index 7989d09f..1802efea 100644..100755 --- a/inc/3rdparty/site_config/standard/halo.bungie.org.txt +++ b/inc/3rdparty/site_config/standard/halo.bungie.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title:substring-before(id("maincontent")/table, 'Posted') | 1 | title:substring-before(id("maincontent")/table, 'Posted') |
2 | body:id("maincontent")/p | 2 | body:id("maincontent")/p |
3 | # eventually convert linebreaks better | 3 | # eventually convert linebreaks better |
4 | 4 | ||
5 | test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html \ No newline at end of file | 5 | test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt b/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt index 747f90a1..33f7e726 100644..100755 --- a/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt +++ b/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | # Remove right column | 1 | # Remove right column |
2 | strip: //*[(@class = 'right_col')] | 2 | strip: //*[(@class = 'right_col')] |
3 | 3 | ||
4 | # Remove comments etc. | 4 | # Remove comments etc. |
5 | strip: //*[(@class = 'category')] | 5 | strip: //*[(@class = 'category')] |
6 | strip: /html/body/div[1][@class='absolute_content_high']/div[1][@class='wrapper']/div[1][@class='main_col']/div[@class='main_content']/h3 | 6 | strip: /html/body/div[1][@class='absolute_content_high']/div[1][@class='wrapper']/div[1][@class='main_col']/div[@class='main_content']/h3 |
7 | test_url: http://hammers.theoffside.com/carling-cup/a-funny-thing-happened-on-the-way-to-4-nil.html \ No newline at end of file | 7 | test_url: http://hammers.theoffside.com/carling-cup/a-funny-thing-happened-on-the-way-to-4-nil.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/handelsblatt.com.txt b/inc/3rdparty/site_config/standard/handelsblatt.com.txt new file mode 100755 index 00000000..7d067aa6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/handelsblatt.com.txt | |||
@@ -0,0 +1,31 @@ | |||
1 | #Single Page | ||
2 | single_page_link: //li[contains(@class,"hcf-print")]/a | ||
3 | |||
4 | # Title hcf-headline | ||
5 | title: //span[@class='hcf-headline'] | ||
6 | |||
7 | # Authors | ||
8 | author: //div[@class="hcf-author"]/a/text() | ||
9 | author: substring-after(//div[@class='hcf-author'], 'von ') | ||
10 | |||
11 | # Date | ||
12 | date: //div[@class='hcf-article-date'] | ||
13 | |||
14 | # Body | ||
15 | body: //div[@class='article'] | ||
16 | |||
17 | # General removals | ||
18 | strip: //div[contains(@class,"hcf-smartbox")] | ||
19 | strip: //div[contains(@class,"hcf-stopper")] | ||
20 | strip: //div[contains(@class,"hcf-img-controls")] | ||
21 | strip: //span[@class='hcf-location-mark'] | ||
22 | strip: //span[@class='hcf-copyright'] | ||
23 | strip: //div[@class='hcf-copyright'] | ||
24 | strip: //div[@class='hcf-origin'] | ||
25 | |||
26 | |||
27 | |||
28 | |||
29 | # Fix picture captions | ||
30 | wrap_in(small): //div[@class="hcf-caption"] | ||
31 | test_url: http://www.handelsblatt.com/meinung/gastbeitraege/gastkommentar-zum-emissionshandel-kurskorrekturen-fuehren-zum-kentern/8044326.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hanselman.com.txt b/inc/3rdparty/site_config/standard/hanselman.com.txt index d3ffeab1..1dca632f 100644..100755 --- a/inc/3rdparty/site_config/standard/hanselman.com.txt +++ b/inc/3rdparty/site_config/standard/hanselman.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | date: //span[@class="item-date"] | 1 | date: //span[@class="item-date"] |
2 | body: //div[@class="item-content"] | 2 | body: //div[@class="item-content"] |
3 | strip_comments: no | 3 | strip_comments: no |
4 | test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx \ No newline at end of file | 4 | test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hardware.fr.txt b/inc/3rdparty/site_config/standard/hardware.fr.txt index 318885c8..e4f1f6bc 100644..100755 --- a/inc/3rdparty/site_config/standard/hardware.fr.txt +++ b/inc/3rdparty/site_config/standard/hardware.fr.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //a[@class='a_aut'] | 2 | author: //a[@class='a_aut'] |
3 | body: //div[@class='content_dossier'] | 3 | body: //div[@class='content_dossier'] |
4 | strip: //div[@id='pagination'] | 4 | strip: //div[@id='pagination'] |
5 | next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href | 5 | next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href |
6 | test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html \ No newline at end of file | 6 | test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hardware.no.txt b/inc/3rdparty/site_config/standard/hardware.no.txt new file mode 100755 index 00000000..cbbcf84e --- /dev/null +++ b/inc/3rdparty/site_config/standard/hardware.no.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //h1[@class='headline'] | ||
2 | title: //h2[@itemprop='alternativeHeadline'] | ||
3 | title: //h1[@itemprop='headline'] | ||
4 | author: //span[@itemprop='name'] | ||
5 | date: //time[@itemprop='datePublished'] | ||
6 | body: //div[@itemprop='reviewBody'] | ||
7 | |||
8 | wrap_in(blockquote): //div[@class='factBox'] | ||
9 | |||
10 | next_page_link: //a[@rel='next'] | ||
11 | |||
12 | strip_id_or_class: 'product-box' | ||
13 | strip: //a[@rel='next'] | ||
14 | strip: //a[text()='Del på Facebook'] | ||
15 | strip: //a[text()='Del på Twitter'] | ||
16 | test_url: http://www.hardware.no/artikler/asus-vg248qe/132792 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hbr.org.txt b/inc/3rdparty/site_config/standard/hbr.org.txt index fd6145e7..c2f292e1 100644..100755 --- a/inc/3rdparty/site_config/standard/hbr.org.txt +++ b/inc/3rdparty/site_config/standard/hbr.org.txt | |||
@@ -1,6 +1,7 @@ | |||
1 | title: //div[@id='article-title'] | 1 | title: //div[@id='article-title'] |
2 | author: //div[@id='articleAuthors'] | 2 | author: //div[@id='articleAuthors'] |
3 | body: //div[@id='article'] | 3 | body: //div[@id='article'] |
4 | strip: //div[@class='module wide'] | 4 | strip: //div[@class='module wide'] |
5 | next_page_link: //a[@title='Next Page'] | 5 | #single_page_link: //a[@class='social-print'] |
6 | test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/ \ No newline at end of file | 6 | test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/ |
7 | test_url: http://hbr.org/2013/03/big-bang-disruption/ar/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/headrush.typepad.com.txt b/inc/3rdparty/site_config/standard/headrush.typepad.com.txt new file mode 100755 index 00000000..a3146771 --- /dev/null +++ b/inc/3rdparty/site_config/standard/headrush.typepad.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title://div[@class='content']/h3[1] | ||
2 | body://div[@class='content'] | ||
3 | |||
4 | # Article nav | ||
5 | strip://div[@class='content']/p[1] | ||
6 | |||
7 | # Comments and trackbacks | ||
8 | strip://h2/following-sibling::p | ||
9 | strip://h2 | ||
10 | |||
11 | # Posted on | ||
12 | strip://b/p | ||
13 | strip://div[@class='content']/p[@class='posted'] | ||
14 | test_url: http://headrush.typepad.com/creating_passionate_users/2005/05/the_case_for_ea.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/heise-online.mobi.txt b/inc/3rdparty/site_config/standard/heise-online.mobi.txt index 1da82ac7..daff6143 100644..100755 --- a/inc/3rdparty/site_config/standard/heise-online.mobi.txt +++ b/inc/3rdparty/site_config/standard/heise-online.mobi.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='content']/div | 1 | body: //div[@id='content']/div |
2 | date: //p[@class='author_date']/span[@class='date'] | 2 | date: //p[@class='author_date']/span[@class='date'] |
3 | test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html \ No newline at end of file | 3 | test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/heise.de.txt b/inc/3rdparty/site_config/standard/heise.de.txt index 5f19d3f8..c51af561 100644..100755 --- a/inc/3rdparty/site_config/standard/heise.de.txt +++ b/inc/3rdparty/site_config/standard/heise.de.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | single_page_link: //p[@class='news_option']/a | 1 | single_page_link: //p[@class='news_option']/a |
2 | 2 | ||
3 | date: //p[@class='news_datum'] | 3 | date: //p[@class='news_datum'] |
4 | title: //h1 | 4 | title: //h1 |
5 | body: //div[@class='meldung_wrapper'] | 5 | body: //div[@class='meldung_wrapper'] |
6 | 6 | ||
7 | test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html \ No newline at end of file | 7 | test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hemmings.com.txt b/inc/3rdparty/site_config/standard/hemmings.com.txt new file mode 100755 index 00000000..a02b4a62 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hemmings.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h2 | ||
2 | body: //div[@id='leftdetail'] | ||
3 | single_page_link: //a[contains(@href, 'printable=1')] | ||
4 | strip: //a[contains(., 'Full Version')] | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.hemmings.com/classifieds/dealer/ferrari/330gtc/1601235.html | ||
9 | test_url: http://www.hemmings.com/rss/keyword.xml?adtype=carsforsale&make=ferrari \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/heroturko.me.txt b/inc/3rdparty/site_config/standard/heroturko.me.txt new file mode 100755 index 00000000..07b6adf1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/heroturko.me.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[contains(@class, 'title')]//h1 | ||
2 | body: //div[contains(@class, 'story')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.heroturko.me/5223034-ds-catia-p3-v5-6r2014-gasp0-x86x64-multilanguage-english-docs.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hespress.com.txt b/inc/3rdparty/site_config/standard/hespress.com.txt index d866f629..4ed0b8b5 100644..100755 --- a/inc/3rdparty/site_config/standard/hespress.com.txt +++ b/inc/3rdparty/site_config/standard/hespress.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body'] | 1 | body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body'] |
2 | 2 | ||
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | test_url: http://hespress.com/videos/73684.html | 6 | test_url: http://hespress.com/videos/73684.html |
7 | test_url: http://hespress.com/permalink/73678.html \ No newline at end of file | 7 | test_url: http://hespress.com/permalink/73678.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hiamag.com.txt b/inc/3rdparty/site_config/standard/hiamag.com.txt new file mode 100755 index 00000000..3c7ba5ac --- /dev/null +++ b/inc/3rdparty/site_config/standard/hiamag.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: (//div[contains(@class, 'gallery-slides')]//img)[1] | //div[contains(@class, 'node_body_inner')] | ||
2 | |||
3 | test_url: http://www.hiamag.com/rss.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/highscalability.com.txt b/inc/3rdparty/site_config/standard/highscalability.com.txt index fd50b6ad..5a808fa4 100644..100755 --- a/inc/3rdparty/site_config/standard/highscalability.com.txt +++ b/inc/3rdparty/site_config/standard/highscalability.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='journal-entry-text'] | 1 | body: //div[@class='journal-entry-text'] |
2 | 2 | ||
3 | test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html \ No newline at end of file | 3 | test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hiperpop.com.txt b/inc/3rdparty/site_config/standard/hiperpop.com.txt index b5eb062e..b5eb062e 100644..100755 --- a/inc/3rdparty/site_config/standard/hiperpop.com.txt +++ b/inc/3rdparty/site_config/standard/hiperpop.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt b/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt index c57c1aa9..d869a866 100644..100755 --- a/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt +++ b/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class = 'pd'] | 1 | body: //div[@class = 'pd'] |
2 | strip: //div[@id = 'overzicht-albumrecensies'] | 2 | strip: //div[@id = 'overzicht-albumrecensies'] |
3 | strip: //div[@id = 'jc'] | 3 | strip: //div[@id = 'jc'] |
4 | test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142 \ No newline at end of file | 4 | test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/historytoday.com.txt b/inc/3rdparty/site_config/standard/historytoday.com.txt index dc687f3f..78fb60a6 100644..100755 --- a/inc/3rdparty/site_config/standard/historytoday.com.txt +++ b/inc/3rdparty/site_config/standard/historytoday.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body://div[@id = 'content'] | 1 | body://div[@id = 'content'] |
2 | author://span[@class = 'authors'] | 2 | author://span[@class = 'authors'] |
3 | author://span[@class = 'ht-vtag'][1] | 3 | author://span[@class = 'ht-vtag'][1] |
4 | date:substring-before(//meta[@name = 'dc.date']/@content,'T') | 4 | date:substring-before(//meta[@name = 'dc.date']/@content,'T') |
5 | strip://div[contains(@class, 'region-ubercontent')] | 5 | strip://div[contains(@class, 'region-ubercontent')] |
6 | strip://h1 | 6 | strip://h1 |
7 | strip://div[@id = 'ht-author'] | 7 | strip://div[@id = 'ht-author'] |
8 | strip://ul[@class = 'links inline'] | 8 | strip://ul[@class = 'links inline'] |
9 | strip://div[@id = 'ht-tools'] | 9 | strip://div[@id = 'ht-tools'] |
10 | test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur \ No newline at end of file | 10 | test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hmercer.com.txt b/inc/3rdparty/site_config/standard/hmercer.com.txt index eeee1594..2da13a8e 100644..100755 --- a/inc/3rdparty/site_config/standard/hmercer.com.txt +++ b/inc/3rdparty/site_config/standard/hmercer.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //*[@class='ptitle'] | 1 | title: //*[@class='ptitle'] |
2 | date: //span[@class='date'] | 2 | date: //span[@class='date'] |
3 | body: //div[@class='body'] | 3 | body: //div[@class='body'] |
4 | prune: no | 4 | prune: no |
5 | test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/ \ No newline at end of file | 5 | test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hollywoodlife.com.txt b/inc/3rdparty/site_config/standard/hollywoodlife.com.txt new file mode 100755 index 00000000..975ffa26 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hollywoodlife.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | date: //meta[@name='sailthru.date']/@content | ||
2 | body: //article[contains(@class, 'entry-content')] | ||
3 | |||
4 | strip_image_src: subscribe.png | ||
5 | |||
6 | strip_id_or_class: wpcom-iframe-form | ||
7 | strip_id_or_class: gallery-thumbs | ||
8 | strip_id_or_class: twitter | ||
9 | strip_id_or_class: fb-link | ||
10 | strip_id_or_class: pinterest | ||
11 | |||
12 | strip: //div[@class='data'] | ||
13 | strip: //iframe[contains(@name, 'wpcom')] | ||
14 | |||
15 | find_string: <a href="http://www.youtube.com/subscription_center?add_user_id=2rJLq19N0dGrxfib80M | ||
16 | replace_string: </p></div></body></html><!-- | ||
17 | |||
18 | find_string: <h3>More | ||
19 | replace_string: </div></body></html><!-- | ||
20 | |||
21 | test_url: http://hollywoodlife.com/2013/10/04/miriam-carey-dead-capitol-hill-car-chase-shooting-postpartum-depression/ | ||
22 | test_url: http://hollywoodlife.com/feed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hometheaterreview.com.txt b/inc/3rdparty/site_config/standard/hometheaterreview.com.txt index d43e6448..8ed26ff5 100644..100755 --- a/inc/3rdparty/site_config/standard/hometheaterreview.com.txt +++ b/inc/3rdparty/site_config/standard/hometheaterreview.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id='entry-body'] | 1 | body: //div[@id='entry-body'] |
2 | strip_id_or_class: paginate | 2 | strip_id_or_class: paginate |
3 | strip: //p[contains(., 'Additional Resources')] | 3 | strip: //p[contains(., 'Additional Resources')] |
4 | test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/ \ No newline at end of file | 4 | test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hosted.ap.org.txt b/inc/3rdparty/site_config/standard/hosted.ap.org.txt index e19dd526..dfd81937 100644..100755 --- a/inc/3rdparty/site_config/standard/hosted.ap.org.txt +++ b/inc/3rdparty/site_config/standard/hosted.ap.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content'] | 1 | body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content'] |
2 | tidy: no | 2 | tidy: no |
3 | strip_image_src: analytics.apnewsregistry | 3 | strip_image_src: analytics.apnewsregistry |
4 | 4 | ||
5 | test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50 \ No newline at end of file | 5 | test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/howtogeek.com.txt b/inc/3rdparty/site_config/standard/howtogeek.com.txt new file mode 100755 index 00000000..baa2ed4a --- /dev/null +++ b/inc/3rdparty/site_config/standard/howtogeek.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[contains(@class, 'thecontent')] | ||
2 | |||
3 | strip_image_src: loading.gif | ||
4 | find_string:src="http://cdn.howtogeek.com/public/images/blank.gif" | ||
5 | replace_string:- | ||
6 | find_string:data-href= | ||
7 | replace_string:src= | ||
8 | |||
9 | strip_id_or_class: relatedside | ||
10 | |||
11 | test_url: http://www.howtogeek.com/school/microsoft-excel-formulas-and-functions/lesson1/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hs.fi.txt b/inc/3rdparty/site_config/standard/hs.fi.txt index 67125fb5..360dc725 100644..100755 --- a/inc/3rdparty/site_config/standard/hs.fi.txt +++ b/inc/3rdparty/site_config/standard/hs.fi.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | prune: yes | 1 | prune: yes |
2 | tidy: yes | 2 | tidy: yes |
3 | test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405 \ No newline at end of file | 3 | test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ht.ly.txt b/inc/3rdparty/site_config/standard/ht.ly.txt index a8412d2a..46535088 100644..100755 --- a/inc/3rdparty/site_config/standard/ht.ly.txt +++ b/inc/3rdparty/site_config/standard/ht.ly.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //iframe[@id='hootFrame']/@src | 1 | single_page_link: //iframe[@id='hootFrame']/@src |
2 | 2 | ||
3 | test_url: http://ht.ly/bOiZV \ No newline at end of file | 3 | test_url: http://ht.ly/bOiZV \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/huffingtonpost.com.txt b/inc/3rdparty/site_config/standard/huffingtonpost.com.txt index d40513b2..d4618c14 100644..100755 --- a/inc/3rdparty/site_config/standard/huffingtonpost.com.txt +++ b/inc/3rdparty/site_config/standard/huffingtonpost.com.txt | |||
@@ -1,16 +1,21 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')] | 2 | body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')] |
3 | date: //meta[@name="publish_date"]/@content | 3 | date: //meta[@name="publish_date"]/@content |
4 | author: //a[@rel="author"] | 4 | author: //a[@rel="author"] |
5 | author: //meta[@name="author"]/@content | 5 | author: //meta[@name="author"]/@content |
6 | prune: no | 6 | |
7 | tidy: no | 7 | prune: no |
8 | strip: //footer | 8 | tidy: no |
9 | strip_id_or_class: ps-slideshow | 9 | |
10 | strip_id_or_class: fs-slideshow | 10 | strip: //footer |
11 | strip: //p[contains(., 'Related on HuffPost:')] | 11 | strip_id_or_class: ps-slideshow |
12 | # end early | 12 | strip_id_or_class: fs-slideshow |
13 | replace_string(<div class="sbm-main): </body></html><div class="not-interested | 13 | strip: //p[contains(., 'Related on HuffPost:')] |
14 | 14 | strip_id_or_class: contribute-story | |
15 | test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html | 15 | strip_id_or_class: promo_holder |
16 | test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html \ No newline at end of file | 16 | |
17 | # end early | ||
18 | replace_string(<div class="sbm-main): </body></html><div class="not-interested | ||
19 | |||
20 | test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html | ||
21 | test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html | ||
diff --git a/inc/3rdparty/site_config/standard/humantransit.org.txt b/inc/3rdparty/site_config/standard/humantransit.org.txt index ec7d3c06..92d3c678 100644..100755 --- a/inc/3rdparty/site_config/standard/humantransit.org.txt +++ b/inc/3rdparty/site_config/standard/humantransit.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h3[@class="entry-header"] | 1 | title: //h3[@class="entry-header"] |
2 | date: //h2[@class="date-header"] | 2 | date: //h2[@class="date-header"] |
3 | body: //div[contains(@class, 'entry')] | 3 | body: //div[contains(@class, 'entry')] |
4 | 4 | ||
5 | test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html \ No newline at end of file | 5 | test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt b/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt index ccf09dcc..68fd220a 100644..100755 --- a/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt +++ b/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='HaberDetayTitleHold Title']/h1 | 1 | title: //div[@class='HaberDetayTitleHold Title']/h1 |
2 | body: //div[@id='YazarDetayText'] | 2 | body: //div[@id='YazarDetayText'] |
3 | author: //div[@class='HaberDetayTitleHold Title']/h1 | 3 | author: //div[@class='HaberDetayTitleHold Title']/h1 |
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp | 6 | test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp |
7 | test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp \ No newline at end of file | 7 | test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hvg.hu.txt b/inc/3rdparty/site_config/standard/hvg.hu.txt index 06fa98d8..05e7b5f1 100644..100755 --- a/inc/3rdparty/site_config/standard/hvg.hu.txt +++ b/inc/3rdparty/site_config/standard/hvg.hu.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id='pg-content']//h1 | 1 | title: //div[@id='pg-content']//h1 |
2 | body: //div[@id='articleBody0'] | 2 | body: //div[@id='articleBody0'] |
3 | replace_string(</table>): </table><br /><br /> | 3 | replace_string(</table>): </table><br /><br /> |
4 | 4 | ||
5 | single_page_link: //div[@class="up-header"]/a | 5 | single_page_link: //div[@class="up-header"]/a |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://hvg.hu/w/20111125_sparta \ No newline at end of file | 9 | test_url: http://hvg.hu/w/20111125_sparta \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/hypebeast.com.txt b/inc/3rdparty/site_config/standard/hypebeast.com.txt index 49b46da5..23e47545 100644..100755 --- a/inc/3rdparty/site_config/standard/hypebeast.com.txt +++ b/inc/3rdparty/site_config/standard/hypebeast.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1] | 1 | body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1] |
2 | author: //span[@class='author']/a | 2 | author: //span[@class='author']/a |
3 | 3 | ||
4 | strip_id_or_class: disqus | 4 | strip_id_or_class: disqus |
5 | strip_id_or_class: paginator | 5 | strip_id_or_class: paginator |
6 | strip_id_or_class: photo-number | 6 | strip_id_or_class: photo-number |
7 | 7 | ||
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/ \ No newline at end of file | 10 | test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/icannabis.tumblr.com.txt b/inc/3rdparty/site_config/standard/icannabis.tumblr.com.txt new file mode 100755 index 00000000..3bda753c --- /dev/null +++ b/inc/3rdparty/site_config/standard/icannabis.tumblr.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | tidy:no | ||
2 | prune:no | ||
3 | |||
4 | body://div[contains(@id,'content')] | ||
5 | |||
6 | strip_id_or_class:meta | ||
7 | strip_id_or_class:notes | ||
8 | strip_id_or_class:pagination | ||
9 | test_url: http://icannabis.tumblr.com/post/28660592471/reviewmswireless3000 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/idealog.co.nz.txt b/inc/3rdparty/site_config/standard/idealog.co.nz.txt new file mode 100755 index 00000000..ca88f606 --- /dev/null +++ b/inc/3rdparty/site_config/standard/idealog.co.nz.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@class='content'] | ||
2 | |||
3 | strip: //p[@class='dateline'] | ||
4 | strip: //hr | ||
5 | strip_id_or_class: share | ||
6 | strip_id_or_class: comments | ||
7 | strip_id_or_class: tags | ||
8 | |||
9 | title: substring-before(//title,' ::') | ||
10 | author: substring-before(//p[@class='dateline'],',') | ||
11 | date: //p[@class='dateline']/time | ||
12 | test_url: http://www.idealog.co.nz/blog/2012/12/geeks-plane-help-kiwis-take-san-francisco \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/idlewords.com.txt b/inc/3rdparty/site_config/standard/idlewords.com.txt index e1badef7..f3b33796 100644..100755 --- a/inc/3rdparty/site_config/standard/idlewords.com.txt +++ b/inc/3rdparty/site_config/standard/idlewords.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //a[@class='post_title'] | 1 | title: //a[@class='post_title'] |
2 | body: //div[@class='entrybox'] | 2 | body: //div[@class='entrybox'] |
3 | strip_id_or_class: post_title | 3 | strip_id_or_class: post_title |
4 | date: //div[@class='entrybox']/b[1] | 4 | date: //div[@class='entrybox']/b[1] |
5 | strip: //div[@class='entrybox']/b[1] | 5 | strip: //div[@class='entrybox']/b[1] |
6 | author: string('Maciej Cegłowski') | 6 | author: string('Maciej Cegłowski') |
7 | test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm \ No newline at end of file | 7 | test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/igeneration.fr.txt b/inc/3rdparty/site_config/standard/igeneration.fr.txt index d7ec2da1..45dd5f25 100644..100755 --- a/inc/3rdparty/site_config/standard/igeneration.fr.txt +++ b/inc/3rdparty/site_config/standard/igeneration.fr.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ') | 1 | author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ') |
2 | date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- '))) | 2 | date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- '))) |
3 | body: //div[@class='content clear-block zoneApple'] | 3 | body: //div[@class='content clear-block zoneApple'] |
4 | 4 | ||
5 | test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112 \ No newline at end of file | 5 | test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt b/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt index f74178a9..60635301 100644..100755 --- a/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt +++ b/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title://h1[@class='page-title'] | 1 | title://h1[@class='page-title'] |
2 | body://*[@id='content']//div[contains(@class,'node-content')] | 2 | body://*[@id='content']//div[contains(@class,'node-content')] |
3 | 3 | ||
4 | author://*[@id='content']//div[contains(@class,'node-submitted')]/a | 4 | author://*[@id='content']//div[contains(@class,'node-submitted')]/a |
5 | 5 | ||
6 | date:substring-after(//div[contains(@class,'node-submitted')],' on ') | 6 | date:substring-after(//div[contains(@class,'node-submitted')],' on ') |
7 | test_url: http://ignoredbydinosaurs.com/2011/09/great-lie-lorem-ipsum \ No newline at end of file | 7 | test_url: http://ignoredbydinosaurs.com/2011/09/great-lie-lorem-ipsum \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ilounge.com.txt b/inc/3rdparty/site_config/standard/ilounge.com.txt index ca1e54a8..9880b51f 100644..100755 --- a/inc/3rdparty/site_config/standard/ilounge.com.txt +++ b/inc/3rdparty/site_config/standard/ilounge.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | # Get proper Title, Author and Date info | 1 | # Get proper Title, Author and Date info |
2 | title: substring-before(//title, '|') | 2 | title: substring-before(//title, '|') |
3 | author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By') | 3 | author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By') |
4 | date: //span[@class='instapaper_date'] | 4 | date: //span[@class='instapaper_date'] |
5 | 5 | ||
6 | # For Reviews & First Looks, get the intro paragraph and put it in front of the main body. | 6 | # For Reviews & First Looks, get the intro paragraph and put it in front of the main body. |
7 | move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body'] | 7 | move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body'] |
8 | body: //div[@id='instapaper_para1'] | 8 | body: //div[@id='instapaper_para1'] |
9 | strip: //div[@class='reviewinfo'] | 9 | strip: //div[@class='reviewinfo'] |
10 | 10 | ||
11 | # We don't use footnotes, so why bother checking for them? | 11 | # We don't use footnotes, so why bother checking for them? |
12 | footnotes: no | 12 | footnotes: no |
13 | test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter \ No newline at end of file | 13 | test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ilyabirman.ru.txt b/inc/3rdparty/site_config/standard/ilyabirman.ru.txt index da6a60f6..51a7eb9c 100644..100755 --- a/inc/3rdparty/site_config/standard/ilyabirman.ru.txt +++ b/inc/3rdparty/site_config/standard/ilyabirman.ru.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class='published visible e2-smart-title']//span | 1 | title: //div[@class='published visible e2-smart-title']//span |
2 | author: //span[@id='e2-blog-title'] | 2 | author: //span[@id='e2-blog-title'] |
3 | date: //p[@class='super-h'] | 3 | date: //p[@class='super-h'] |
4 | body: //div[@class='text published visible'] | 4 | body: //div[@class='text published visible'] |
5 | test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/ \ No newline at end of file | 5 | test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/inc.com.txt b/inc/3rdparty/site_config/standard/inc.com.txt index 0589aaae..5410e64e 100644..100755 --- a/inc/3rdparty/site_config/standard/inc.com.txt +++ b/inc/3rdparty/site_config/standard/inc.com.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | author: substring-after(substring-before(//div[@id='byline'],'|'),'By') | 1 | author: substring-after(substring-before(//div[@id='byline'],'|'),'By') |
2 | author: //div[@class='byline']/a | 2 | author: //div[@class='byline']/a |
3 | date: //span[@class='pubdate'] | 3 | date: //span[@class='pubdate'] |
4 | # print friendly page | 4 | # print friendly page |
5 | body: //div[@id='text'] | 5 | body: //div[@id='text'] |
6 | # regular page | 6 | # regular page |
7 | body: //div[@id= 'articlecontent'] | 7 | body: //div[@id= 'articlecontent'] |
8 | 8 | ||
9 | strip: //div[@id= 'articlecontent']/h1 | 9 | strip: //div[@id= 'articlecontent']/h1 |
10 | strip: //div[@id='articlecontent']/p[@class='deck'] | 10 | strip: //div[@id='articlecontent']/p[@class='deck'] |
11 | strip: //div[@id='articlecontent']/div[@class='byline'] | 11 | strip: //div[@id='articlecontent']/div[@class='byline'] |
12 | strip: //div[@id='articlespacer'] | 12 | strip: //div[@id='articlespacer'] |
13 | strip: //div[@id='incsharebox'] | 13 | strip: //div[@id='incsharebox'] |
14 | strip: //div[@id='articlesidebar'] | 14 | strip: //div[@id='articlesidebar'] |
15 | 15 | ||
16 | prune: no | 16 | prune: no |
17 | 17 | ||
18 | single_page_link: //a[contains(@href, 'Printer_Friendly.html')] | 18 | single_page_link: //a[contains(@href, 'Printer_Friendly.html')] |
19 | strip: //a[contains(., 'Dig Deeper')] | 19 | strip: //a[contains(., 'Dig Deeper')] |
20 | test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html | 20 | test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html |
21 | test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html \ No newline at end of file | 21 | test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/independent.co.uk.txt b/inc/3rdparty/site_config/standard/independent.co.uk.txt index 47baf36b..af742209 100644..100755 --- a/inc/3rdparty/site_config/standard/independent.co.uk.txt +++ b/inc/3rdparty/site_config/standard/independent.co.uk.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | body: //div[contains(@class, 'articleContent')] | 2 | body: //div[contains(@class, 'articleContent')] |
3 | date: //meta[@property='article:published_time']/@content | 3 | date: //meta[@property='article:published_time']/@content |
4 | author: //div[@id='main']//div[@class='byline']//span[@class='authorName'] | 4 | author: //div[@id='main']//div[@class='byline']//span[@class='authorName'] |
5 | 5 | ||
6 | strip_id_or_class: RelatedArtTag | 6 | strip_id_or_class: RelatedArtTag |
7 | 7 | ||
8 | tidy: no | 8 | tidy: no |
9 | test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html \ No newline at end of file | 9 | test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/indiatimes.com.txt b/inc/3rdparty/site_config/standard/indiatimes.com.txt index e7a35e84..8112105f 100644..100755 --- a/inc/3rdparty/site_config/standard/indiatimes.com.txt +++ b/inc/3rdparty/site_config/standard/indiatimes.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //figure[@class='mainVideo'] | 1 | body: //figure[@class='mainVideo'] |
2 | strip: //figcaption | 2 | strip: //figcaption |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html \ No newline at end of file | 6 | test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/inessential.com.txt b/inc/3rdparty/site_config/standard/inessential.com.txt index 312cec4b..52252455 100644..100755 --- a/inc/3rdparty/site_config/standard/inessential.com.txt +++ b/inc/3rdparty/site_config/standard/inessential.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class='weblogPost']/h3[1] | 1 | title: //div[@class='weblogPost']/h3[1] |
2 | author: ("Brent Simmons") | 2 | author: ("Brent Simmons") |
3 | date: //span[@class="weblogPostDisplayDate"] | 3 | date: //span[@class="weblogPostDisplayDate"] |
4 | body: //div[@class='weblogPostBody'] | 4 | body: //div[@class='weblogPostBody'] |
5 | test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo \ No newline at end of file | 5 | test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/info.abril.com.br.txt b/inc/3rdparty/site_config/standard/info.abril.com.br.txt index 64cf3c8e..dee69f80 100644..100755 --- a/inc/3rdparty/site_config/standard/info.abril.com.br.txt +++ b/inc/3rdparty/site_config/standard/info.abril.com.br.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | body://div[@id='texto_link'] | 2 | body://div[@id='texto_link'] |
3 | 3 | ||
4 | test_url: http://info.abril.com.br/noticias/internet/filme-do-youtube-vai-estrear-nos-cinemas-22042011-6.shl \ No newline at end of file | 4 | test_url: http://info.abril.com.br/noticias/internet/filme-do-youtube-vai-estrear-nos-cinemas-22042011-6.shl \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/infoq.com.txt b/inc/3rdparty/site_config/standard/infoq.com.txt index 3a4e402d..f4a328a6 100644..100755 --- a/inc/3rdparty/site_config/standard/infoq.com.txt +++ b/inc/3rdparty/site_config/standard/infoq.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | body: //div[@id="intTranscript"] | 1 | body: //div[@id="intTranscript"] |
2 | body: //div[@class="box-content"] | 2 | body: //div[@class="box-content"] |
3 | title: //div[@class="box-content"]//h1[1] | 3 | title: //div[@class="box-content"]//h1[1] |
4 | author: //p[@class="info"]/strong | 4 | author: //p[@class="info"]/strong |
5 | date: substring-before(substring-after(//p[@class="info"], "on"), "Length") | 5 | date: substring-before(substring-after(//p[@class="info"], "on"), "Length") |
6 | strip: //div[@class="box-content"]//h1[1] | 6 | strip: //div[@class="box-content"]//h1[1] |
7 | strip: //div[@class="box-content"]//p[@class="info"] | 7 | strip: //div[@class="box-content"]//p[@class="info"] |
8 | strip_id_or_class: vendor-content-box | 8 | strip_id_or_class: vendor-content-box |
9 | strip_id_or_class: tags2 | 9 | strip_id_or_class: tags2 |
10 | strip_id_or_class: instructions | 10 | strip_id_or_class: instructions |
11 | strip_id_or_class: comments | 11 | strip_id_or_class: comments |
12 | strip_id_or_class: forum-list-tree | 12 | strip_id_or_class: forum-list-tree |
13 | strip: //div[@class="addthis_toolbox addthis_default_style"] | 13 | strip: //div[@class="addthis_toolbox addthis_default_style"] |
14 | test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview \ No newline at end of file | 14 | test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/informador.com.mx.txt b/inc/3rdparty/site_config/standard/informador.com.mx.txt index eedec24f..77987493 100644..100755 --- a/inc/3rdparty/site_config/standard/informador.com.mx.txt +++ b/inc/3rdparty/site_config/standard/informador.com.mx.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@class='tituloInt'] | 1 | title: //div[@class='tituloInt'] |
2 | body: //div[@class='notaPortada'] | 2 | body: //div[@class='notaPortada'] |
3 | strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota'] | 3 | strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota'] |
4 | date: //span[@class='publi'] | 4 | date: //span[@class='publi'] |
5 | author: //span[@class='autor'] | 5 | author: //span[@class='autor'] |
6 | tidy: no | 6 | tidy: no |
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm \ No newline at end of file | 9 | test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/information.dk.txt b/inc/3rdparty/site_config/standard/information.dk.txt index 6e3c3b1a..3ade754d 100644..100755 --- a/inc/3rdparty/site_config/standard/information.dk.txt +++ b/inc/3rdparty/site_config/standard/information.dk.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | author: //*[@property='dc:creator'] | 2 | author: //*[@property='dc:creator'] |
3 | date: //*[@property='dc:date']/@content | 3 | date: //*[@property='dc:date']/@content |
4 | body: //div[@id='page-content']//div[contains(@class, 'article-body')] | 4 | body: //div[@id='page-content']//div[contains(@class, 'article-body')] |
5 | 5 | ||
6 | tidy: no | 6 | tidy: no |
7 | test_url: http://www.information.dk/282307 \ No newline at end of file | 7 | test_url: http://www.information.dk/282307 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/informationarchitects.net.txt b/inc/3rdparty/site_config/standard/informationarchitects.net.txt index 134306cd..1330a040 100644..100755 --- a/inc/3rdparty/site_config/standard/informationarchitects.net.txt +++ b/inc/3rdparty/site_config/standard/informationarchitects.net.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title://h1[@class="post_title"] | 1 | title://h1[@class="post_title"] |
2 | body://article[@class="post"] | 2 | body://article[@class="post"] |
3 | date://h1[@class="section_separator"] | 3 | date://h1[@class="section_separator"] |
4 | author://span[@class="post_author"] | 4 | author://span[@class="post_author"] |
5 | strip://nav[@class="arrow_nav"] | 5 | strip://nav[@class="arrow_nav"] |
6 | strip://section[@id="contact"] | 6 | strip://section[@id="contact"] |
7 | strip_id_or_class:post_title | 7 | strip_id_or_class:post_title |
8 | strip_id_or_class:post_author | 8 | strip_id_or_class:post_author |
9 | strip_id_or_class:section_separator | 9 | strip_id_or_class:section_separator |
10 | test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/ \ No newline at end of file | 10 | test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt b/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt index 0879e9e6..60b798e6 100644..100755 --- a/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt +++ b/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //head/title | 1 | title: //head/title |
2 | body: //table[@id='table3']//div[@class='postContent'] | 2 | body: //table[@id='table3']//div[@class='postContent'] |
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | test_url: http://www.informationclearinghouse.info/article28238.htm \ No newline at end of file | 6 | test_url: http://www.informationclearinghouse.info/article28238.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/informit.com.txt b/inc/3rdparty/site_config/standard/informit.com.txt index 84c1fdcf..24bf6242 100644..100755 --- a/inc/3rdparty/site_config/standard/informit.com.txt +++ b/inc/3rdparty/site_config/standard/informit.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@id='content']/h1 | 1 | title: //div[@id='content']/h1 |
2 | body: //div[@id="content"] | 2 | body: //div[@id="content"] |
3 | strip: //img[contains(@src, 'informit_printer.png')] | 3 | strip: //img[contains(@src, 'informit_printer.png')] |
4 | single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')] | 4 | single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')] |
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.informit.com/articles/article.aspx?p=1729268 \ No newline at end of file | 7 | test_url: http://www.informit.com/articles/article.aspx?p=1729268 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/infoworld.com.txt b/inc/3rdparty/site_config/standard/infoworld.com.txt index dd588ed8..d335bc4a 100644..100755 --- a/inc/3rdparty/site_config/standard/infoworld.com.txt +++ b/inc/3rdparty/site_config/standard/infoworld.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@id='main_text'] | 1 | body: //div[@id='main_text'] |
2 | title: //div[@id='main_text']/h1 | 2 | title: //div[@id='main_text']/h1 |
3 | strip: //div[@id='main_text']/h1 | 3 | strip: //div[@id='main_text']/h1 |
4 | strip: //div[@id='main_text']/h2 | 4 | strip: //div[@id='main_text']/h2 |
5 | strip_id_or_class: tools | 5 | strip_id_or_class: tools |
6 | strip_id_or_class: articleTools | 6 | strip_id_or_class: articleTools |
7 | strip_id_or_class: pagination | 7 | strip_id_or_class: pagination |
8 | strip_id_or_class: byline | 8 | strip_id_or_class: byline |
9 | strip_id_or_class: tweet | 9 | strip_id_or_class: tweet |
10 | date: //div[@class='date'] | 10 | date: //div[@class='date'] |
11 | strip: //div[@class='date'] | 11 | strip: //div[@class='date'] |
12 | test_url: http://www.infoworld.com/d/the-industry-standard/it-jobs-the-rise-both-offshore-and-in-us-187689 \ No newline at end of file | 12 | test_url: http://www.infoworld.com/d/the-industry-standard/it-jobs-the-rise-both-offshore-and-in-us-187689 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/infzm.com.txt b/inc/3rdparty/site_config/standard/infzm.com.txt index 012c873f..489d5aff 100644..100755 --- a/inc/3rdparty/site_config/standard/infzm.com.txt +++ b/inc/3rdparty/site_config/standard/infzm.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | # This filter is tested on: | 1 | # This filter is tested on: |
2 | # http://www.infzm.com/content/71068 | 2 | # http://www.infzm.com/content/71068 |
3 | # http://www.infzm.com/content/41577 | 3 | # http://www.infzm.com/content/41577 |
4 | 4 | ||
5 | author://em[contains(@class, 'toAuthor')] | 5 | author://em[contains(@class, 'toAuthor')] |
6 | date:substring(//em[contains(@class, 'pubTime')],1) | 6 | date:substring(//em[contains(@class, 'pubTime')],1) |
7 | body://section[contains(@id, 'articleContent')] | 7 | body://section[contains(@id, 'articleContent')] |
8 | title://h1[contains(@class ,'articleHeadline clearfix')] | 8 | title://h1[contains(@class ,'articleHeadline clearfix')] |
9 | test_url: http://www.infzm.com/content/41577 \ No newline at end of file | 9 | test_url: http://www.infzm.com/content/41577 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/inhabitat.com.txt b/inc/3rdparty/site_config/standard/inhabitat.com.txt index 6629dafe..c63f53a6 100644..100755 --- a/inc/3rdparty/site_config/standard/inhabitat.com.txt +++ b/inc/3rdparty/site_config/standard/inhabitat.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | # set body | 1 | # set body |
2 | body: //div[@class='post-listing'] | 2 | body: //div[@class='post-listing'] |
3 | 3 | ||
4 | # remove clutter | 4 | # remove clutter |
5 | strip: //a/big | 5 | strip: //a/big |
6 | strip: //a/em | 6 | strip: //a/em |
7 | strip: //p/em | 7 | strip: //p/em |
8 | test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/ \ No newline at end of file | 8 | test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/instagr.am.txt b/inc/3rdparty/site_config/standard/instagr.am.txt index ad9e8214..522caebc 100644..100755 --- a/inc/3rdparty/site_config/standard/instagr.am.txt +++ b/inc/3rdparty/site_config/standard/instagr.am.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='caption'] | 1 | title: //div[@class='caption'] |
2 | author: //p[@class='username'] | 2 | author: //p[@class='username'] |
3 | 3 | ||
4 | strip: //div[@class='contents']/h3 | 4 | strip: //div[@class='contents']/h3 |
5 | strip: //div[@class='location'] | 5 | strip: //div[@class='location'] |
6 | test_url: http://instagr.am/p/G-s_aciyDJ/ \ No newline at end of file | 6 | test_url: http://instagr.am/p/G-s_aciyDJ/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/interest.co.nz.txt b/inc/3rdparty/site_config/standard/interest.co.nz.txt index 28c3310a..28c3310a 100644..100755 --- a/inc/3rdparty/site_config/standard/interest.co.nz.txt +++ b/inc/3rdparty/site_config/standard/interest.co.nz.txt | |||
diff --git a/inc/3rdparty/site_config/standard/iolanguage.com.txt b/inc/3rdparty/site_config/standard/iolanguage.com.txt index 231875ad..231875ad 100644..100755 --- a/inc/3rdparty/site_config/standard/iolanguage.com.txt +++ b/inc/3rdparty/site_config/standard/iolanguage.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/ipadclub.nl.txt b/inc/3rdparty/site_config/standard/ipadclub.nl.txt index d196059e..afe058df 100644..100755 --- a/inc/3rdparty/site_config/standard/ipadclub.nl.txt +++ b/inc/3rdparty/site_config/standard/ipadclub.nl.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | 1 | body: //div[@id = 'post'] |
2 | strip: //div[@class = 'postinfo'] | 2 | strip: //div[@class = 'postinfo'] |
3 | strip: //div[@id = 'postmetanew'] | 3 | strip: //div[@id = 'postmetanew'] |
4 | strip: //div[@class = 'paginator'] | 4 | strip: //div[@class = 'paginator'] |
5 | strip: //div[@class = 'col-2'] | 5 | strip: //div[@class = 'col-2'] |
6 | strip: //div[@id = 'adfactor-label'] | 6 | strip: //div[@id = 'adfactor-label'] |
7 | test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/ \ No newline at end of file | 7 | test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ipadplanet.nl.txt b/inc/3rdparty/site_config/standard/ipadplanet.nl.txt index a2e49005..dedb5572 100644..100755 --- a/inc/3rdparty/site_config/standard/ipadplanet.nl.txt +++ b/inc/3rdparty/site_config/standard/ipadplanet.nl.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | 1 | body: //div[@id = 'post'] |
2 | strip: //div[@class = 'postinfo'] | 2 | strip: //div[@class = 'postinfo'] |
3 | strip: //div[@id = 'postmetanew'] | 3 | strip: //div[@id = 'postmetanew'] |
4 | strip: //div[@class = 'paginator'] | 4 | strip: //div[@class = 'paginator'] |
5 | strip: //div[@class = 'col-2'] | 5 | strip: //div[@class = 'col-2'] |
6 | strip: //div[@id = 'adfactor-label'] | 6 | strip: //div[@id = 'adfactor-label'] |
7 | test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/ \ No newline at end of file | 7 | test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/iphoneclub.nl.txt b/inc/3rdparty/site_config/standard/iphoneclub.nl.txt index f8d4f6a6..850a24e9 100644..100755 --- a/inc/3rdparty/site_config/standard/iphoneclub.nl.txt +++ b/inc/3rdparty/site_config/standard/iphoneclub.nl.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | 1 | body: //div[@id = 'post'] |
2 | strip: //div[@class = 'postinfo'] | 2 | strip: //div[@class = 'postinfo'] |
3 | strip: //div[@id = 'postmetanew'] | 3 | strip: //div[@id = 'postmetanew'] |
4 | strip: //div[@class = 'paginator'] | 4 | strip: //div[@class = 'paginator'] |
5 | strip: //div[@class = 'col-2'] | 5 | strip: //div[@class = 'col-2'] |
6 | strip: //div[@id = 'adfactor-label'] | 6 | strip: //div[@id = 'adfactor-label'] |
7 | test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/ \ No newline at end of file | 7 | test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/iphonehacks.com.txt b/inc/3rdparty/site_config/standard/iphonehacks.com.txt index c97ff43c..e8ccea06 100644..100755 --- a/inc/3rdparty/site_config/standard/iphonehacks.com.txt +++ b/inc/3rdparty/site_config/standard/iphonehacks.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //meta[@name='og:title']/@content | 1 | title: //meta[@name='og:title']/@content |
2 | body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')] | 2 | body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')] |
3 | 3 | ||
4 | strip: //span[@vanilla-identifier] | 4 | strip: //span[@vanilla-identifier] |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | tidy: no | 7 | tidy: no |
8 | 8 | ||
9 | test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html \ No newline at end of file | 9 | test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/iplaysoft.com.txt b/inc/3rdparty/site_config/standard/iplaysoft.com.txt index 4a944768..4a944768 100644..100755 --- a/inc/3rdparty/site_config/standard/iplaysoft.com.txt +++ b/inc/3rdparty/site_config/standard/iplaysoft.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/isource.com.txt b/inc/3rdparty/site_config/standard/isource.com.txt index a1c16a16..215fdf87 100644..100755 --- a/inc/3rdparty/site_config/standard/isource.com.txt +++ b/inc/3rdparty/site_config/standard/isource.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | # Remove social buttons | 1 | # Remove social buttons |
2 | strip: //div[@id='temp_Content_Right'] | 2 | strip: //div[@id='temp_Content_Right'] |
3 | 3 | ||
4 | # Remove duplicate article title | 4 | # Remove duplicate article title |
5 | strip: //*[(@class='storytitle')] | 5 | strip: //*[(@class='storytitle')] |
6 | test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/ \ No newline at end of file | 6 | test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/itavisen.no.txt b/inc/3rdparty/site_config/standard/itavisen.no.txt index 8da78cb0..3ba484a7 100644..100755 --- a/inc/3rdparty/site_config/standard/itavisen.no.txt +++ b/inc/3rdparty/site_config/standard/itavisen.no.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | author: //p[@class = 'writer'] | 1 | author: //p[@class = 'writer'] |
2 | 2 | ||
3 | date: //p[@class = 'published-time'] | 3 | date: //p[@class = 'published-time'] |
4 | 4 | ||
5 | body: //div[@class = 'text main'] | 5 | body: //div[@class = 'text main'] |
6 | test_url: http://www.itavisen.no/899786/old-republic-blir-gratis \ No newline at end of file | 6 | test_url: http://www.itavisen.no/899786/old-republic-blir-gratis \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/itmedia.co.jp.txt b/inc/3rdparty/site_config/standard/itmedia.co.jp.txt new file mode 100755 index 00000000..97f00ce8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/itmedia.co.jp.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[@id='cmsBody'] | ||
2 | |||
3 | next_page_link: //span[@id='next']/a | ||
4 | |||
5 | strip_id_or_class: cmsCopyright | ||
6 | strip_id_or_class: masterSocialbuttonBtm | ||
7 | |||
8 | test_url: http://www.itmedia.co.jp/enterprise/articles/0912/05/news002.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/itstactical.com.txt b/inc/3rdparty/site_config/standard/itstactical.com.txt index 550875ec..b8cb461c 100644..100755 --- a/inc/3rdparty/site_config/standard/itstactical.com.txt +++ b/inc/3rdparty/site_config/standard/itstactical.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h1[@class="entry-title"] | 1 | title: //h1[@class="entry-title"] |
2 | body: //div[@class='format_text entry-content'] | 2 | body: //div[@class='format_text entry-content'] |
3 | author: //span[@class="author vcard"]/a | 3 | author: //span[@class="author vcard"]/a |
4 | date: //abbr[@class="published"] | 4 | date: //abbr[@class="published"] |
5 | 5 | ||
6 | strip_id_or_class: related-posts | 6 | strip_id_or_class: related-posts |
7 | strip_id_or_class: membershipbox | 7 | strip_id_or_class: membershipbox |
8 | strip_id_or_class: share_this_compact_bt | 8 | strip_id_or_class: share_this_compact_bt |
9 | 9 | ||
10 | 10 | ||
11 | footnotes: no | 11 | footnotes: no |
12 | test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/ \ No newline at end of file | 12 | test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/itwire.com.txt b/inc/3rdparty/site_config/standard/itwire.com.txt new file mode 100755 index 00000000..72b41065 --- /dev/null +++ b/inc/3rdparty/site_config/standard/itwire.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: //a[@rel="author"] | ||
2 | date: //li[@class="itemDateCreated"] | ||
3 | strip: //div[contains(@class, 'legend-rounded')] | ||
4 | |||
5 | test_url: http://www.itwire.com/it-industry-news/market/59661-ibm-looks-to-high-value-solutions-to-meet-changing-demands | ||
diff --git a/inc/3rdparty/site_config/standard/itworld.com.txt b/inc/3rdparty/site_config/standard/itworld.com.txt index d4fa604e..1ee0ee58 100644..100755 --- a/inc/3rdparty/site_config/standard/itworld.com.txt +++ b/inc/3rdparty/site_config/standard/itworld.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //*[@id="article-title"] | 1 | title: //*[@id="article-title"] |
2 | author: //*[@id="article-info"]/strong | 2 | author: //*[@id="article-info"]/strong |
3 | date: //*[@class="article-dateline"]/strong | 3 | date: //*[@class="article-dateline"]/strong |
4 | body: //*[@id="article-content"] | 4 | body: //*[@id="article-content"] |
5 | test_url: http://www.itworld.com/open-source/140916/android-sued-microsoft-not-linux \ No newline at end of file | 5 | test_url: http://www.itworld.com/open-source/140916/android-sued-microsoft-not-linux \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/izismile.com.txt b/inc/3rdparty/site_config/standard/izismile.com.txt index af3f299a..b0114d35 100644..100755 --- a/inc/3rdparty/site_config/standard/izismile.com.txt +++ b/inc/3rdparty/site_config/standard/izismile.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[starts-with(@id, 'news-id-')] | 1 | body: //div[starts-with(@id, 'news-id-')] |
2 | prune: no | 2 | prune: no |
3 | 3 | ||
4 | test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html \ No newline at end of file | 4 | test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/jalopnik.com.txt b/inc/3rdparty/site_config/standard/jalopnik.com.txt index fc2eef8e..fc2eef8e 100644..100755 --- a/inc/3rdparty/site_config/standard/jalopnik.com.txt +++ b/inc/3rdparty/site_config/standard/jalopnik.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/jandan.net.txt b/inc/3rdparty/site_config/standard/jandan.net.txt index f1dd3d17..343fd6fb 100644..100755 --- a/inc/3rdparty/site_config/standard/jandan.net.txt +++ b/inc/3rdparty/site_config/standard/jandan.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@id='content']//div[@class = 'post f'] | 1 | body: //div[@id='content']//div[@class = 'post f'] |
2 | strip_id_or_class: comment-big | 2 | strip_id_or_class: comment-big |
3 | strip_id_or_class: avatar | 3 | strip_id_or_class: avatar |
4 | strip: //div[@class='time_s'] | 4 | strip: //div[@class='time_s'] |
5 | 5 | ||
6 | test_url: http://jandan.net/2011/04/03/iphone-5-sony.html \ No newline at end of file | 6 | test_url: http://jandan.net/2011/04/03/iphone-5-sony.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt index 6e8af934..00e4cf63 100644..100755 --- a/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt +++ b/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //p[contains(@class, 'author')]/a | 2 | author: //p[contains(@class, 'author')]/a |
3 | date: //p[contains(@class, 'time')] | 3 | date: //p[contains(@class, 'time')] |
4 | body: //div[@class='content']/div[contains(@class, 'text')] | 4 | body: //div[@class='content']/div[contains(@class, 'text')] |
5 | 5 | ||
6 | # prevent "no text" errors on multi-page articles | 6 | # prevent "no text" errors on multi-page articles |
7 | tidy: no | 7 | tidy: no |
8 | 8 | ||
9 | # we use a custom next-link detector instead of the print view because | 9 | # we use a custom next-link detector instead of the print view because |
10 | # it's pretty hard to strip out the unwanted parts in the print view | 10 | # it's pretty hard to strip out the unwanted parts in the print view |
11 | autodetect_next_page: no | 11 | autodetect_next_page: no |
12 | next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more '] | 12 | next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more '] |
13 | 13 | ||
14 | strip: //h1 | 14 | strip: //h1 |
15 | 15 | ||
16 | strip_id_or_class: meta | 16 | strip_id_or_class: meta |
17 | strip_id_or_class: author | 17 | strip_id_or_class: author |
18 | strip_id_or_class: paging | 18 | strip_id_or_class: paging |
19 | 19 | ||
20 | # prevent "Report an Error" from being recognized as footnote | 20 | # prevent "Report an Error" from being recognized as footnote |
21 | footnotes: no | 21 | footnotes: no |
22 | test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken \ No newline at end of file | 22 | test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/jjahnke.net.txt b/inc/3rdparty/site_config/standard/jjahnke.net.txt index 95c45ee7..d45c8899 100644..100755 --- a/inc/3rdparty/site_config/standard/jjahnke.net.txt +++ b/inc/3rdparty/site_config/standard/jjahnke.net.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class='entry'] | 1 | body: //div[@class='entry'] |
2 | prune: no | 2 | prune: no |
3 | 3 | ||
4 | test_url: http://www.jjahnke.net/rundbr87.html#2514 \ No newline at end of file | 4 | test_url: http://www.jjahnke.net/rundbr87.html#2514 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt b/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt index af8d7d17..1dbe2072 100644..100755 --- a/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt +++ b/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@id='formatCont_en'] | 1 | body: //div[@id='formatCont_en'] |
2 | 2 | ||
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922 \ No newline at end of file | 5 | test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/joelonsoftware.com.txt b/inc/3rdparty/site_config/standard/joelonsoftware.com.txt index 75fbee5a..241a361f 100644..100755 --- a/inc/3rdparty/site_config/standard/joelonsoftware.com.txt +++ b/inc/3rdparty/site_config/standard/joelonsoftware.com.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | # Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html | 1 | # Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html |
2 | 2 | ||
3 | author: substring-after(//div[@class="author"], 'by ') | 3 | author: substring-after(//div[@class="author"], 'by ') |
4 | date: //div[@class="date"] | 4 | date: //div[@class="date"] |
5 | 5 | ||
6 | ## Clean stuff at top ## | 6 | ## Clean stuff at top ## |
7 | 7 | ||
8 | strip: //h1[1] | 8 | strip: //h1[1] |
9 | strip: //h2[1] | 9 | strip: //h2[1] |
10 | strip: //div[@class="date"] | 10 | strip: //div[@class="date"] |
11 | strip: //div[@class="author"] | 11 | strip: //div[@class="author"] |
12 | 12 | ||
13 | ## Clean stuff at bottom ## | 13 | ## Clean stuff at bottom ## |
14 | 14 | ||
15 | strip: //blockquote[@class="textmessage"] | 15 | strip: //blockquote[@class="textmessage"] |
16 | strip: //div[@style="width:500px"]/p[last()] | 16 | strip: //div[@style="width:500px"]/p[last()] |
17 | strip: //div[@style="width:500px"]/p[last()-1] | 17 | strip: //div[@style="width:500px"]/p[last()-1] |
18 | strip: //div[@style="width:500px"]/h4[last()] | 18 | strip: //div[@style="width:500px"]/h4[last()] |
19 | strip: //div[@style="width:500px"]/h4[last()-1] | 19 | strip: //div[@style="width:500px"]/h4[last()-1] |
20 | strip: //div[@style="width:500px"]/div[last()] | 20 | strip: //div[@style="width:500px"]/div[last()] |
21 | test_url: http://www.joelonsoftware.com/items/2011/09/15.html \ No newline at end of file | 21 | test_url: http://www.joelonsoftware.com/items/2011/09/15.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/jouire.com.txt b/inc/3rdparty/site_config/standard/jouire.com.txt index 535a501e..3cf60672 100644..100755 --- a/inc/3rdparty/site_config/standard/jouire.com.txt +++ b/inc/3rdparty/site_config/standard/jouire.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | author: //h1 | 1 | author: //h1 |
2 | date: //p[contains(@class,'date')] | 2 | date: //p[contains(@class,'date')] |
3 | test_url: http://jouire.com/2011/01/exquisite-whispers/ \ No newline at end of file | 3 | test_url: http://jouire.com/2011/01/exquisite-whispers/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/joystiq.com.txt b/inc/3rdparty/site_config/standard/joystiq.com.txt index 7fbd467d..7a8e56f8 100644..100755 --- a/inc/3rdparty/site_config/standard/joystiq.com.txt +++ b/inc/3rdparty/site_config/standard/joystiq.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | author: //a[@class="byline-author"] | 1 | author: //a[@class="byline-author"] |
2 | title: //h1[@class="headline"] | 2 | title: //h1[@class="headline"] |
3 | strip: //div[@id="info-card"] | 3 | strip: //div[@id="info-card"] |
4 | strip: //div[@id="breaking-news"] | 4 | strip: //div[@id="breaking-news"] |
5 | strip: //div[@class="rmod list-post-mod"] | 5 | strip: //div[@class="rmod list-post-mod"] |
6 | strip: //div[@id="footer"] | 6 | strip: //div[@id="footer"] |
7 | strip: //div[@id="GH_strip"] | 7 | strip: //div[@id="GH_strip"] |
8 | test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/ \ No newline at end of file | 8 | test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt b/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt index be844e57..ff5a0244 100644..100755 --- a/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt +++ b/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | body: //div[@id='article_container'] | 1 | body: //div[@id='article_container'] |
2 | author: //h4//a[@class='author'] | 2 | author: //h4//a[@class='author'] |
3 | title: //h1 | 3 | title: //h1 |
4 | 4 | ||
5 | replace_string(lang="en"): lang="de" | 5 | replace_string(lang="en"): lang="de" |
6 | replace_string(/>1</a>):/></a> | 6 | replace_string(/>1</a>):/></a> |
7 | 7 | ||
8 | strip_id_or_class: share_toolbox | 8 | strip_id_or_class: share_toolbox |
9 | strip_id_or_class: article_header | 9 | strip_id_or_class: article_header |
10 | strip_id_or_class: phototext | 10 | strip_id_or_class: phototext |
11 | 11 | ||
12 | strip_image_src: icon_author.gif | 12 | strip_image_src: icon_author.gif |
13 | 13 | ||
14 | strip: //img[@src=''] | 14 | strip: //img[@src=''] |
15 | strip: //h4[@id='author'] | 15 | strip: //h4[@id='author'] |
16 | 16 | ||
17 | prune: no | 17 | prune: no |
18 | 18 | ||
19 | test_url: http://www.juedische-allgemeine.de/article/view/id/13366 \ No newline at end of file | 19 | test_url: http://www.juedische-allgemeine.de/article/view/id/13366 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/juppy.org.txt b/inc/3rdparty/site_config/standard/juppy.org.txt index e2d07f24..fdf7cdc9 100644..100755 --- a/inc/3rdparty/site_config/standard/juppy.org.txt +++ b/inc/3rdparty/site_config/standard/juppy.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | convert_double_br_tags: yes | 1 | convert_double_br_tags: yes |
2 | 2 | ||
3 | title: //div[@id="storycredits"]/p/span[@class="title"] | 3 | title: //div[@id="storycredits"]/p/span[@class="title"] |
4 | author: //div[@id="storycredits"]/p/br[1]/following-sibling::text() | 4 | author: //div[@id="storycredits"]/p/br[1]/following-sibling::text() |
5 | 5 | ||
6 | strip: //div[@id="storycredits"] | 6 | strip: //div[@id="storycredits"] |
7 | 7 | ||
8 | test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005 \ No newline at end of file | 8 | test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kachestvo.ru.txt b/inc/3rdparty/site_config/standard/kachestvo.ru.txt index 34404e96..535693c4 100644..100755 --- a/inc/3rdparty/site_config/standard/kachestvo.ru.txt +++ b/inc/3rdparty/site_config/standard/kachestvo.ru.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[contains(@class, 'inner_content')] | 1 | body: //div[contains(@class, 'inner_content')] |
2 | 2 | ||
3 | test_url: http://kachestvo.ru/promtovar/odezhda/denim.html \ No newline at end of file | 3 | test_url: http://kachestvo.ru/promtovar/odezhda/denim.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kachiblog.com.txt b/inc/3rdparty/site_config/standard/kachiblog.com.txt new file mode 100755 index 00000000..35baf8df --- /dev/null +++ b/inc/3rdparty/site_config/standard/kachiblog.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h3[contains(@class, 'entry-title')] | ||
2 | date: //abbr[@itemprop='datePublished']/@title | ||
3 | body: //div[@itemprop='articleBody'] | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://www.kachiblog.com/2013/05/samsung-galaxy-s4-vs-samsung-galaxy.html | ||
7 | test_url: http://www.kachiblog.com/feeds/posts/default \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kathimerini.gr.txt b/inc/3rdparty/site_config/standard/kathimerini.gr.txt new file mode 100755 index 00000000..2c7c518c --- /dev/null +++ b/inc/3rdparty/site_config/standard/kathimerini.gr.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //td[contains(@class, 'articleTitlos')] | ||
2 | body: //td[contains(@class, 'eelantext')] | ||
3 | |||
4 | test_url: http://www.kathimerini.gr/4dcgi/_w_articles_kathremote_1_03/12/2013_530490 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kenrockwell.com.txt b/inc/3rdparty/site_config/standard/kenrockwell.com.txt index e6d100ea..90c64cbf 100644..100755 --- a/inc/3rdparty/site_config/standard/kenrockwell.com.txt +++ b/inc/3rdparty/site_config/standard/kenrockwell.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | # Ads | 1 | # Ads |
2 | strip: //table[@align="right"][@width="120"] | 2 | strip: //table[@align="right"][@width="120"] |
3 | 3 | ||
4 | # Affiliate link paragraphs | 4 | # Affiliate link paragraphs |
5 | strip: //a[.="Adorama"]/parent::p[contains(., "goodies")] | 5 | strip: //a[.="Adorama"]/parent::p[contains(., "goodies")] |
6 | strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")] | 6 | strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")] |
7 | test_url: http://www.kenrockwell.com/tech/composition.htm \ No newline at end of file | 7 | test_url: http://www.kenrockwell.com/tech/composition.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kicker.de.txt b/inc/3rdparty/site_config/standard/kicker.de.txt index 7d5daa4b..db4f63c4 100644..100755 --- a/inc/3rdparty/site_config/standard/kicker.de.txt +++ b/inc/3rdparty/site_config/standard/kicker.de.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | # set body | 1 | # set body |
2 | body: //div[@id='ovArtikel'] | 2 | body: //div[@id='ovArtikel'] |
3 | 3 | ||
4 | # set title | 4 | # set title |
5 | title: //div[@id='ovArtikel']/h1 | 5 | title: //div[@id='ovArtikel']/h1 |
6 | # strip main title and leave sub title | 6 | # strip main title and leave sub title |
7 | strip: //div[@id='ovArtikel']/h1 | 7 | strip: //div[@id='ovArtikel']/h1 |
8 | 8 | ||
9 | date: //div[@class='publicdate'] | 9 | date: //div[@class='publicdate'] |
10 | 10 | ||
11 | #remove captions | 11 | #remove captions |
12 | strip: //*/div[@class='bu'] | 12 | strip: //*/div[@class='bu'] |
13 | strip: //*/div[@class='credit'] | 13 | strip: //*/div[@class='credit'] |
14 | 14 | ||
15 | #remove adds | 15 | #remove adds |
16 | strip: //*/div[@class='ad-head'] | 16 | strip: //*/div[@class='ad-head'] |
17 | strip: //*/div[@class='linksebay'] | 17 | strip: //*/div[@class='linksebay'] |
18 | 18 | ||
19 | # remove video content | 19 | # remove video content |
20 | strip: //*/div[@class='ovVideo'] | 20 | strip: //*/div[@class='ovVideo'] |
21 | test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html \ No newline at end of file | 21 | test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kickstarter.com.txt b/inc/3rdparty/site_config/standard/kickstarter.com.txt index c055659f..7b3daa58 100644..100755 --- a/inc/3rdparty/site_config/standard/kickstarter.com.txt +++ b/inc/3rdparty/site_config/standard/kickstarter.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1[@id='name'] | 1 | title: //h1[@id='name'] |
2 | body: //*[@id='leftcol'] | 2 | body: //*[@id='leftcol'] |
3 | 3 | ||
4 | strip_id_or_class: 'share-box' | 4 | strip_id_or_class: 'share-box' |
5 | strip_id_or_class: 'project-faqs' | 5 | strip_id_or_class: 'project-faqs' |
6 | strip_id_or_class: 'report-issue-wrap' | 6 | strip_id_or_class: 'report-issue-wrap' |
7 | test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone \ No newline at end of file | 7 | test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kingarthurflour.com.txt b/inc/3rdparty/site_config/standard/kingarthurflour.com.txt index 2f6783a3..b27539f5 100644..100755 --- a/inc/3rdparty/site_config/standard/kingarthurflour.com.txt +++ b/inc/3rdparty/site_config/standard/kingarthurflour.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //div[@class='post']/h2 | 1 | title: //div[@class='post']/h2 |
2 | body: //div[@class='entry'] | 2 | body: //div[@class='entry'] |
3 | strip: //p[contains(.,'Tags:')] | 3 | strip: //p[contains(.,'Tags:')] |
4 | test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/ \ No newline at end of file | 4 | test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kotaku.com.txt b/inc/3rdparty/site_config/standard/kotaku.com.txt index be439d75..be439d75 100644..100755 --- a/inc/3rdparty/site_config/standard/kotaku.com.txt +++ b/inc/3rdparty/site_config/standard/kotaku.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/kottke.org.txt b/inc/3rdparty/site_config/standard/kottke.org.txt index f93a61e7..582f251c 100644..100755 --- a/inc/3rdparty/site_config/standard/kottke.org.txt +++ b/inc/3rdparty/site_config/standard/kottke.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | author: //*[@id='main']/div/a[1] | 2 | author: //*[@id='main']/div/a[1] |
3 | date: substring-before(substring-after(//div[@class='meta'],'•'),'•') | 3 | date: substring-before(substring-after(//div[@class='meta'],'•'),'•') |
4 | body: //div[@id='main'] | 4 | body: //div[@id='main'] |
5 | strip: //div[@class='meta'] | 5 | strip: //div[@class='meta'] |
6 | test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters \ No newline at end of file | 6 | test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kumailplus.com.txt b/inc/3rdparty/site_config/standard/kumailplus.com.txt index 9e15cc34..2f604de0 100644..100755 --- a/inc/3rdparty/site_config/standard/kumailplus.com.txt +++ b/inc/3rdparty/site_config/standard/kumailplus.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class = "entry-full"] | 1 | body: //div[@class = "entry-full"] |
2 | 2 | ||
3 | test_url: http://www.kumailplus.com/2011/12/02/24308 \ No newline at end of file | 3 | test_url: http://www.kumailplus.com/2011/12/02/24308 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kumb.com.txt b/inc/3rdparty/site_config/standard/kumb.com.txt index 3f0d2369..fe350622 100644..100755 --- a/inc/3rdparty/site_config/standard/kumb.com.txt +++ b/inc/3rdparty/site_config/standard/kumb.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //div[@id='centrediv']/h1 | 1 | title: //div[@id='centrediv']/h1 |
2 | 2 | ||
3 | author: substring-after(//div[@id='centrediv']/h3,'By: ') | 3 | author: substring-after(//div[@id='centrediv']/h3,'By: ') |
4 | 4 | ||
5 | date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ') | 5 | date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ') |
6 | 6 | ||
7 | body: //div[@class='KonaBody'] | 7 | body: //div[@class='KonaBody'] |
8 | 8 | ||
9 | convert_double_br_tags: yes | 9 | convert_double_br_tags: yes |
10 | test_url: http://www.kumb.com/story.php?id=126084 \ No newline at end of file | 10 | test_url: http://www.kumb.com/story.php?id=126084 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/kwerfeldein.de.txt b/inc/3rdparty/site_config/standard/kwerfeldein.de.txt index 879b4d6c..cf4d3b8c 100644..100755 --- a/inc/3rdparty/site_config/standard/kwerfeldein.de.txt +++ b/inc/3rdparty/site_config/standard/kwerfeldein.de.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | date: //span[@class='datum'] | 1 | date: //span[@class='datum'] |
2 | title: //div[@class='artikel']/h2 | 2 | title: //div[@class='artikel']/h2 |
3 | body: //div[@class='entry'] | 3 | body: //div[@class='entry'] |
4 | strip: //p[@class='tags'] | 4 | strip: //p[@class='tags'] |
5 | author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ') | 5 | author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ') |
6 | strip: //div[@class='authorinfo'] | 6 | strip: //div[@class='authorinfo'] |
7 | strip: //div[@class='authorpic'] | 7 | strip: //div[@class='authorpic'] |
8 | 8 | ||
9 | test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/ \ No newline at end of file | 9 | test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/landetsfria.se.txt b/inc/3rdparty/site_config/standard/landetsfria.se.txt new file mode 100755 index 00000000..e5317a5a --- /dev/null +++ b/inc/3rdparty/site_config/standard/landetsfria.se.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.landetsfria.se/artikel/112070 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt b/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt index a34e39dd..d25999d0 100644..100755 --- a/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt +++ b/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //h1[@class='headline'] | 1 | title: //h1[@class='headline'] |
2 | body: //div[@class='article'] | 2 | body: //div[@class='article'] |
3 | strip: //div[@class='article']//h3[contains(@class, 'section')] | 3 | strip: //div[@class='article']//h3[contains(@class, 'section')] |
4 | strip: //div[@class='article']//ul[contains(@class, 'article-actions')] | 4 | strip: //div[@class='article']//ul[contains(@class, 'article-actions')] |
5 | strip: //div[@id='syndication-upper'] | 5 | strip: //div[@id='syndication-upper'] |
6 | strip: //a[@id='syndication'] | 6 | strip: //a[@id='syndication'] |
7 | strip: //dl[@id='article-tags'] | 7 | strip: //dl[@id='article-tags'] |
8 | strip: //div[@id='article-like'] | 8 | strip: //div[@id='article-like'] |
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | single_page_link: //li[@class='single-page']/a | 11 | single_page_link: //li[@class='single-page']/a |
12 | 12 | ||
13 | test_url: http://www.laphamsquarterly.org/essays/balanced-diets.php \ No newline at end of file | 13 | test_url: http://www.laphamsquarterly.org/essays/balanced-diets.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/laprensagrafica.com.txt b/inc/3rdparty/site_config/standard/laprensagrafica.com.txt index e771f81f..82374c0b 100644..100755 --- a/inc/3rdparty/site_config/standard/laprensagrafica.com.txt +++ b/inc/3rdparty/site_config/standard/laprensagrafica.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | 2 | ||
3 | test_url: http://www.laprensagrafica.com/opinion/editorial/229252-reflexiones-sobre-la-educacion-que-necesitamos.html \ No newline at end of file | 3 | test_url: http://www.laprensagrafica.com/opinion/editorial/229252-reflexiones-sobre-la-educacion-que-necesitamos.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/laquadrature.net.txt b/inc/3rdparty/site_config/standard/laquadrature.net.txt index 5bad8e65..746bfca7 100644..100755 --- a/inc/3rdparty/site_config/standard/laquadrature.net.txt +++ b/inc/3rdparty/site_config/standard/laquadrature.net.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@id='content-content']//div[@class='content'] | 1 | body: //div[@id='content-content']//div[@class='content'] |
2 | title: //h1[@class='title'] | 2 | title: //h1[@class='title'] |
3 | date: substring-after(//*[@class='submitted'],'Submitted on') | 3 | date: substring-after(//*[@class='submitted'],'Submitted on') |
4 | tidy: no | 4 | tidy: no |
5 | strip: //div[@class='terms terms-inline'] | 5 | strip: //div[@class='terms terms-inline'] |
6 | strip: //div[@class='more'] | 6 | strip: //div[@class='more'] |
7 | strip: //div[@class='share-links'] | 7 | strip: //div[@class='share-links'] |
8 | strip: //table[@id='attachments'] | 8 | strip: //table[@id='attachments'] |
9 | 9 | ||
10 | test_url: http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution \ No newline at end of file | 10 | test_url: http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt b/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt index 504dbea1..25e36543 100644..100755 --- a/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt +++ b/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | #meta data | 1 | #metadata |
2 | title:substring-after(title,'|') | 2 | title: substring-before(//title,' |') |
3 | 3 | author: //a[contains(@class,'person') and starts-with(@href, '/contributor')] | |
4 | author:substring-before( substring-after(//meta[@name = 'description']/@content, normalize-space(substring-after(//title,'|'))),' respond ') | 4 | |
5 | date://h5[@class = 'postDate'] | 5 | #text |
6 | 6 | body: //div[contains(@class, 'article_body')] | |
7 | #text | 7 | |
8 | body://div[@class = 'articleBody'] | 8 | #clean up |
9 | 9 | strip_id_or_class: recommended_section | |
10 | #clean up | 10 | |
11 | strip://center | 11 | test_url: http://lareviewofbooks.org/review/american-politics-redeembale-robert-gates-hillary-clinton-two-memoirs-washington-dc |
12 | test_url: http://lareviewofbooks.org/post/14066007115/literary-transactions-and-their-vicissitudes \ No newline at end of file | 12 | test_url: http://lareviewofbooks.org/interview/souvenirs-future |
diff --git a/inc/3rdparty/site_config/standard/latimes.com.txt b/inc/3rdparty/site_config/standard/latimes.com.txt index 0d6ac851..b2db37bf 100644..100755 --- a/inc/3rdparty/site_config/standard/latimes.com.txt +++ b/inc/3rdparty/site_config/standard/latimes.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | strip: //div[@id="tugs_story_display"] | 1 | strip: //div[@id="tugs_story_display"] |
2 | strip: //div[@id="search_overlay"] | 2 | strip: //div[@id="search_overlay"] |
3 | strip: //div[@id="adv_search"] | 3 | strip: //div[@id="adv_search"] |
4 | body: //div[@class='story'] | 4 | body: //div[@class='story'] |
5 | tidy: no | 5 | tidy: no |
6 | convert_double_br_tags: yes | 6 | convert_double_br_tags: yes |
7 | single_page_link: //a[contains(@href, ',print.')] | 7 | single_page_link: //a[contains(@href, ',print.')] |
8 | strip: //p[starts-with(., 'latimes.com')] | 8 | strip: //p[starts-with(., 'latimes.com')] |
9 | strip: //h1[starts-with(., 'latimes.com')] | 9 | strip: //h1[starts-with(., 'latimes.com')] |
10 | strip_id_or_class: cubead | 10 | strip_id_or_class: cubead |
11 | test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story \ No newline at end of file | 11 | test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/laughingsquid.com.txt b/inc/3rdparty/site_config/standard/laughingsquid.com.txt index 1814988a..ab2f834f 100644..100755 --- a/inc/3rdparty/site_config/standard/laughingsquid.com.txt +++ b/inc/3rdparty/site_config/standard/laughingsquid.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h1[@class='entry-title'] | 1 | title: //h1[@class='entry-title'] |
2 | body: //div[@class='entry-content'] | 2 | body: //div[@class='entry-content'] |
3 | test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/ \ No newline at end of file | 3 | test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/leancrew.com.txt b/inc/3rdparty/site_config/standard/leancrew.com.txt index 0a4c84ba..e78cf7e6 100644..100755 --- a/inc/3rdparty/site_config/standard/leancrew.com.txt +++ b/inc/3rdparty/site_config/standard/leancrew.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id="content"]/h1[1] | 1 | title: //div[@id="content"]/h1[1] |
2 | date: substring-before(//p[@class="postdate"], ' at ') | 2 | date: substring-before(//p[@class="postdate"], ' at ') |
3 | author: ("Dr. Drang") | 3 | author: ("Dr. Drang") |
4 | 4 | ||
5 | strip: //div[@id="content"]/h1[1] | 5 | strip: //div[@id="content"]/h1[1] |
6 | strip: //p[@class="postdate"] | 6 | strip: //p[@class="postdate"] |
7 | strip: //h2[@id="respond"] | 7 | strip: //h2[@id="respond"] |
8 | strip: //blockquote[@class="bbpTweet"]/p/span/a/img | 8 | strip: //blockquote[@class="bbpTweet"]/p/span/a/img |
9 | test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/ \ No newline at end of file | 9 | test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lefigaro.fr.txt b/inc/3rdparty/site_config/standard/lefigaro.fr.txt index f5494b96..e720e377 100644..100755 --- a/inc/3rdparty/site_config/standard/lefigaro.fr.txt +++ b/inc/3rdparty/site_config/standard/lefigaro.fr.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //meta[@name='title']/@content | 1 | title: //meta[@name='title']/@content |
2 | author: //span[@class='sign']//a[@class='journaliste'] | 2 | author: //span[@class='sign']//a[@class='journaliste'] |
3 | author: //meta[@name='author']/@content | 3 | author: //meta[@name='author']/@content |
4 | body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte'] | 4 | body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte'] |
5 | date: //time[@pubdate]/@datetime | 5 | date: //time[@pubdate]/@datetime |
6 | prune: no | 6 | prune: no |
7 | test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php | 7 | test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php |
8 | test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php \ No newline at end of file | 8 | test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lemonde.fr.txt b/inc/3rdparty/site_config/standard/lemonde.fr.txt index eb205275..097999b6 100644..100755 --- a/inc/3rdparty/site_config/standard/lemonde.fr.txt +++ b/inc/3rdparty/site_config/standard/lemonde.fr.txt | |||
@@ -1,13 +1,18 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | # they have a single component containing both author and date | 3 | # We can have multiple authors |
4 | #author: //p[@class='source'] | 4 | author: //a[@class='auteur'] |
5 | #date: //p[@class='source'] | 5 | |
6 | 6 | # Last edition date (if any) | |
7 | body: //div[@class='contenu_article'] | 7 | date: //time[@itemprop='dateModified']/@datetime |
8 | #Shoot the insane "conjugaison.lemonde.fr" links : | 8 | # Publication date |
9 | strip: //a[contains(@class, 'listLink')] | 9 | date: //time[@itemprop='datePublished']/@datetime |
10 | 10 | ||
11 | prune: no | 11 | |
12 | 12 | body: //div[@id='articleBody'] | |
13 | test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html \ No newline at end of file | 13 | #Shoot the insane "conjugaison.lemonde.fr" links : |
14 | #strip: //a[contains(@class, 'conjug')] | ||
15 | |||
16 | prune: no | ||
17 | |||
18 | test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html | ||
diff --git a/inc/3rdparty/site_config/standard/lesnumeriques.com.txt b/inc/3rdparty/site_config/standard/lesnumeriques.com.txt index 9b57f726..51e025ae 100644..100755 --- a/inc/3rdparty/site_config/standard/lesnumeriques.com.txt +++ b/inc/3rdparty/site_config/standard/lesnumeriques.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h1/following::span[@class='fn'] | 1 | title: //h1/following::span[@class='fn'] |
2 | # Author: should stop parsing until <br> reached, but I don't know how to do this. | 2 | # Author: should stop parsing until <br> reached, but I don't know how to do this. |
3 | author: //following::div[@class='PDate2'] | 3 | author: //following::div[@class='PDate2'] |
4 | date: //following::div[@class='PDate2']/strong | 4 | date: //following::div[@class='PDate2']/strong |
5 | 5 | ||
6 | body: //div[@class='ArTexte'] | 6 | body: //div[@class='ArTexte'] |
7 | body: //div[@id='prod_txt_b'] | 7 | body: //div[@id='prod_txt_b'] |
8 | body: //div[@class='ArPhotoP'] | 8 | body: //div[@class='ArPhotoP'] |
9 | test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html \ No newline at end of file | 9 | test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/letemps.ch.txt b/inc/3rdparty/site_config/standard/letemps.ch.txt index c4bee7ec..49b019f9 100644..100755 --- a/inc/3rdparty/site_config/standard/letemps.ch.txt +++ b/inc/3rdparty/site_config/standard/letemps.ch.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | strip_image_src: logo.gif | 2 | strip_image_src: logo.gif |
3 | test_url: http://www.letemps.ch/Facet/print/Uuid/7c9f912c-07c9-11e0-9b50-4d96c9eca37f \ No newline at end of file | 3 | test_url: http://www.letemps.ch/Facet/print/Uuid/7c9f912c-07c9-11e0-9b50-4d96c9eca37f \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/libcom.org.txt b/inc/3rdparty/site_config/standard/libcom.org.txt new file mode 100755 index 00000000..d1404d10 --- /dev/null +++ b/inc/3rdparty/site_config/standard/libcom.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | date: //span[contains(@class, 'page-date')] | ||
2 | body: //div[@id='node-page'] | ||
3 | strip_id_or_class: book-navigation | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://libcom.org/library/what-was-the-ussr-aufheben-1 | ||
7 | test_url: http://libcom.org/library-latest/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lifeandculture.fr.txt b/inc/3rdparty/site_config/standard/lifeandculture.fr.txt index c3888aa8..0e1dceb1 100644..100755 --- a/inc/3rdparty/site_config/standard/lifeandculture.fr.txt +++ b/inc/3rdparty/site_config/standard/lifeandculture.fr.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h2[@class="entry-title"] | 1 | title: //h2[@class="entry-title"] |
2 | body: //div[@class="entry-content"] | 2 | body: //div[@class="entry-content"] |
3 | test_url: http://www.lifeandculture.fr/digital/facebook-and-the-epiphanator-an-end-to-endings/ \ No newline at end of file | 3 | test_url: http://www.lifeandculture.fr/digital/facebook-and-the-epiphanator-an-end-to-endings/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lifehacker.com.txt b/inc/3rdparty/site_config/standard/lifehacker.com.txt index 32ade14a..ec97f06c 100644..100755 --- a/inc/3rdparty/site_config/standard/lifehacker.com.txt +++ b/inc/3rdparty/site_config/standard/lifehacker.com.txt | |||
@@ -1,42 +1,47 @@ | |||
1 | # Adds author text: Gawker sites commonly show as "Author: View Profile" | 1 | # Adds author text: Gawker sites commonly show as "Author: View Profile" |
2 | author://a[@class="plus-icon modfont"] | 2 | author://a[@class="plus-icon modfont"] |
3 | 3 | ||
4 | # Add date and time | 4 | # Add date and time |
5 | date: //span[@class="date"] | 5 | date: //span[@class="date"] |
6 | 6 | ||
7 | # Remove date and time from article text | 7 | body: //div[contains(@class, 'marquee-asset-wrapper') or contains(@class, 'post-content')] |
8 | strip: //span[@class="date"] | 8 | |
9 | 9 | # Remove date and time from article text | |
10 | # Remove login/comment text | 10 | strip: //span[@class="date"] |
11 | strip: //*[(@class="presence_control_external smalltype")] | 11 | |
12 | 12 | # Remove login/comment text | |
13 | strip: //div[@class="nodebyline modfont"] | 13 | strip: //*[(@class="presence_control_external smalltype")] |
14 | 14 | ||
15 | # Remove right sidebar | 15 | strip: //div[@class="nodebyline modfont"] |
16 | strip: //div[@id="rightwrapper"] | 16 | |
17 | 17 | # Remove right sidebar | |
18 | # Remove print header | 18 | strip: //div[@id="rightwrapper"] |
19 | strip: //div[@id='printhead']/h1 | 19 | |
20 | 20 | # Remove print header | |
21 | # Remove 'content is restricted' | 21 | strip: //div[@id='printhead']/h1 |
22 | strip: //div[@id='agegate_IDHERE'] | 22 | |
23 | 23 | # Remove 'content is restricted' | |
24 | # Remove follow text | 24 | strip: //div[@id='agegate_IDHERE'] |
25 | strip: //*[(@class="permalink_ads")] | 25 | |
26 | 26 | # Remove follow text | |
27 | # Remove view/comment count | 27 | strip: //*[(@class="permalink_ads")] |
28 | strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line'] | 28 | |
29 | 29 | strip_id_or_class: inset_groups | |
30 | # Remove contact text | 30 | |
31 | strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo'] | 31 | # Remove view/comment count |
32 | 32 | strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line'] | |
33 | # Remove medium duplicates of the article image | 33 | |
34 | strip_image_src: medium.jpg | 34 | # Remove contact text |
35 | 35 | strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo'] | |
36 | # Remove "arrow" class at bottom of page | 36 | |
37 | strip: //p[@class="arrow"] | 37 | # Remove medium duplicates of the article image |
38 | 38 | strip_image_src: medium.jpg | |
39 | # Remove "track" image from article body | 39 | |
40 | strip: //img[@alt="track"] | 40 | # Remove "arrow" class at bottom of page |
41 | test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos | 41 | strip: //p[@class="arrow"] |
42 | test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse \ No newline at end of file | 42 | |
43 | # Remove "track" image from article body | ||
44 | strip: //img[@alt="track"] | ||
45 | test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos | ||
46 | test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse | ||
47 | test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lifestyle.inquirer.net.txt b/inc/3rdparty/site_config/standard/lifestyle.inquirer.net.txt new file mode 100755 index 00000000..25d544ae --- /dev/null +++ b/inc/3rdparty/site_config/standard/lifestyle.inquirer.net.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@class='singlePageTitle'] | ||
2 | |||
3 | strip: //p[contains(text(), 'Follow Us')] | ||
4 | strip: //p/strong[contains(text(), 'Recent Stories:')] | ||
5 | strip: //div[@id="sharefeature"] | ||
6 | |||
7 | test_url: http://lifestyle.inquirer.net/100223/dusting-your-ceiling-fan | ||
diff --git a/inc/3rdparty/site_config/standard/lifeweek.com.cn.txt b/inc/3rdparty/site_config/standard/lifeweek.com.cn.txt new file mode 100755 index 00000000..e09f6692 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lifeweek.com.cn.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.lifeweek.com.cn/2012/1211/39439.shtml | ||
3 | # http://www.lifeweek.com.cn/2013/0308/40213.shtml | ||
4 | |||
5 | title:substring-before(//h1, '(') | ||
6 | title://h1 | ||
7 | date://ul[@class='authorbox']/li | ||
8 | author: substring-after(//ul[@class='authorbox']/li/following-sibling::li, '作者:') | ||
9 | |||
10 | next_page_link: //div[@class='pageturn_list']/a[@class='pagedown'] | ||
11 | body: //div[@class='original '] | ||
12 | |||
13 | strip://h1 | ||
14 | strip://ul[@class='authorbox'] | ||
15 | strip://span[@class='app_p'] | ||
16 | strip://div[@style='text-align:right;'] | ||
17 | strip://div[@class='pageturn_list'] | ||
18 | strip://div[@class='lifespeaks'] | ||
19 | strip://div[@class='vright fr'] | ||
20 | strip://div[@class='copyrt mg20'] | ||
21 | strip://div[@class='keyabout mg20'] | ||
22 | strip://ul[@class='readabout mg20'] | ||
23 | test_url: http://www.lifeweek.com.cn/2013/0308/40213.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/linkedin.com.txt b/inc/3rdparty/site_config/standard/linkedin.com.txt index 37e83cf6..37e83cf6 100644..100755 --- a/inc/3rdparty/site_config/standard/linkedin.com.txt +++ b/inc/3rdparty/site_config/standard/linkedin.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/livescience.com.txt b/inc/3rdparty/site_config/standard/livescience.com.txt new file mode 100755 index 00000000..5275d34a --- /dev/null +++ b/inc/3rdparty/site_config/standard/livescience.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | title: //div[@class="album_title"]//h1 | ||
2 | author: substring-before(//div[@class='by_line'], ',') | ||
3 | date: substring-after(substring-before(//div[@class="album_time"], ' Time'), 'Date: ') | ||
4 | body: //div[@class="about_text"] | ||
5 | |||
6 | strip: //div[@class='large_popper'] | ||
7 | strip: //span[contains(@id, 'mag_glass')] | ||
8 | strip: //span[contains(@class, 'img_overlay')] | ||
9 | strip: //td//span | ||
10 | strip: //div[@class="center_adsense"] | ||
11 | strip: //div[@class="article_info"]//div[@class='asset_section'] | ||
12 | strip: //div[@class="article_additional"] | ||
13 | strip: //div[contains(@style, 'overflow:hidden')] | ||
14 | strip: //div[@class="aa_text"] | ||
15 | strip: //div[@id='nointelliTXT'] | ||
16 | |||
17 | prune: no | ||
18 | autodetect_on_failure: no | ||
19 | |||
20 | test_url: http://www.livescience.com/34569-why-flowers-close-at-night-nyctinasty.html | ||
diff --git a/inc/3rdparty/site_config/standard/longform.org.txt b/inc/3rdparty/site_config/standard/longform.org.txt index 48d5e1a7..1310ec0d 100644..100755 --- a/inc/3rdparty/site_config/standard/longform.org.txt +++ b/inc/3rdparty/site_config/standard/longform.org.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //div[@class="post"]/div[@class="title"]/a | 1 | single_page_link: //div[@class="post"]/div[@class="title"]/a |
2 | 2 | ||
3 | test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/ \ No newline at end of file | 3 | test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/loopinsight.com.txt b/inc/3rdparty/site_config/standard/loopinsight.com.txt index 08ad90c3..730af947 100644..100755 --- a/inc/3rdparty/site_config/standard/loopinsight.com.txt +++ b/inc/3rdparty/site_config/standard/loopinsight.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@class='container_16']//div[@class='grid_11'] | 1 | body: //div[@class='container_16']//div[@class='grid_11'] |
2 | strip: //h2[@class='mast'] | 2 | strip: //h2[@class='mast'] |
3 | strip: //div[@class='container_16']//div[@class='grid_11']/h1 | 3 | strip: //div[@class='container_16']//div[@class='grid_11']/h1 |
4 | strip: //div[@class='container_16']//div[@class='grid_11']/p[1] | 4 | strip: //div[@class='container_16']//div[@class='grid_11']/p[1] |
5 | strip: //div[@class='container_16']//div[@class='grid_11']/div | 5 | strip: //div[@class='container_16']//div[@class='grid_11']/div |
6 | author: //a[starts-with(@title, 'Posts by')] | 6 | author: //a[starts-with(@title, 'Posts by')] |
7 | date: substring-before(substring-after(//time, 'Posted on '), ' at') | 7 | date: substring-before(substring-after(//time, 'Posted on '), ' at') |
8 | test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/ | 8 | test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/ |
9 | test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/ \ No newline at end of file | 9 | test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lostgarden.com.txt b/inc/3rdparty/site_config/standard/lostgarden.com.txt index a823e649..d7eb0fa0 100644..100755 --- a/inc/3rdparty/site_config/standard/lostgarden.com.txt +++ b/inc/3rdparty/site_config/standard/lostgarden.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | prune: no | 1 | prune: no |
2 | convert_double_br_tags: yes | 2 | convert_double_br_tags: yes |
3 | test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html \ No newline at end of file | 3 | test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/lovefm.com.txt b/inc/3rdparty/site_config/standard/lovefm.com.txt new file mode 100755 index 00000000..20d26c56 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lovefm.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //*[@id='title'] | ||
2 | date: //*[@id='date'] | ||
3 | body: //*[@id='desc'] | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://www.lovefm.com/local_news.php?item=2176 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lovetv.com.bz.txt b/inc/3rdparty/site_config/standard/lovetv.com.bz.txt new file mode 100755 index 00000000..a71fccdd --- /dev/null +++ b/inc/3rdparty/site_config/standard/lovetv.com.bz.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[contains(@class, 'post')]//h1 | ||
2 | body: //div[contains(@class, 'post')] | ||
3 | strip: //hr | ||
4 | strip_id_or_class: post-meta | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.lovetv.com.bz/2013/06/28/recently-discovered-ancient-maya-wooden-canoe-paddle-to-be-handed-over-to-archaeology/ | ||
9 | test_url: http://www.lovetv.com.bz/feed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lrb.co.uk.txt b/inc/3rdparty/site_config/standard/lrb.co.uk.txt index ce5053d4..f1aacb7d 100644..100755 --- a/inc/3rdparty/site_config/standard/lrb.co.uk.txt +++ b/inc/3rdparty/site_config/standard/lrb.co.uk.txt | |||
@@ -1,8 +1,12 @@ | |||
1 | title: substring-before(//title, ' · LRB') | 1 | title: //div[contains(@class, "article-body")]/hgroup/h1 |
2 | 2 | body: //div[contains(@class, "article-body")] | |
3 | body: //div[@class="article-body indent"] | 3 | |
4 | 4 | date: substring-after(//p[@class="meta-info"]/a, '· ') | |
5 | date: substring-after(//p[@class="meta-info"]/a, '· ') | 5 | |
6 | 6 | author: //div[contains(@class, "article-body")]/hgroup/h2 | |
7 | prune: no | 7 | |
8 | test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened \ No newline at end of file | 8 | strip_id_or_class: print-hide |
9 | strip_id_or_class: books | ||
10 | |||
11 | test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened | ||
12 | test_url: http://www.lrb.co.uk/v36/n13/benjamin-kunkel/paupers-and-richlings | ||
diff --git a/inc/3rdparty/site_config/standard/luminous-landscape.com.txt b/inc/3rdparty/site_config/standard/luminous-landscape.com.txt index 92ccf3ba..b445f5eb 100644..100755 --- a/inc/3rdparty/site_config/standard/luminous-landscape.com.txt +++ b/inc/3rdparty/site_config/standard/luminous-landscape.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | 2 | ||
3 | body: // div[@id='content'] | 3 | body: // div[@id='content'] |
4 | 4 | ||
5 | strip: //div[@class='sidebar_wrapper'] | 5 | strip: //div[@class='sidebar_wrapper'] |
6 | test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml \ No newline at end of file | 6 | test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/luxuo.com.txt b/inc/3rdparty/site_config/standard/luxuo.com.txt new file mode 100755 index 00000000..a3d5cb17 --- /dev/null +++ b/inc/3rdparty/site_config/standard/luxuo.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class='post-content'] | ||
2 | prune: no | ||
3 | |||
4 | test_url: http://www.luxuo.com/watches/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt b/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt index a8af5438..d1ff0b43 100644..100755 --- a/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt +++ b/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@class="story-body"]/div[@class="story-inner"]/h1 | 1 | title: //div[@class="story-body"]/div[@class="story-inner"]/h1 |
2 | body: //div[@class="story-body"] | 2 | body: //div[@class="story-body"] |
3 | date: //p[@class='date']/strong | 3 | date: //p[@class='date']/strong |
4 | author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By') | 4 | author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By') |
5 | 5 | ||
6 | strip: //div[@class="story-inner"]/div[@class="byline"] | 6 | strip: //div[@class="story-inner"]/div[@class="byline"] |
7 | 7 | ||
8 | test_url: http://m.bbc.co.uk/news/science-environment-19144464 \ No newline at end of file | 8 | test_url: http://m.bbc.co.uk/news/science-environment-19144464 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/m.douban.com.txt b/inc/3rdparty/site_config/standard/m.douban.com.txt new file mode 100755 index 00000000..ce9a3167 --- /dev/null +++ b/inc/3rdparty/site_config/standard/m.douban.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://m.douban.com/note/240776310/?session=6ac86d1e | ||
3 | # http://m.douban.com/note/208270705/?session=e00ec732_3433229 | ||
4 | |||
5 | title: //h2 | ||
6 | author: //a[@class='founder'] | ||
7 | date: substring-after(//span[@class='info'],' | ') | ||
8 | body: //div[contains(@class,'entry item')] | ||
9 | |||
10 | strip://span[contains(@class,'info')] | ||
11 | |||
12 | convert_double_br_tags: yes | ||
13 | test_url: http://m.douban.com/note/240776310/?session=6ac86d1e \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/m.vanityfair.com.txt b/inc/3rdparty/site_config/standard/m.vanityfair.com.txt new file mode 100755 index 00000000..e47ce2ce --- /dev/null +++ b/inc/3rdparty/site_config/standard/m.vanityfair.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # Article Metadata | ||
2 | title: //h1 | ||
3 | author: //span[@class="name"]/a | ||
4 | date: //time | ||
5 | |||
6 | # Content Pruning | ||
7 | strip: //h5 | ||
8 | strip: //time | ||
9 | strip: //div[@class="byline"] | ||
10 | strip: //h2[@class="headline "] | ||
11 | test_url: http://m.vanityfair.com/politics/2012/10/michael-lewis-profile-barack-obama \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mac4ever.com.txt b/inc/3rdparty/site_config/standard/mac4ever.com.txt index 892b47f5..9999758b 100644..100755 --- a/inc/3rdparty/site_config/standard/mac4ever.com.txt +++ b/inc/3rdparty/site_config/standard/mac4ever.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: substring-after(//div[@class='author'],'Par ') | 1 | author: substring-after(//div[@class='author'],'Par ') |
2 | date: //div[@class='date'] | 2 | date: //div[@class='date'] |
3 | body: //div[@class='content'] | 3 | body: //div[@class='content'] |
4 | 4 | ||
5 | test_url: http://www.mac4ever.com/news/64182/icloud_les_prix_en_euros_et_en_chf/ \ No newline at end of file | 5 | test_url: http://www.mac4ever.com/news/64182/icloud_les_prix_en_euros_et_en_chf/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macdrifter.com.txt b/inc/3rdparty/site_config/standard/macdrifter.com.txt index fd1ede7d..e57bd640 100644..100755 --- a/inc/3rdparty/site_config/standard/macdrifter.com.txt +++ b/inc/3rdparty/site_config/standard/macdrifter.com.txt | |||
@@ -1,2 +1,2 @@ | |||
1 | title: substring-before(//title,' « Macdrifter') | 1 | title: substring-before(//title,' « Macdrifter') |
2 | test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/ \ No newline at end of file | 2 | test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macformat.techradar.com.txt b/inc/3rdparty/site_config/standard/macformat.techradar.com.txt index 109eae45..522efb49 100644..100755 --- a/inc/3rdparty/site_config/standard/macformat.techradar.com.txt +++ b/inc/3rdparty/site_config/standard/macformat.techradar.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | # Remove news feed | 1 | # Remove news feed |
2 | strip: //div[@id='news_feed_front'] | 2 | strip: //div[@id='news_feed_front'] |
3 | 3 | ||
4 | # Remove pull quote | 4 | # Remove pull quote |
5 | strip: //div[@class='field field-type-text field-field-pull-quote'] | 5 | strip: //div[@class='field field-type-text field-field-pull-quote'] |
6 | 6 | ||
7 | # Remove login | 7 | # Remove login |
8 | strip: //div[@class='right_bar_login'] | 8 | strip: //div[@class='right_bar_login'] |
9 | test_url: http://macformat.techradar.com/blog/solid-state-storage-bringing-parity-back-mac-29-10-10&article=89189666 \ No newline at end of file | 9 | test_url: http://macformat.techradar.com/blog/solid-state-storage-bringing-parity-back-mac-29-10-10&article=89189666 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macgeneration.com.txt b/inc/3rdparty/site_config/standard/macgeneration.com.txt index e6bbe28e..739eff4e 100644..100755 --- a/inc/3rdparty/site_config/standard/macgeneration.com.txt +++ b/inc/3rdparty/site_config/standard/macgeneration.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le') | 1 | author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le') |
2 | date: substring-after(//div[@class='dateNews'],' le ') | 2 | date: substring-after(//div[@class='dateNews'],' le ') |
3 | body: //div[@class='singleNews zoneApple'] | 3 | body: //div[@class='singleNews zoneApple'] |
4 | 4 | ||
5 | test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox \ No newline at end of file | 5 | test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macmagazine.com.br.txt b/inc/3rdparty/site_config/standard/macmagazine.com.br.txt index 47ebfd79..da7df695 100644..100755 --- a/inc/3rdparty/site_config/standard/macmagazine.com.br.txt +++ b/inc/3rdparty/site_config/standard/macmagazine.com.br.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | # Remove sliders | 1 | # Remove sliders |
2 | strip: //*[(@class="slides_container")] | 2 | strip: //*[(@class="slides_container")] |
3 | strip: //div[(@id="slides_two")] | 3 | strip: //div[(@id="slides_two")] |
4 | 4 | ||
5 | # Remove tag cloud | 5 | # Remove tag cloud |
6 | strip: //span[(@class="secao")] | 6 | strip: //span[(@class="secao")] |
7 | 7 | ||
8 | # Fix date article | 8 | # Fix date article |
9 | # TODO | 9 | # TODO |
10 | 10 | ||
11 | # Remove other stuff | 11 | # Remove other stuff |
12 | strip: //div[(@id="idc-container")] | 12 | strip: //div[(@id="idc-container")] |
13 | strip: //div[(@id="idc-noscript")] | 13 | strip: //div[(@id="idc-noscript")] |
14 | strip: //div[(@class="linkwithin_div")] | 14 | strip: //div[(@class="linkwithin_div")] |
15 | strip: //div[(@class="navPosts")] | 15 | strip: //div[(@class="navPosts")] |
16 | strip: //div[(@id="lateral")] | 16 | strip: //div[(@id="lateral")] |
17 | strip: //div[(@id="autor")] | 17 | strip: //div[(@id="autor")] |
18 | strip: //div[(@id="rodape")] | 18 | strip: //div[(@id="rodape")] |
19 | strip: //div[(@id="post")]/h1 | 19 | strip: //div[(@id="post")]/h1 |
20 | strip: //div[(@id="post")]/div[(@id="boxInformacoes")] | 20 | strip: //div[(@id="post")]/div[(@id="boxInformacoes")] |
21 | test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/ \ No newline at end of file | 21 | test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macrumors.com.txt b/inc/3rdparty/site_config/standard/macrumors.com.txt index 76f999d3..83cfb4a6 100644..100755 --- a/inc/3rdparty/site_config/standard/macrumors.com.txt +++ b/inc/3rdparty/site_config/standard/macrumors.com.txt | |||
@@ -1,10 +1,12 @@ | |||
1 | author: substring-after(//div[@class='byline'], " by ") | 1 | author: substring-after(//div[@class='byline'], " by ") |
2 | date: substring-before(//div[@class='byline'], " by ") | 2 | date: substring-before(//div[@class='byline'], " by ") |
3 | 3 | ||
4 | # set body | 4 | # set body |
5 | body: //div[@class='content'] | 5 | body: //div[@class='content'] |
6 | 6 | strip_id_or_class: commentsContainer | |
7 | # set title | 7 | strip_id_or_class: linkback |
8 | title: //h3 | 8 | |
9 | # set title | ||
10 | title: //h3 | ||
9 | #strip: //div[@class='content']/h3 | 11 | #strip: //div[@class='content']/h3 |
10 | test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/ \ No newline at end of file | 12 | test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/ |
diff --git a/inc/3rdparty/site_config/standard/macstories.net.txt b/inc/3rdparty/site_config/standard/macstories.net.txt index 6e651ca0..639fdd19 100644..100755 --- a/inc/3rdparty/site_config/standard/macstories.net.txt +++ b/inc/3rdparty/site_config/standard/macstories.net.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | strip: //*[(@id = "featured")] | 1 | strip: //*[(@id = "featured")] |
2 | 2 | ||
3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') | 3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') |
4 | 4 | ||
5 | date: concat(//div[@class='month'],' ',//div[@class='day']) | 5 | date: concat(//div[@class='month'],' ',//div[@class='day']) |
6 | 6 | ||
7 | #macstories doesn't provide a year, but month/day is better than nothing | 7 | #macstories doesn't provide a year, but month/day is better than nothing |
8 | test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/ \ No newline at end of file | 8 | test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mactalk.com.au.txt b/inc/3rdparty/site_config/standard/mactalk.com.au.txt index e8d60522..9be865af 100644..100755 --- a/inc/3rdparty/site_config/standard/mactalk.com.au.txt +++ b/inc/3rdparty/site_config/standard/mactalk.com.au.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author://div[@class="article_username_container_full"] | 1 | author://div[@class="article_username_container_full"] |
2 | date://div[@class="article_username_container"] | 2 | date://div[@class="article_username_container"] |
3 | body://div[@class="article cms_clear restore postcontainer"] | 3 | body://div[@class="article cms_clear restore postcontainer"] |
4 | test_url: http://www.mactalk.com.au/content/chat-basil-shkara-developer-taptax-2452/ \ No newline at end of file | 4 | test_url: http://www.mactalk.com.au/content/chat-basil-shkara-developer-taptax-2452/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mactechnews.de.txt b/inc/3rdparty/site_config/standard/mactechnews.de.txt index c3fc0e44..5c03518a 100644..100755 --- a/inc/3rdparty/site_config/standard/mactechnews.de.txt +++ b/inc/3rdparty/site_config/standard/mactechnews.de.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: substring-after(substring-after(//title, '>'), '>') | 1 | title: substring-after(substring-after(//title, '>'), '>') |
2 | body: //div[@class='NewsArticleContent'] | 2 | body: //div[@class='NewsArticleContent'] |
3 | test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html \ No newline at end of file | 3 | test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/macworld.com.txt b/inc/3rdparty/site_config/standard/macworld.com.txt index 96175872..e7d97202 100644..100755 --- a/inc/3rdparty/site_config/standard/macworld.com.txt +++ b/inc/3rdparty/site_config/standard/macworld.com.txt | |||
@@ -1,24 +1,24 @@ | |||
1 | title: //article//h1 | 1 | title: //article//h1 |
2 | date: //meta[@name="date"]/@content | 2 | date: //meta[@name="date"]/@content |
3 | author: //div[@class="author-name" or @class="article-byline"]/a[1] | 3 | author: //div[@class="author-name" or @class="article-byline"]/a[1] |
4 | 4 | ||
5 | body: //section[@class="page"] | 5 | body: //section[@class="page"] |
6 | 6 | ||
7 | # remove 'From the Lab' and 'Recent posts' text | 7 | # remove 'From the Lab' and 'Recent posts' text |
8 | strip: //div[@class='blogLabel'] | 8 | strip: //div[@class='blogLabel'] |
9 | 9 | ||
10 | # remove byline and meta info | 10 | # remove byline and meta info |
11 | strip: //div[@class="article-meta"] | 11 | strip: //div[@class="article-meta"] |
12 | strip: //div[@class="author-info"] | 12 | strip: //div[@class="author-info"] |
13 | 13 | ||
14 | #strip tags and categories | 14 | #strip tags and categories |
15 | strip: //div[@class="department"] | 15 | strip: //div[@class="department"] |
16 | 16 | ||
17 | #strip product cap links | 17 | #strip product cap links |
18 | strip: //div[@class="cap-main"] | 18 | strip: //div[@class="cap-main"] |
19 | strip: //div[@id="compare-lede"] | 19 | strip: //div[@id="compare-lede"] |
20 | 20 | ||
21 | prune: no | 21 | prune: no |
22 | 22 | ||
23 | # copes less well with Review pages, seems fine for News | 23 | # copes less well with Review pages, seems fine for News |
24 | test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html \ No newline at end of file | 24 | test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mainichi.jp.txt b/inc/3rdparty/site_config/standard/mainichi.jp.txt index e701207f..414a2f53 100644..100755 --- a/inc/3rdparty/site_config/standard/mainichi.jp.txt +++ b/inc/3rdparty/site_config/standard/mainichi.jp.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='NewsArticle'] | 1 | body: //div[@class='NewsArticle'] |
2 | 2 | ||
3 | test_url: http://mainichi.jp/select/weathernews/20110311/news/20110520k0000e040062000c.html \ No newline at end of file | 3 | test_url: http://mainichi.jp/select/weathernews/20110311/news/20110520k0000e040062000c.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mainpost.de.txt b/inc/3rdparty/site_config/standard/mainpost.de.txt index a2d25d56..2136de3f 100644..100755 --- a/inc/3rdparty/site_config/standard/mainpost.de.txt +++ b/inc/3rdparty/site_config/standard/mainpost.de.txt | |||
@@ -1,28 +1,28 @@ | |||
1 | title: substring-before(//title, '|') | 1 | title: substring-before(//title, '|') |
2 | body: //*[@id='content-left'] | 2 | body: //*[@id='content-left'] |
3 | 3 | ||
4 | # Why is this not working here? | 4 | # Why is this not working here? |
5 | # body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail'] | 5 | # body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail'] |
6 | 6 | ||
7 | 7 | ||
8 | #Header | 8 | #Header |
9 | strip_id_or_class: 'subHead' | 9 | strip_id_or_class: 'subHead' |
10 | strip_id_or_class: 'fl_right' | 10 | strip_id_or_class: 'fl_right' |
11 | strip_id_or_class: 'infolink' | 11 | strip_id_or_class: 'infolink' |
12 | strip_id_or_class: 'content-head' | 12 | strip_id_or_class: 'content-head' |
13 | strip_id_or_class: 'tab' | 13 | strip_id_or_class: 'tab' |
14 | strip_id_or_class: 'tab-active' | 14 | strip_id_or_class: 'tab-active' |
15 | strip: //*[contains(@class,'trenner')] | 15 | strip: //*[contains(@class,'trenner')] |
16 | 16 | ||
17 | # Headline | 17 | # Headline |
18 | strip: //h1/* | 18 | strip: //h1/* |
19 | strip_id_or_class: 'font16' | 19 | strip_id_or_class: 'font16' |
20 | 20 | ||
21 | #Images | 21 | #Images |
22 | strip_id_or_class: 'leftimage' | 22 | strip_id_or_class: 'leftimage' |
23 | strip_id_or_class: 'rightimage' | 23 | strip_id_or_class: 'rightimage' |
24 | 24 | ||
25 | #Comments | 25 | #Comments |
26 | strip: //table | 26 | strip: //table |
27 | strip: //p/following-sibling::*[0] | 27 | strip: //p/following-sibling::*[0] |
28 | test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211 \ No newline at end of file | 28 | test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/makeuseof.com.txt b/inc/3rdparty/site_config/standard/makeuseof.com.txt index 6809afed..078e8d08 100644..100755 --- a/inc/3rdparty/site_config/standard/makeuseof.com.txt +++ b/inc/3rdparty/site_config/standard/makeuseof.com.txt | |||
@@ -1,3 +1,9 @@ | |||
1 | tidy: no | 1 | title: //h1[@class='entry-title'] |
2 | 2 | ||
3 | test_url: http://www.makeuseof.com/dir/kindle-it-web-pages-kindle-friendly/ \ No newline at end of file | 3 | body: //article//header//img | //article//section[@class='post'] |
4 | |||
5 | strip: //article//section[@class='post']/aside | ||
6 | strip: //article//section[@class='post']/footer | ||
7 | |||
8 | test_url: http://www.makeuseof.com/tag/cool-websites-and-tools-advanced-photo-editor-keep-your-kids-stuff-online-identify-60-languages/ | ||
9 | test_url: http://www.makeuseof.com/tag/what-do-you-think-of-our-new-look-makeuseof-poll/ | ||
diff --git a/inc/3rdparty/site_config/standard/manager.co.th.txt b/inc/3rdparty/site_config/standard/manager.co.th.txt new file mode 100755 index 00000000..cd6c5c01 --- /dev/null +++ b/inc/3rdparty/site_config/standard/manager.co.th.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | title: //td[@class="headline"] | ||
2 | author: //font[@color="#003366"] | ||
3 | date: //td[@class="date"] | ||
4 | |||
5 | strip: //td[@class="headline"] | ||
6 | strip: //font[@color="#003366"] | ||
7 | strip: //td[@class="date"] | ||
8 | |||
9 | strip: //img[@src="images/2009/logo_en.gif"] | ||
10 | |||
11 | body: //tbody[@class="body"] | ||
12 | convert_double_br_tags:yes | ||
13 | |||
14 | strip: //img[@src="/images/TabOver.gif"] | ||
15 | strip: //td[@width="160"] | ||
16 | strip: //img[@src="/images/TabUnder.gif"] | ||
17 | |||
18 | strip: //td[@class="small"] | ||
19 | strip: //td[@height="47"] | ||
20 | |||
21 | strip: //td[@valign="middle"] | ||
22 | strip: //td[@background="/images/menu_bottombg.gif"] | ||
23 | strip: //img[@src="/images/sc_footer_l.gif"] | ||
24 | strip: //img[@src="/images/sc_footer_m.gif"] | ||
25 | strip: //img[@src="/images/sc_footer_r.gif"] | ||
26 | test_url: http://www.manager.co.th/Entertainment/ViewNews.aspx?NewsID=9550000101979 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/marco.org.txt b/inc/3rdparty/site_config/standard/marco.org.txt index ef2e03d3..4bb24a62 100644..100755 --- a/inc/3rdparty/site_config/standard/marco.org.txt +++ b/inc/3rdparty/site_config/standard/marco.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | prune: no | 2 | prune: no |
3 | date: //article//time[@pubdate] | 3 | date: //article//time[@pubdate] |
4 | title: //article/header/h2 | 4 | title: //article/header/h2 |
5 | body: //article | 5 | body: //article |
6 | strip: //header | 6 | strip: //header |
7 | test_url: http://www.marco.org/2012/09/08/businessweek-gruber | 7 | test_url: http://www.marco.org/2012/09/08/businessweek-gruber |
8 | test_url: http://www.marco.org/2012/04/24/might-upgrade-someday \ No newline at end of file | 8 | test_url: http://www.marco.org/2012/04/24/might-upgrade-someday \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/marksdailyapple.com.txt b/inc/3rdparty/site_config/standard/marksdailyapple.com.txt index 0077f560..0077f560 100644..100755 --- a/inc/3rdparty/site_config/standard/marksdailyapple.com.txt +++ b/inc/3rdparty/site_config/standard/marksdailyapple.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/martinfowler.com.txt b/inc/3rdparty/site_config/standard/martinfowler.com.txt index 8e0e349f..4ff4a9c2 100644..100755 --- a/inc/3rdparty/site_config/standard/martinfowler.com.txt +++ b/inc/3rdparty/site_config/standard/martinfowler.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | date: //div[@id="main"]/p[@class="date"] | 1 | date: //div[@id="main"]/p[@class="date"] |
2 | author: string("Martin Fowler") | 2 | author: string("Martin Fowler") |
3 | body: //div[@id="main"] | 3 | body: //div[@id="main"] |
4 | strip_id_or_class: date | 4 | strip_id_or_class: date |
5 | strip_id_or_class: tags | 5 | strip_id_or_class: tags |
6 | strip_id_or_class: tagLabel | 6 | strip_id_or_class: tagLabel |
7 | strip: //div[@id="main"]/h1[1] | 7 | strip: //div[@id="main"]/h1[1] |
8 | test_url: http://martinfowler.com/bliki/DatabaseThaw.html \ No newline at end of file | 8 | test_url: http://martinfowler.com/bliki/DatabaseThaw.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mashable.com.txt b/inc/3rdparty/site_config/standard/mashable.com.txt index 2c5a14a6..b6efb6c5 100644..100755 --- a/inc/3rdparty/site_config/standard/mashable.com.txt +++ b/inc/3rdparty/site_config/standard/mashable.com.txt | |||
@@ -1,4 +1,11 @@ | |||
1 | title: //header[@class='entry-title']/h1 | 1 | title: //h1[@class='title'] |
2 | body: //div[@class='description'] | 2 | author: substring-after(//span[@class='author_name'], 'By ') |
3 | date: //time | ||
4 | |||
5 | body: //article | ||
3 | strip: //div[@class='ytm-gallery-box'] | 6 | strip: //div[@class='ytm-gallery-box'] |
4 | test_url: http://mashable.com/2011/12/05/india-wants-google-and-facebook-to-censor-user-content/ \ No newline at end of file | 7 | strip: //div[contains(@class, 'adsense')] |
8 | strip: //aside[contains(@class, 'social')] | ||
9 | strip_id_or_class: article-topics | ||
10 | |||
11 | test_url: http://mashable.com/2013/05/24/myspace-architects-rebuilding-a-brand/ | ||
diff --git a/inc/3rdparty/site_config/standard/matt.might.net.txt b/inc/3rdparty/site_config/standard/matt.might.net.txt new file mode 100755 index 00000000..30d585cf --- /dev/null +++ b/inc/3rdparty/site_config/standard/matt.might.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1 | ||
2 | author: string("Matt Might") | ||
3 | strip: //h1/following-sibling::div | ||
4 | |||
5 | test_url: http://matt.might.net/articles/oo-cesk/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mattcutts.com.txt b/inc/3rdparty/site_config/standard/mattcutts.com.txt index 76b1eac6..76b1eac6 100644..100755 --- a/inc/3rdparty/site_config/standard/mattcutts.com.txt +++ b/inc/3rdparty/site_config/standard/mattcutts.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/mbl.is.txt b/inc/3rdparty/site_config/standard/mbl.is.txt index fd26f091..fd26f091 100644..100755 --- a/inc/3rdparty/site_config/standard/mbl.is.txt +++ b/inc/3rdparty/site_config/standard/mbl.is.txt | |||
diff --git a/inc/3rdparty/site_config/standard/medialens.org.txt b/inc/3rdparty/site_config/standard/medialens.org.txt index 94f27b71..4c333aa1 100644..100755 --- a/inc/3rdparty/site_config/standard/medialens.org.txt +++ b/inc/3rdparty/site_config/standard/medialens.org.txt | |||
@@ -1,2 +1,4 @@ | |||
1 | strip: //div[contains(@class, 'article-tools')] | 1 | strip_id_or_class: article-tools |
2 | strip_id_or_class: pagenav | ||
3 | prune: no | ||
2 | test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html \ No newline at end of file | 4 | test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/medium.com.txt b/inc/3rdparty/site_config/standard/medium.com.txt new file mode 100755 index 00000000..acf7cc90 --- /dev/null +++ b/inc/3rdparty/site_config/standard/medium.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'post-content-inner')] | ||
2 | strip_id_or_class: follow-ups | ||
3 | strip_id_or_class: footer | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: https://medium.com/p/6844c0d7893b \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/megamp3.eu.txt b/inc/3rdparty/site_config/standard/megamp3.eu.txt new file mode 100755 index 00000000..1b6a1279 --- /dev/null +++ b/inc/3rdparty/site_config/standard/megamp3.eu.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h3[@class='episode_title'] | ||
2 | body: //ul[contains(@class, 'episode_imgdesc')]/li/descendant::* | ||
3 | prune: no | ||
4 | strip://*[contains(@class, 'plugin')] | ||
5 | strip://*[contains(@class, 'episode_keywords')] | ||
6 | |||
7 | test_url: http://www.megamp3.eu/?p=episode&name=2013-04-19_la_filiere_progressive_431.mp3 | ||
8 | test_url: http://www.megamp3.eu/feed.xml | ||
diff --git a/inc/3rdparty/site_config/standard/menshealth.com.txt b/inc/3rdparty/site_config/standard/menshealth.com.txt index e7e1e269..a1a46f63 100644..100755 --- a/inc/3rdparty/site_config/standard/menshealth.com.txt +++ b/inc/3rdparty/site_config/standard/menshealth.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | # need to find a way to eliminate <span> content for "related content" without eliminating important content | 1 | # need to find a way to eliminate <span> content for "related content" without eliminating important content |
2 | 2 | ||
3 | convert_double_br_tags: [yes] | 3 | convert_double_br_tags: [yes] |
4 | #body: //div[@id='leftside'] | 4 | #body: //div[@id='leftside'] |
5 | title: //h1 | 5 | title: //h1 |
6 | title: //h2 | 6 | title: //h2 |
7 | Author: substring-after(//h4, 'By ') | 7 | Author: substring-after(//h4, 'By ') |
8 | Author: substring-after(//h4, 'By: ') | 8 | Author: substring-after(//h4, 'By: ') |
9 | #Strip: //span | 9 | #Strip: //span |
10 | strip_id_or_class: morefromcat | 10 | strip_id_or_class: morefromcat |
11 | strip_id_or_class: mostpopular | 11 | strip_id_or_class: mostpopular |
12 | strip_id_or_class: articlepagination | 12 | strip_id_or_class: articlepagination |
13 | strip_id_or_class: toolbar | 13 | strip_id_or_class: toolbar |
14 | body: //div[@id='zmodcontent'] | 14 | body: //div[@id='zmodcontent'] |
15 | single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')] | 15 | single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')] |
16 | test_url: http://www.menshealth.com/mhlists/pursuit_of_happiness/index.php \ No newline at end of file | 16 | test_url: http://www.menshealth.com/mhlists/pursuit_of_happiness/index.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/metafilter.com.txt b/inc/3rdparty/site_config/standard/metafilter.com.txt new file mode 100755 index 00000000..a2f3ada9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/metafilter.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[contains(@class, 'copy') or contains(@class, 'comments')] | ||
2 | strip_id_or_class: related | ||
3 | strip: //a[. = 'Subscribe'] | ||
4 | strip: //h1/span[@class = 'smallcopy'] | ||
5 | strip: //a[@class = 'skip'] | ||
6 | strip: //div[@id = 'logo'] | ||
7 | strip: //div[contains(@class, 'comments') and contains(., 'You are not currently logged in')] | ||
8 | test_url: http://www.metafilter.com/128101/Probably-more-secure-than-the-Drafts-folder-on-a-shared-Gmail-account \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mforum.cari.com.my.txt b/inc/3rdparty/site_config/standard/mforum.cari.com.my.txt new file mode 100755 index 00000000..c295d734 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mforum.cari.com.my.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: (//td[starts-with(@id, 'postmessage_')])[1] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://mforum.cari.com.my/forum.php?mod=viewthread&tid=788033 | ||
6 | test_url: http://mforum.cari.com.my/forum.php?mod=rss&fid=265&auth=0 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mikeash.com.txt b/inc/3rdparty/site_config/standard/mikeash.com.txt index af8a7d30..abaa6a81 100644..100755 --- a/inc/3rdparty/site_config/standard/mikeash.com.txt +++ b/inc/3rdparty/site_config/standard/mikeash.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class="blogtitle"] | 1 | title: //div[@class="blogtitle"] |
2 | strip: //div[@class="blogtitle"] | 2 | strip: //div[@class="blogtitle"] |
3 | 3 | ||
4 | author: substring-after(//span[@class="blogheader"], 'Author: ') | 4 | author: substring-after(//span[@class="blogheader"], 'Author: ') |
5 | test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html \ No newline at end of file | 5 | test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mikeindustries.com.txt b/inc/3rdparty/site_config/standard/mikeindustries.com.txt index 3d488e13..fb4636cc 100644..100755 --- a/inc/3rdparty/site_config/standard/mikeindustries.com.txt +++ b/inc/3rdparty/site_config/standard/mikeindustries.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@class='post_content']/h2 | 1 | title: //div[@class='post_content']/h2 |
2 | date: //div[@class='dateline'] | 2 | date: //div[@class='dateline'] |
3 | body: //div[@class='entry'] | 3 | body: //div[@class='entry'] |
4 | 4 | ||
5 | strip: //div[@class='closer'] | 5 | strip: //div[@class='closer'] |
6 | strip: //div[@class='navigation'] | 6 | strip: //div[@class='navigation'] |
7 | strip: //div[@class='aux_pane'] | 7 | strip: //div[@class='aux_pane'] |
8 | strip: //div[@class='aux_aux_pane'] | 8 | strip: //div[@class='aux_aux_pane'] |
9 | test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another \ No newline at end of file | 9 | test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt b/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt index 7e43d63c..773a627c 100644..100755 --- a/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt +++ b/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //*[@class="article"]/h1 | 1 | title: //*[@class="article"]/h1 |
2 | date: //*[@class="article"]/div[@class="date"] | 2 | date: //*[@class="article"]/div[@class="date"] |
3 | 3 | ||
4 | # strip the title and date from the article text | 4 | # strip the title and date from the article text |
5 | strip: //*[@class="article"]/h1 | 5 | strip: //*[@class="article"]/h1 |
6 | strip: //*[@class="article"]/div[@class="date"] | 6 | strip: //*[@class="article"]/div[@class="date"] |
7 | 7 | ||
8 | # strip annoying <br> between metadata and article | 8 | # strip annoying <br> between metadata and article |
9 | strip: //*[@class="article"]/div[@class="date"]/following-sibling::br | 9 | strip: //*[@class="article"]/div[@class="date"]/following-sibling::br |
10 | test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/ \ No newline at end of file | 10 | test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/minnpost.com.txt b/inc/3rdparty/site_config/standard/minnpost.com.txt index 51a0630b..dc926a6f 100644..100755 --- a/inc/3rdparty/site_config/standard/minnpost.com.txt +++ b/inc/3rdparty/site_config/standard/minnpost.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //*[@id="content-header"]/h1 | 1 | title: //*[@id="content-header"]/h1 |
2 | author: //*[contains(@class, 'byline')]/a/text() | 2 | author: //*[contains(@class, 'byline')]/a/text() |
3 | date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|') | 3 | date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|') |
4 | body: //*[contains(@class, 'node-body')] | 4 | body: //*[contains(@class, 'node-body')] |
5 | test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court \ No newline at end of file | 5 | test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt b/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt index 4215a051..2033cf33 100644..100755 --- a/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt +++ b/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | # Remove extra links | 1 | # Remove extra links |
2 | strip: //*[@class='appended_html'] | 2 | strip: //*[@class='appended_html'] |
3 | test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html \ No newline at end of file | 3 | test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mises.org.txt b/inc/3rdparty/site_config/standard/mises.org.txt index ae542aa6..73c485e6 100644..100755 --- a/inc/3rdparty/site_config/standard/mises.org.txt +++ b/inc/3rdparty/site_config/standard/mises.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | strip_id_or_class: 'book-ad' | 1 | strip_id_or_class: 'book-ad' |
2 | strip_id_or_class: 'bigger pullquote' | 2 | strip_id_or_class: 'bigger pullquote' |
3 | strip_id_or_class: 'subscribe' | 3 | strip_id_or_class: 'subscribe' |
4 | strip_id_or_class: 'blog-link' | 4 | strip_id_or_class: 'blog-link' |
5 | test_url: http://mises.org/daily/4804 \ No newline at end of file | 5 | test_url: http://mises.org/daily/4804 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mlb.mlb.com.txt b/inc/3rdparty/site_config/standard/mlb.mlb.com.txt index 30e8aff2..765fab3f 100644..100755 --- a/inc/3rdparty/site_config/standard/mlb.mlb.com.txt +++ b/inc/3rdparty/site_config/standard/mlb.mlb.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //h1[@class='article-headline'] | 1 | title: //h1[@class='article-headline'] |
2 | date: //span[@class='timeStamp'] | 2 | date: //span[@class='timeStamp'] |
3 | author: substring-before(//p[@class='article-byline'], '/') | 3 | author: substring-before(//p[@class='article-byline'], '/') |
4 | body: //div[@id='article'] | 4 | body: //div[@id='article'] |
5 | #strip: //div[@class='inner'] | 5 | #strip: //div[@class='inner'] |
6 | strip: //div[@id='article_head'] | 6 | strip: //div[@id='article_head'] |
7 | strip: //p[@class='tagLine'] | 7 | strip: //p[@class='tagLine'] |
8 | strip: //div[@id='article_related_links'] | 8 | strip: //div[@id='article_related_links'] |
9 | strip: //div[@id='article_related_mlb'] | 9 | strip: //div[@id='article_related_mlb'] |
10 | strip: //span[@class='more'] | 10 | strip: //span[@class='more'] |
11 | strip: //div[@class='article_component'] | 11 | strip: //div[@class='article_component'] |
12 | strip: //span[@class='screen_reader'] | 12 | strip: //span[@class='screen_reader'] |
13 | strip: //ul[@class='columnists_blurb'] | 13 | strip: //ul[@class='columnists_blurb'] |
14 | test_url: http://mlb.mlb.com/news/article.jsp?ymd=20120403&content_id=27880830 \ No newline at end of file | 14 | test_url: http://mlb.mlb.com/news/article.jsp?ymd=20120403&content_id=27880830 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt b/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt index c4e3389e..8480e302 100644..100755 --- a/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt +++ b/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //h1[@id = 'stream_title'] | 1 | title: //h1[@id = 'stream_title'] |
2 | author: //p[@class = 'byline']/a | 2 | author: //p[@class = 'byline']/a |
3 | date: //span[@class = 'datetime'] | 3 | date: //span[@class = 'datetime'] |
4 | 4 | ||
5 | body: //div[@id = 'stream_container'] | 5 | body: //div[@id = 'stream_container'] |
6 | strip: //p[@class = 'byline'] | 6 | strip: //p[@class = 'byline'] |
7 | strip_id_or_class: stream_summary | 7 | strip_id_or_class: stream_summary |
8 | strip_id_or_class: social-spoken | 8 | strip_id_or_class: social-spoken |
9 | strip_id_or_class: datetime | 9 | strip_id_or_class: datetime |
10 | strip_id_or_class: author-mini-profile | 10 | strip_id_or_class: author-mini-profile |
11 | strip_id_or_class: social-tools | 11 | strip_id_or_class: social-tools |
12 | strip_id_or_class: entry-tags | 12 | strip_id_or_class: entry-tags |
13 | strip_id_or_class: fb-like-box | 13 | strip_id_or_class: fb-like-box |
14 | test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage \ No newline at end of file | 14 | test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mlssoccer.com.txt b/inc/3rdparty/site_config/standard/mlssoccer.com.txt index 41e15136..5d706f88 100644..100755 --- a/inc/3rdparty/site_config/standard/mlssoccer.com.txt +++ b/inc/3rdparty/site_config/standard/mlssoccer.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //*[@class="header_title"]/h1 | 1 | title: //*[@class="header_title"]/h1 |
2 | date: //*[@class="field-date"] | 2 | date: //*[@class="field-date"] |
3 | author: //*[@class="field-author"] | 3 | author: //*[@class="field-author"] |
4 | body: //div[contains(@class, 'content')] | 4 | body: //div[contains(@class, 'content')] |
5 | 5 | ||
6 | test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation \ No newline at end of file | 6 | test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mmo-champion.com.txt b/inc/3rdparty/site_config/standard/mmo-champion.com.txt index 918fae36..50d8a24f 100644..100755 --- a/inc/3rdparty/site_config/standard/mmo-champion.com.txt +++ b/inc/3rdparty/site_config/standard/mmo-champion.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id = 'article_content']/div[contains(@class,'article')] | 2 | body: //div[@id = 'article_content']/div[contains(@class,'article')] |
3 | author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')] | 3 | author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')] |
4 | date: //div[@class = 'article_username_container'] | 4 | date: //div[@class = 'article_username_container'] |
5 | test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot \ No newline at end of file | 5 | test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mnn.com.txt b/inc/3rdparty/site_config/standard/mnn.com.txt index ddfe6fa2..d3576df2 100644..100755 --- a/inc/3rdparty/site_config/standard/mnn.com.txt +++ b/inc/3rdparty/site_config/standard/mnn.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text() | 2 | author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text() |
3 | date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2] | 3 | date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2] |
4 | body: //div[@class="node"] | 4 | body: //div[@class="node"] |
5 | 5 | ||
6 | strip_id_or_class: vertical-social-bar | 6 | strip_id_or_class: vertical-social-bar |
7 | strip_id_or_class: blogs_paginator | 7 | strip_id_or_class: blogs_paginator |
8 | strip_id_or_class: horizontal-social-links | 8 | strip_id_or_class: horizontal-social-links |
9 | strip_id_or_class: servicelinksdiv | 9 | strip_id_or_class: servicelinksdiv |
10 | 10 | ||
11 | test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal \ No newline at end of file | 11 | test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mno.hu.txt b/inc/3rdparty/site_config/standard/mno.hu.txt index ba158953..8a3f9391 100644..100755 --- a/inc/3rdparty/site_config/standard/mno.hu.txt +++ b/inc/3rdparty/site_config/standard/mno.hu.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //title | 1 | title: //title |
2 | 2 | ||
3 | author: //div[@class="author"] | 3 | author: //div[@class="author"] |
4 | 4 | ||
5 | strip_id_or_class: 'header' | 5 | strip_id_or_class: 'header' |
6 | strip_id_or_class: 'cikk_ajanlo' | 6 | strip_id_or_class: 'cikk_ajanlo' |
7 | strip_id_or_class: 'buttons' | 7 | strip_id_or_class: 'buttons' |
8 | strip_id_or_class: 'related' | 8 | strip_id_or_class: 'related' |
9 | strip_id_or_class: 'adbox ad_cikk_kozepre' | 9 | strip_id_or_class: 'adbox ad_cikk_kozepre' |
10 | strip_id_or_class: 'cikk-cimkek' | 10 | strip_id_or_class: 'cikk-cimkek' |
11 | strip_id_or_class: 'cikk_ertekeles' | 11 | strip_id_or_class: 'cikk_ertekeles' |
12 | 12 | ||
13 | strip_comments: yes | 13 | strip_comments: yes |
14 | test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351 \ No newline at end of file | 14 | test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mobile.nytimes.com.txt b/inc/3rdparty/site_config/standard/mobile.nytimes.com.txt new file mode 100755 index 00000000..c60252ef --- /dev/null +++ b/inc/3rdparty/site_config/standard/mobile.nytimes.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h1[contains(@class, 'headline')] | ||
2 | body: //article[contains(@class, 'full-art')] | ||
3 | strip_id_or_class: image-credit | ||
4 | test_url: http://mobile.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mobile.slate.com.txt b/inc/3rdparty/site_config/standard/mobile.slate.com.txt index d5d81034..6ffcd18f 100644..100755 --- a/inc/3rdparty/site_config/standard/mobile.slate.com.txt +++ b/inc/3rdparty/site_config/standard/mobile.slate.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h2[@class="article_title"] | 1 | title: //h2[@class="article_title"] |
2 | strip: //a[@class="houseAdLink"] | 2 | strip: //a[@class="houseAdLink"] |
3 | strip: //h1 | 3 | strip: //h1 |
4 | strip: //div[@class="more_articles"] | 4 | strip: //div[@class="more_articles"] |
5 | test_url: http://mobile.slate.com/rss.jsp?rssid=411&item=http%3a%2f%2fwww.slate.com%2fdefault.aspx%3fdisplaymode%3d201%26id%3d2293749%26device%3drss \ No newline at end of file | 5 | test_url: http://mobile.slate.com/rss.jsp?rssid=411&item=http%3a%2f%2fwww.slate.com%2fdefault.aspx%3fdisplaymode%3d201%26id%3d2293749%26device%3drss \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt b/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt index a1cc5317..82da4aec 100644..100755 --- a/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt +++ b/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class='post uncustomized-post-template'] | 1 | body: //div[@class='post uncustomized-post-template'] |
2 | 2 | ||
3 | # remove duplicate of post title, which is a link | 3 | # remove duplicate of post title, which is a link |
4 | strip: //h3[@class='post-title'] | 4 | strip: //h3[@class='post-title'] |
5 | 5 | ||
6 | # remove permalink and timestamp, which isn't useful as it's a time with no date | 6 | # remove permalink and timestamp, which isn't useful as it's a time with no date |
7 | strip: //span[@class='post-timestamp'] | 7 | strip: //span[@class='post-timestamp'] |
8 | 8 | ||
9 | # remove labels (tags) | 9 | # remove labels (tags) |
10 | strip: //span[@class='post-labels'] | 10 | strip: //span[@class='post-labels'] |
11 | test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html \ No newline at end of file | 11 | test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/modernghana.com.txt b/inc/3rdparty/site_config/standard/modernghana.com.txt index 4c93d0cf..306ef8d9 100644..100755 --- a/inc/3rdparty/site_config/standard/modernghana.com.txt +++ b/inc/3rdparty/site_config/standard/modernghana.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | author: //meta[@name="author"]/@content | 2 | author: //meta[@name="author"]/@content |
3 | date: //span[@class='date1'] | 3 | date: //span[@class='date1'] |
4 | body: //div[@id='newsimage'] | //div[@id='bodytext'] | 4 | body: //div[@id='newsimage'] | //div[@id='bodytext'] |
5 | tidy: no | 5 | tidy: no |
6 | prune: no | 6 | prune: no |
7 | 7 | ||
8 | test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html \ No newline at end of file | 8 | test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/money.cnn.com.txt b/inc/3rdparty/site_config/standard/money.cnn.com.txt index a0d1628a..d5e03d20 100644..100755 --- a/inc/3rdparty/site_config/standard/money.cnn.com.txt +++ b/inc/3rdparty/site_config/standard/money.cnn.com.txt | |||
@@ -1,24 +1,24 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | title: //h1[@class='storyheadline'] | 2 | title: //h1[@class='storyheadline'] |
3 | author: //meta[@name="AUTHOR"]/@content | 3 | author: //meta[@name="AUTHOR"]/@content |
4 | date: //span[@class='cnnDateStamp'] | 4 | date: //span[@class='cnnDateStamp'] |
5 | date: //meta[@name="DATE"]/@content | 5 | date: //meta[@name="DATE"]/@content |
6 | body: //div[@id='storytext' or @class='storytext'] | 6 | body: //div[@id='storytext' or @class='storytext'] |
7 | 7 | ||
8 | strip_id_or_class: ie_column | 8 | strip_id_or_class: ie_column |
9 | strip_id_or_class: sharewidgets | 9 | strip_id_or_class: sharewidgets |
10 | strip_image_src: bug.gif | 10 | strip_image_src: bug.gif |
11 | 11 | ||
12 | strip: //div[@class="hed_side"] | 12 | strip: //div[@class="hed_side"] |
13 | strip: //span[@class="byline"] | 13 | strip: //span[@class="byline"] |
14 | strip: //a[@class="soc-twtname"] | 14 | strip: //a[@class="soc-twtname"] |
15 | strip: //span[@class="cnnDateStamp"] | 15 | strip: //span[@class="cnnDateStamp"] |
16 | strip: //div[@class="storytimestamp"] | 16 | strip: //div[@class="storytimestamp"] |
17 | strip: //div[@class="cnnCol_side"] | 17 | strip: //div[@class="cnnCol_side"] |
18 | 18 | ||
19 | prune: no | 19 | prune: no |
20 | tidy: no | 20 | tidy: no |
21 | 21 | ||
22 | test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 | 22 | test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 |
23 | test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm | 23 | test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm |
24 | test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm \ No newline at end of file | 24 | test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/monkeyzen.com.txt b/inc/3rdparty/site_config/standard/monkeyzen.com.txt index f779c38e..f779c38e 100644..100755 --- a/inc/3rdparty/site_config/standard/monkeyzen.com.txt +++ b/inc/3rdparty/site_config/standard/monkeyzen.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/moonsault.de.txt b/inc/3rdparty/site_config/standard/moonsault.de.txt index 061a8d5c..55026eeb 100644..100755 --- a/inc/3rdparty/site_config/standard/moonsault.de.txt +++ b/inc/3rdparty/site_config/standard/moonsault.de.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | strip_image_src: menu | 1 | strip_image_src: menu |
2 | strip_image_src: templates | 2 | strip_image_src: templates |
3 | strip: //div/a | 3 | strip: //div/a |
4 | strip: //div/b | 4 | strip: //div/b |
5 | strip: //div/strong | 5 | strip: //div/strong |
6 | strip: //td[@width='30%'] | 6 | strip: //td[@width='30%'] |
7 | strip: //br[1] | 7 | strip: //br[1] |
8 | strip: //br[2] | 8 | strip: //br[2] |
9 | strip: //br[3] | 9 | strip: //br[3] |
10 | strip: //br[4] | 10 | strip: //br[4] |
11 | strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home'] | 11 | strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home'] |
12 | strip_id_or_class: cse-branding-right | 12 | strip_id_or_class: cse-branding-right |
13 | test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous \ No newline at end of file | 13 | test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt b/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt index a7e59c30..780cca4f 100644..100755 --- a/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt +++ b/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1[@class='print-title'] | 1 | title: //h1[@class='print-title'] |
2 | body: //div[@class='print-submitted' or @class='print-created' or @class='print-content'] | 2 | body: //div[@class='print-submitted' or @class='print-created' or @class='print-content'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | single_page_link: //li[@class='print']/a | 5 | single_page_link: //li[@class='print']/a |
6 | 6 | ||
7 | test_url: http://moreintelligentlife.com/content/places/paul-markillie/they-trash-cars-dont-they \ No newline at end of file | 7 | test_url: http://moreintelligentlife.com/content/places/paul-markillie/they-trash-cars-dont-they \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/motherboard.vice.com.txt b/inc/3rdparty/site_config/standard/motherboard.vice.com.txt index 6faf1c9a..c6312c0e 100644..100755 --- a/inc/3rdparty/site_config/standard/motherboard.vice.com.txt +++ b/inc/3rdparty/site_config/standard/motherboard.vice.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: //span[@class="author"]/a | 1 | author: //span[@class="author"]/a |
2 | date: //span[@class="date"] | 2 | date: //span[@class="date"] |
3 | body: //div[@class="story-content"] | 3 | body: //div[@class="story-content"] |
4 | strip: //aside | 4 | strip: //aside |
5 | test_url: http://motherboard.vice.com/blog/you-can-carry-a-copy-of-the-pirate-bay-in-your-pocket \ No newline at end of file | 5 | test_url: http://motherboard.vice.com/blog/you-can-carry-a-copy-of-the-pirate-bay-in-your-pocket \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/mothering.com.txt b/inc/3rdparty/site_config/standard/mothering.com.txt index a9d9195f..a34adff7 100644..100755 --- a/inc/3rdparty/site_config/standard/mothering.com.txt +++ b/inc/3rdparty/site_config/standard/mothering.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h2[contains(@class,'post_headline')] | 1 | title: //h2[contains(@class,'post_headline')] |
2 | body: //div[@class='entry'] | 2 | body: //div[@class='entry'] |
3 | convert_double_br_tags: yes | 3 | convert_double_br_tags: yes |
4 | strip_image_src: _selected.gif | 4 | strip_image_src: _selected.gif |
5 | strip_id_or_class: addthis_ | 5 | strip_id_or_class: addthis_ |
6 | strip: //a[contains(@href,'feedburner.com')] | 6 | strip: //a[contains(@href,'feedburner.com')] |
7 | test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down \ No newline at end of file | 7 | test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/motherjones.com.txt b/inc/3rdparty/site_config/standard/motherjones.com.txt index d58c7d2c..851feb7e 100644..100755 --- a/inc/3rdparty/site_config/standard/motherjones.com.txt +++ b/inc/3rdparty/site_config/standard/motherjones.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id = 'content-area'] | 2 | body: //div[@id = 'content-area'] |
3 | next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')] | 3 | next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')] |
4 | tidy: no | 4 | tidy: no |
5 | author: //p[contains(@class, 'byline')]/a | 5 | author: //p[contains(@class, 'byline')]/a |
6 | 6 | ||
7 | strip_id_or_class: node-header | 7 | strip_id_or_class: node-header |
8 | strip_id_or_class: hdr-tools | 8 | strip_id_or_class: hdr-tools |
9 | strip_id_or_class: node-body-break | 9 | strip_id_or_class: node-body-break |
10 | strip_id_or_class: pullquote | 10 | strip_id_or_class: pullquote |
11 | strip_id_or_class: node-pager | 11 | strip_id_or_class: node-pager |
12 | strip_id_or_class: author-bio | 12 | strip_id_or_class: author-bio |
13 | strip_id_or_class: node-footer | 13 | strip_id_or_class: node-footer |
14 | 14 | ||
15 | test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor \ No newline at end of file | 15 | test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/motorfull.com.txt b/inc/3rdparty/site_config/standard/motorfull.com.txt index c6bec7e9..c6bec7e9 100644..100755 --- a/inc/3rdparty/site_config/standard/motorfull.com.txt +++ b/inc/3rdparty/site_config/standard/motorfull.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/movie.douban.com.txt b/inc/3rdparty/site_config/standard/movie.douban.com.txt new file mode 100755 index 00000000..eae211ed --- /dev/null +++ b/inc/3rdparty/site_config/standard/movie.douban.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://movie.douban.com/review/1062013/ | ||
3 | |||
4 | title: //span[contains(@property, 'v:summary')] | ||
5 | author: //span[contains(@property, 'v:reviewer')] | ||
6 | date://span[contains(@property, 'v:dtreviewed')] | ||
7 | body://div[contains(@class, 'main-bd')] | ||
8 | |||
9 | strip://img[contains(@class,'rating')]|//img[contains(@class,'review-stat')] | ||
10 | convert_double_br_tags: yes | ||
11 | test_url: http://movie.douban.com/review/1062013/ | ||
12 | test_url: http://movie.douban.com/review/1021870/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt b/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt index f4f20450..7a284275 100644..100755 --- a/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt +++ b/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[class="mainBody"] | 1 | body: //div[class="mainBody"] |
2 | footnotes: no | 2 | footnotes: no |
3 | test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx \ No newline at end of file | 3 | test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/msnbc.msn.com.txt b/inc/3rdparty/site_config/standard/msnbc.msn.com.txt index ad89cda8..f008d2d1 100644..100755 --- a/inc/3rdparty/site_config/standard/msnbc.msn.com.txt +++ b/inc/3rdparty/site_config/standard/msnbc.msn.com.txt | |||
@@ -1,21 +1,21 @@ | |||
1 | title: //title | 1 | title: //title |
2 | author: //div[@id='byline'] | 2 | author: //div[@id='byline'] |
3 | 3 | ||
4 | date: //div[contains(@class,'timestamp')]/abbr/text() | 4 | date: //div[contains(@class,'timestamp')]/abbr/text() |
5 | 5 | ||
6 | body: //div[@id='intellitTXT'] | 6 | body: //div[@id='intellitTXT'] |
7 | 7 | ||
8 | strip: //div[@id='byline'] | 8 | strip: //div[@id='byline'] |
9 | strip: //div[contains(@class,'timestamp')] | 9 | strip: //div[contains(@class,'timestamp')] |
10 | strip: //div[contains(@class, 'ad-label')] | 10 | strip: //div[contains(@class, 'ad-label')] |
11 | strip: //div[contains(@class, 'ad-break')] | 11 | strip: //div[contains(@class, 'ad-break')] |
12 | strip: //span[contains(@class, 'x-video')] | 12 | strip: //span[contains(@class, 'x-video')] |
13 | strip: //span[contains(@class, 'inline')] | 13 | strip: //span[contains(@class, 'inline')] |
14 | strip: //div[contains(@class, 'video')] | 14 | strip: //div[contains(@class, 'video')] |
15 | strip: //div[contains(@class, 'discuss')] | 15 | strip: //div[contains(@class, 'discuss')] |
16 | strip: //div[@id='most-popular'] | 16 | strip: //div[@id='most-popular'] |
17 | strip: //div[contains(@class,'drawer')] | 17 | strip: //div[contains(@class,'drawer')] |
18 | strip: //*[contains(@class, 'hide')] | 18 | strip: //*[contains(@class, 'hide')] |
19 | 19 | ||
20 | footnotes: no | 20 | footnotes: no |
21 | test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE \ No newline at end of file | 21 | test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/myfoxatlanta.com.txt b/inc/3rdparty/site_config/standard/myfoxatlanta.com.txt new file mode 100755 index 00000000..8a7590ab --- /dev/null +++ b/inc/3rdparty/site_config/standard/myfoxatlanta.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id='WNStoryBody'] | ||
2 | author: //div[@id='WNStoryByline'] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.myfoxatlanta.com/category/233685/local-news?clienttype=rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/myfoxboston.com.txt b/inc/3rdparty/site_config/standard/myfoxboston.com.txt index 1a35b4fc..9ad8ce05 100644..100755 --- a/inc/3rdparty/site_config/standard/myfoxboston.com.txt +++ b/inc/3rdparty/site_config/standard/myfoxboston.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"] | 1 | body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"] |
2 | tidy: no | 2 | tidy: no |
3 | 3 | ||
4 | test_url: http://www.myfoxboston.com/dpp/news/local/transit-police-say-woman-spat-on-mbta-bus-driver-2010611 \ No newline at end of file | 4 | test_url: http://www.myfoxboston.com/dpp/news/local/transit-police-say-woman-spat-on-mbta-bus-driver-2010611 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/myrecipes.com.txt b/inc/3rdparty/site_config/standard/myrecipes.com.txt index 8b99d22d..956be1e6 100644..100755 --- a/inc/3rdparty/site_config/standard/myrecipes.com.txt +++ b/inc/3rdparty/site_config/standard/myrecipes.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h2[contains(@class, 'name')] | 1 | title: //h2[contains(@class, 'name')] |
2 | body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')] | 2 | body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')] |
3 | 3 | ||
4 | strip_id_or_class: photoBy | 4 | strip_id_or_class: photoBy |
5 | strip_id_or_class: link | 5 | strip_id_or_class: link |
6 | 6 | ||
7 | single_page_link: //li[@class='print']/a[contains(@href, '/print/')] | 7 | single_page_link: //li[@class='print']/a[contains(@href, '/print/')] |
8 | 8 | ||
9 | prune: no | 9 | prune: no |
10 | tidy: no | 10 | tidy: no |
11 | 11 | ||
12 | test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/ \ No newline at end of file | 12 | test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/narenji.ir.txt b/inc/3rdparty/site_config/standard/narenji.ir.txt index 6c3d0c24..6c3d0c24 100644..100755 --- a/inc/3rdparty/site_config/standard/narenji.ir.txt +++ b/inc/3rdparty/site_config/standard/narenji.ir.txt | |||
diff --git a/inc/3rdparty/site_config/standard/nasa.gov.txt b/inc/3rdparty/site_config/standard/nasa.gov.txt index d95530f3..7df1112b 100644..100755 --- a/inc/3rdparty/site_config/standard/nasa.gov.txt +++ b/inc/3rdparty/site_config/standard/nasa.gov.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@class='address']/span | 1 | title: //div[@class='address']/span |
2 | author: substring-before(//span[@class='credits'],',') | 2 | author: substring-before(//span[@class='credits'],',') |
3 | date: //div[@class='promodatepress']/span | 3 | date: //div[@class='promodatepress']/span |
4 | body: //div[@class='default_style_wrap'] | 4 | body: //div[@class='default_style_wrap'] |
5 | strip: //div[@class='text_adjust'] | 5 | strip: //div[@class='text_adjust'] |
6 | strip: //div[@class='skiplink'] | 6 | strip: //div[@class='skiplink'] |
7 | strip: //h2 | 7 | strip: //h2 |
8 | test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html \ No newline at end of file | 8 | test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nbweekly.com.txt b/inc/3rdparty/site_config/standard/nbweekly.com.txt index 0b722d33..2645d406 100644..100755 --- a/inc/3rdparty/site_config/standard/nbweekly.com.txt +++ b/inc/3rdparty/site_config/standard/nbweekly.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | date://span[contains(@class,'date')] | 1 | date://span[contains(@class,'date')] |
2 | 2 | ||
3 | body://div[contains(@class,'contWarp')] | 3 | body://div[contains(@class,'contWarp')] |
4 | 4 | ||
5 | strip://div[contains(@class,'keyWord')] | 5 | strip://div[contains(@class,'keyWord')] |
6 | strip://div[contains(@class,'submitComt')] | 6 | strip://div[contains(@class,'submitComt')] |
7 | strip://div[contains(@class,'cmts')] | 7 | strip://div[contains(@class,'cmts')] |
8 | strip://div[contains(@class,'notice')] | 8 | strip://div[contains(@class,'notice')] |
9 | strip://div[contains(@class,'part pt-second')] | 9 | strip://div[contains(@class,'part pt-second')] |
10 | test_url: http://www.nbweekly.com/news/china/201203/29316.aspx \ No newline at end of file | 10 | test_url: http://www.nbweekly.com/news/china/201203/29316.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/neh.gov.txt b/inc/3rdparty/site_config/standard/neh.gov.txt index 45136a2b..e7cc4313 100644..100755 --- a/inc/3rdparty/site_config/standard/neh.gov.txt +++ b/inc/3rdparty/site_config/standard/neh.gov.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | #host configuration should be http://www.neh.gov/news/humanities/ | 1 | #host configuration should be http://www.neh.gov/news/humanities/ |
2 | 2 | ||
3 | 3 | ||
4 | #meta data | 4 | #meta data |
5 | title:substring-after(substring-after(//title,':'),':') | 5 | title:substring-after(substring-after(//title,':'),':') |
6 | author:substring-after(//h2[@class = 'subHead'],'By') | 6 | author:substring-after(//h2[@class = 'subHead'],'By') |
7 | date:substring-before(substring-after(//title,':'),':') | 7 | date:substring-before(substring-after(//title,':'),':') |
8 | 8 | ||
9 | #img and caption handling | 9 | #img and caption handling |
10 | wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text() | 10 | wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text() |
11 | wrap_in(fieldset)://div[@id = 'mainContent']/table | 11 | wrap_in(fieldset)://div[@id = 'mainContent']/table |
12 | 12 | ||
13 | # clean up | 13 | # clean up |
14 | strip: //table[@class = 'marginpaddingTop'] | 14 | strip: //table[@class = 'marginpaddingTop'] |
15 | strip: //h2[@class = 'subHead'] | 15 | strip: //h2[@class = 'subHead'] |
16 | 16 | ||
17 | test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html \ No newline at end of file | 17 | test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/neomoney.co.txt b/inc/3rdparty/site_config/standard/neomoney.co.txt index 564d5492..2089fc39 100644..100755 --- a/inc/3rdparty/site_config/standard/neomoney.co.txt +++ b/inc/3rdparty/site_config/standard/neomoney.co.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //*[@class="header_title"]/h1 | 1 | title: //*[@class="header_title"]/h1 |
2 | body: //div[contains(@class, 'content')] | 2 | body: //div[contains(@class, 'content')] |
3 | test_url: http://neomoney.co/personal/expatriate-and-migrant-loans/expatriate-loans/ \ No newline at end of file | 3 | test_url: http://neomoney.co/personal/expatriate-and-migrant-loans/expatriate-loans/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/net-security.org.txt b/inc/3rdparty/site_config/standard/net-security.org.txt index 4e6d66d4..b7fedbf3 100644..100755 --- a/inc/3rdparty/site_config/standard/net-security.org.txt +++ b/inc/3rdparty/site_config/standard/net-security.org.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='content-title'] | 1 | title: //div[@class='content-title'] |
2 | #date: substring-after(//div[@class='dernek-text-under'],'Posted on') | 2 | #date: substring-after(//div[@class='dernek-text-under'],'Posted on') |
3 | body: //div[@class='content-item'] | 3 | body: //div[@class='content-item'] |
4 | next_page_link: //li[@class='next']/a | 4 | next_page_link: //li[@class='next']/a |
5 | convert_double_br_tags: yes | 5 | convert_double_br_tags: yes |
6 | 6 | ||
7 | test_url: http://www.net-security.org/article.php?id=1732 \ No newline at end of file | 7 | test_url: http://www.net-security.org/article.php?id=1732 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/netmagazine.com.txt b/inc/3rdparty/site_config/standard/netmagazine.com.txt index 86885445..dcea047c 100644..100755 --- a/inc/3rdparty/site_config/standard/netmagazine.com.txt +++ b/inc/3rdparty/site_config/standard/netmagazine.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | author: //div[@class="submitted"]/span | 2 | author: //div[@class="submitted"]/span |
3 | 3 | ||
4 | # seems like this should work, but nothing is returned. Issue with xpath parser? | 4 | # seems like this should work, but nothing is returned. Issue with xpath parser? |
5 | date: //div[@class="submitted"]/time | 5 | date: //div[@class="submitted"]/time |
6 | 6 | ||
7 | body: //div[@id="main-content"] | 7 | body: //div[@id="main-content"] |
8 | 8 | ||
9 | strip_comments: no | 9 | strip_comments: no |
10 | 10 | ||
11 | strip: //h1 | 11 | strip: //h1 |
12 | strip: //div[@class="submitted"] | 12 | strip: //div[@class="submitted"] |
13 | strip: //dd[@class="profile-avatar"] | 13 | strip: //dd[@class="profile-avatar"] |
14 | strip: //div[@class="author-profile"]/dl/dt[1] | 14 | strip: //div[@class="author-profile"]/dl/dt[1] |
15 | strip: //div[@id="right-col"] | 15 | strip: //div[@id="right-col"] |
16 | test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile \ No newline at end of file | 16 | test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/netzpolitik.org.txt b/inc/3rdparty/site_config/standard/netzpolitik.org.txt index 87dc3cdf..7fa43fd7 100644..100755 --- a/inc/3rdparty/site_config/standard/netzpolitik.org.txt +++ b/inc/3rdparty/site_config/standard/netzpolitik.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class='entry-title'] | 1 | title: //h1[@class='entry-title'] |
2 | author: //a[@ref='author'] | 2 | author: //a[@ref='author'] |
3 | date: //span[@class='entry-date'] | 3 | date: //span[@class='entry-date'] |
4 | body: //div[@class='entry-content'] | 4 | body: //div[@class='entry-content'] |
5 | 5 | ||
6 | test_url: http://netzpolitik.org/2011/buch-generation-facebook/ \ No newline at end of file | 6 | test_url: http://netzpolitik.org/2011/buch-generation-facebook/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/newleftproject.org.txt b/inc/3rdparty/site_config/standard/newleftproject.org.txt new file mode 100755 index 00000000..d9af99d8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/newleftproject.org.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //div[contains(@class, 'article_header')]//h3 | ||
2 | |||
3 | test_url: http://www.newleftproject.org/index.php/site/article_comments/do_we_need_a_facebook_of_the_left \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newmatilda.com.txt b/inc/3rdparty/site_config/standard/newmatilda.com.txt index ab766847..f17ecdc6 100644..100755 --- a/inc/3rdparty/site_config/standard/newmatilda.com.txt +++ b/inc/3rdparty/site_config/standard/newmatilda.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id="maincontent"]/h1 | 1 | title: //div[@id="maincontent"]/h1 |
2 | body: //div[@id="maincontent"] | 2 | body: //div[@id="maincontent"] |
3 | date: //div[@id="maincontent"]/p[2] | 3 | date: //div[@id="maincontent"]/p[2] |
4 | author: //ul[@id="contributors"]/li/p/b | 4 | author: //ul[@id="contributors"]/li/p/b |
5 | 5 | ||
6 | strip: //p[@*] | 6 | strip: //p[@*] |
7 | strip: //h1 | 7 | strip: //h1 |
8 | strip: //div[@id="maincontent"]/div | 8 | strip: //div[@id="maincontent"]/div |
9 | test_url: http://newmatilda.com/2011/07/22/turnbull-makes-sense-climate \ No newline at end of file | 9 | test_url: http://newmatilda.com/2011/07/22/turnbull-makes-sense-climate \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/newrepublic.com.txt b/inc/3rdparty/site_config/standard/newrepublic.com.txt new file mode 100755 index 00000000..039f0385 --- /dev/null +++ b/inc/3rdparty/site_config/standard/newrepublic.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | author: //span[@class="authors"] | ||
2 | date: //span[@class="date"] | ||
3 | body: //div[@class="primary"] | ||
4 | |||
5 | strip: //div[@id="controls"] | ||
6 | strip: //div[@id="read-next"] | ||
7 | |||
8 | test_url: http://www.newrepublic.com/article/112731/moocs-will-online-education-ruin-university-experience \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news-gazette.com.txt b/inc/3rdparty/site_config/standard/news-gazette.com.txt index 1f1e5d3a..2b352707 100644..100755 --- a/inc/3rdparty/site_config/standard/news-gazette.com.txt +++ b/inc/3rdparty/site_config/standard/news-gazette.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@id="main-content"]//h2 | 1 | title: //div[@id="main-content"]//h2 |
2 | 2 | ||
3 | author: //div[@id="main-content"]//span[@class="authors"] | 3 | author: //div[@id="main-content"]//span[@class="authors"] |
4 | 4 | ||
5 | date: //div[@id="main-content"]//span[@class="timestamp"] | 5 | date: //div[@id="main-content"]//span[@class="timestamp"] |
6 | 6 | ||
7 | body: //div[@id="main-content"]//div[@class="content"] | 7 | body: //div[@id="main-content"]//div[@class="content"] |
8 | test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html \ No newline at end of file | 8 | test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.cnet.com.txt b/inc/3rdparty/site_config/standard/news.cnet.com.txt index b7ab224a..78af70f4 100644..100755 --- a/inc/3rdparty/site_config/standard/news.cnet.com.txt +++ b/inc/3rdparty/site_config/standard/news.cnet.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | #This should apply to *.cnet.com. Not just news.cnet.com. | 1 | #This should apply to *.cnet.com. Not just news.cnet.com. |
2 | title: //h1 | 2 | title: //h1 |
3 | author: //img[@class="mugshot"]/@alt | 3 | author: //img[@class="mugshot"]/@alt |
4 | strip: //h1 | 4 | strip: //h1 |
5 | strip_id_or_class: breadcrumb | 5 | strip_id_or_class: breadcrumb |
6 | strip: //p[@id="introP"] | 6 | strip: //p[@id="introP"] |
7 | strip: //div[@class="postByline"] | 7 | strip: //div[@class="postByline"] |
8 | strip: //div[@class="editorBio"] | 8 | strip: //div[@class="editorBio"] |
9 | strip: //div[@class="inline-slideshow"] | 9 | strip: //div[@class="inline-slideshow"] |
10 | strip: //div[@class="related"] | 10 | strip: //div[@class="related"] |
11 | body: //div[@class="postBody txtWrap"] | 11 | body: //div[@class="postBody txtWrap"] |
12 | test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts \ No newline at end of file | 12 | test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.detik.com.txt b/inc/3rdparty/site_config/standard/news.detik.com.txt index 3ed1dc85..629bc917 100644..100755 --- a/inc/3rdparty/site_config/standard/news.detik.com.txt +++ b/inc/3rdparty/site_config/standard/news.detik.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title://div[@class="content_detail"]/h1 | 1 | title://div[@class="content_detail"]/h1 |
2 | 2 | ||
3 | author://div[@class="author"]/strong | 3 | author://div[@class="author"]/strong |
4 | 4 | ||
5 | date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB') | 5 | date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB') |
6 | 6 | ||
7 | body://div[@class="text_detail"] | 7 | body://div[@class="text_detail"] |
8 | test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai \ No newline at end of file | 8 | test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt b/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt index 6fc86137..5754d47a 100644..100755 --- a/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt +++ b/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@id='main'] | 1 | body: //div[@id='main'] |
2 | strip: //div[@id='sbs'] | 2 | strip: //div[@id='sbs'] |
3 | strip: //div[@id='fsizeSwitch'] | 3 | strip: //div[@id='fsizeSwitch'] |
4 | strip: //div[@id='googleAd'] | 4 | strip: //div[@id='googleAd'] |
5 | strip: //div[@id='detailFoot'] | 5 | strip: //div[@id='detailFoot'] |
6 | strip_image_src: counter?key | 6 | strip_image_src: counter?key |
7 | convert_double_br_tags: yes | 7 | convert_double_br_tags: yes |
8 | 8 | ||
9 | test_url: http://news.kanaloco.jp/localnews/article/1105200018/ \ No newline at end of file | 9 | test_url: http://news.kanaloco.jp/localnews/article/1105200018/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.mynavi.jp.txt b/inc/3rdparty/site_config/standard/news.mynavi.jp.txt index ded680f1..1df47314 100644..100755 --- a/inc/3rdparty/site_config/standard/news.mynavi.jp.txt +++ b/inc/3rdparty/site_config/standard/news.mynavi.jp.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h2[@class="lyt-hdg-02-04"] | 1 | title: //h2[@class="lyt-hdg-02-04"] |
2 | 2 | ||
3 | author: //div[@class="lyt-namearea"]/a | 3 | author: //div[@class="lyt-namearea"]/a |
4 | 4 | ||
5 | date: //div[@class="lyt-namearea"]/text() | 5 | date: //div[@class="lyt-namearea"]/text() |
6 | 6 | ||
7 | body: //div[@class="articleContent"] | 7 | body: //div[@class="articleContent"] |
8 | 8 | ||
9 | strip: //div[@id="tab-aside"] | 9 | strip: //div[@id="tab-aside"] |
10 | 10 | ||
11 | test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html \ No newline at end of file | 11 | test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.orf.at.txt b/inc/3rdparty/site_config/standard/news.orf.at.txt index b60deea4..3b1d3ccb 100644..100755 --- a/inc/3rdparty/site_config/standard/news.orf.at.txt +++ b/inc/3rdparty/site_config/standard/news.orf.at.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | 1 | single_page_link: //div[@id='content']//p[@class='readMore']/a |
2 | 2 | ||
3 | title: //div[@class='hidden offscreen']/h2 | 3 | title: //div[@class='hidden offscreen']/h2 |
4 | body: //div[@id="storyText"] | 4 | body: //div[@id="storyText"] |
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | 5 | move_into(//div[@id='storyText']): //div[@class='fact'] |
6 | strip: //small[@class='credit'] | 6 | strip: //small[@class='credit'] |
7 | strip: //small[@class='caption'] | 7 | strip: //small[@class='caption'] |
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | 8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') |
9 | strip: //p[@class='toplink'] | 9 | strip: //p[@class='toplink'] |
10 | 10 | ||
11 | test_url: http://news.orf.at/stories/2084731/ \ No newline at end of file | 11 | test_url: http://news.orf.at/stories/2084731/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.rambler.ru.txt b/inc/3rdparty/site_config/standard/news.rambler.ru.txt index 743245f8..1d547334 100644..100755 --- a/inc/3rdparty/site_config/standard/news.rambler.ru.txt +++ b/inc/3rdparty/site_config/standard/news.rambler.ru.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //article | 1 | body: //article |
2 | title: //h1 | 2 | title: //h1 |
3 | author: //span[@class='b-article-source-dropdown'] | 3 | author: //span[@class='b-article-source-dropdown'] |
4 | strip: //span[@class='b-article-photo-incut__source'] | 4 | strip: //span[@class='b-article-photo-incut__source'] |
5 | strip: //a[@class='b-read-more b-read-more_bottom'] | 5 | strip: //a[@class='b-read-more b-read-more_bottom'] |
6 | 6 | ||
7 | 7 | ||
8 | tidy:no | 8 | tidy:no |
9 | test_url: http://news.rambler.ru/12972208/ \ No newline at end of file | 9 | test_url: http://news.rambler.ru/12972208/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.techmeme.com.txt b/inc/3rdparty/site_config/standard/news.techmeme.com.txt index c80c3327..ba4db828 100644..100755 --- a/inc/3rdparty/site_config/standard/news.techmeme.com.txt +++ b/inc/3rdparty/site_config/standard/news.techmeme.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class='main']/div[@class='item'] | 1 | body: //div[@class='main']/div[@class='item'] |
2 | strip: //div[@class='right'] | 2 | strip: //div[@class='right'] |
3 | 3 | ||
4 | test_url: http://news.techmeme.com/110516/fh-rip \ No newline at end of file | 4 | test_url: http://news.techmeme.com/110516/fh-rip \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.yahoo.com.txt b/inc/3rdparty/site_config/standard/news.yahoo.com.txt index 5ee04049..fc1739c8 100644..100755 --- a/inc/3rdparty/site_config/standard/news.yahoo.com.txt +++ b/inc/3rdparty/site_config/standard/news.yahoo.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | title: //h1[@class='headline'] | 2 | title: //h1[@class='headline'] |
3 | author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn'] | 3 | author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn'] |
4 | date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title | 4 | date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title |
5 | body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')] | 5 | body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')] |
6 | #strip: //cite/abbr | 6 | #strip: //cite/abbr |
7 | strip_id_or_class: action | 7 | strip_id_or_class: action |
8 | strip_id_or_class: prefetch | 8 | strip_id_or_class: prefetch |
9 | tidy: no | 9 | tidy: no |
10 | prune: no | 10 | prune: no |
11 | 11 | ||
12 | test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html \ No newline at end of file | 12 | test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.ycombinator.com.txt b/inc/3rdparty/site_config/standard/news.ycombinator.com.txt index 0b01f8a1..f7441d17 100644..100755 --- a/inc/3rdparty/site_config/standard/news.ycombinator.com.txt +++ b/inc/3rdparty/site_config/standard/news.ycombinator.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | strip_comments: no | 1 | strip_comments: no |
2 | strip: //a[. = 'reply'] | 2 | strip: //a[. = 'reply'] |
3 | test_url: http://news.ycombinator.com/item?id=1516461 \ No newline at end of file | 3 | test_url: http://news.ycombinator.com/item?id=1516461 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/news.zing.vn.txt b/inc/3rdparty/site_config/standard/news.zing.vn.txt new file mode 100755 index 00000000..af81e90e --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.zing.vn.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body://div[@class="newsdetail_wrapper"] | ||
2 | strip://div[@class="more_news"] | ||
3 | test_url: http://news.zing.vn/xa-hoi/s-phat-nang-xe-may-di-duong-tren-cao-ha-noi/a280838.html#home_noibat1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news247.gr.txt b/inc/3rdparty/site_config/standard/news247.gr.txt new file mode 100755 index 00000000..87637bed --- /dev/null +++ b/inc/3rdparty/site_config/standard/news247.gr.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='title'] | ||
2 | |||
3 | body: //img[@id='relPicsMainPic'] | //div[contains(@class, 'storyContent')] | ||
4 | |||
5 | test_url: http://news247.gr/eidiseis/katatheseis_fwtia_htan_apofasismenoi_akomh_kai_na_afairesoyn_zwes_an_thewrousan_oti_to_thuma_htan_antipalos_toys.2433351.html | ||
6 | test_url: http://news247.gr/?widget=rssfeed&view=feed&contentId=38291 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newsbomb.gr.txt b/inc/3rdparty/site_config/standard/newsbomb.gr.txt index 0500890f..5eb0ea46 100644..100755 --- a/inc/3rdparty/site_config/standard/newsbomb.gr.txt +++ b/inc/3rdparty/site_config/standard/newsbomb.gr.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | date: //meta[@name='og:article:published_time']/@value | 1 | date: //meta[@name='og:article:published_time']/@value |
2 | 2 | ||
3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] | 3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] |
4 | 4 | ||
5 | strip_id_or_class: itemImageGallery | 5 | strip_id_or_class: itemImageGallery |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex \ No newline at end of file | 9 | test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/newsle.com.txt b/inc/3rdparty/site_config/standard/newsle.com.txt index e500ddcc..e500ddcc 100644..100755 --- a/inc/3rdparty/site_config/standard/newsle.com.txt +++ b/inc/3rdparty/site_config/standard/newsle.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/newsmill.se.txt b/inc/3rdparty/site_config/standard/newsmill.se.txt index eb7d3350..1a990319 100644..100755 --- a/inc/3rdparty/site_config/standard/newsmill.se.txt +++ b/inc/3rdparty/site_config/standard/newsmill.se.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent'] | 2 | body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent'] |
3 | author: //div[@class='byline']//a[contains(@href, '/user/')] | 3 | author: //div[@class='byline']//a[contains(@href, '/user/')] |
4 | 4 | ||
5 | strip_id_or_class: facts | 5 | strip_id_or_class: facts |
6 | strip_id_or_class: articleBlogsHolder | 6 | strip_id_or_class: articleBlogsHolder |
7 | strip_id_or_class: byline | 7 | strip_id_or_class: byline |
8 | 8 | ||
9 | prune: no | 9 | prune: no |
10 | tidy: no | 10 | tidy: no |
11 | 11 | ||
12 | test_url: http://www.newsmill.se/artikel/2012/05/06/medielogiken-v-ger-tyngre-n-reportrarnas-sikter \ No newline at end of file | 12 | test_url: http://www.newsmill.se/artikel/2012/05/06/medielogiken-v-ger-tyngre-n-reportrarnas-sikter \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/newsunspun.org.txt b/inc/3rdparty/site_config/standard/newsunspun.org.txt index 860ad66b..247bbebb 100644..100755 --- a/inc/3rdparty/site_config/standard/newsunspun.org.txt +++ b/inc/3rdparty/site_config/standard/newsunspun.org.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@class='right']//div[@class='articles'] | 1 | body: //div[@class='right']//div[@class='articles'] |
2 | author: //div[@id='artinfo']//a[contains(@href, '/author/')] | 2 | author: //div[@id='artinfo']//a[contains(@href, '/author/')] |
3 | strip: //div[@id='artinfo'] | 3 | strip: //div[@id='artinfo'] |
4 | strip: //table[//a[contains(@href, 'twitter.com')]] | 4 | strip: //table[//a[contains(@href, 'twitter.com')]] |
5 | strip_id_or_class: twitter | 5 | strip_id_or_class: twitter |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | tidy: no | 8 | tidy: no |
9 | 9 | ||
10 | test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb \ No newline at end of file | 10 | test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/newsweek.com.txt b/inc/3rdparty/site_config/standard/newsweek.com.txt new file mode 100755 index 00000000..565648ba --- /dev/null +++ b/inc/3rdparty/site_config/standard/newsweek.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class = 'article-body'] | ||
2 | title: //h1[@class = 'article-title'] | ||
3 | strip: //aside | ||
4 | |||
5 | test_url: http://www.newsweek.com/day-steve-mcqueen-met-his-new-nazi-neighbor-keith-moon-229741 | ||
6 | test_url: http://www.newsweek.com/2014/06/13/how-greylock-partners-finds-next-facebook-253329.html | ||
diff --git a/inc/3rdparty/site_config/standard/newswise.com.txt b/inc/3rdparty/site_config/standard/newswise.com.txt new file mode 100755 index 00000000..10120ea1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/newswise.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | prune: no | ||
2 | tidy: no | ||
3 | |||
4 | title: //h1/a[2] | ||
5 | body: //div[@id="main"] | ||
6 | author: //span[@id="articlesource"] | ||
7 | date: //span[contains(@class, 'releasedate')] | ||
8 | |||
9 | strip: //div[@class="inst-logo"] | ||
10 | strip: //h1[1] | ||
11 | |||
12 | strip_id_or_class: addthis | ||
13 | strip_id_or_class: released | ||
14 | strip_id_or_class: skiptranslate | ||
15 | strip_id_or_class: flash | ||
16 | |||
17 | test_url: http://www.newswise.com/articles/first-heat-wave-of-season-puts-elderly-at-risk | ||
diff --git a/inc/3rdparty/site_config/standard/newyorker.com.txt b/inc/3rdparty/site_config/standard/newyorker.com.txt index 5624aa8c..950324a3 100644..100755 --- a/inc/3rdparty/site_config/standard/newyorker.com.txt +++ b/inc/3rdparty/site_config/standard/newyorker.com.txt | |||
@@ -1,10 +1,11 @@ | |||
1 | title: //h1[@id='articlehed'] | //h2[@id="articleintro"] | 1 | title: //h1[@id='articlehed'] | //h2[@id="articleintro"] |
2 | body: //div[@id='articletext'] | 2 | body: //div[@id='articletext'] |
3 | 3 | ||
4 | strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"] | 4 | strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"] | //div[@class="cartoon"] |
5 | 5 | ||
6 | date: //h4[@id='articleauthor']/span[@class='dd dds'] | 6 | date: //h4[@id='articleauthor']/span[@class='dd dds'] |
7 | date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published'] | 7 | date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published'] |
8 | 8 | ||
9 | single_page_link: //div[@class='paginationViewSinglePage']/a | 9 | single_page_link: //div[@class='paginationViewSinglePage']/a |
10 | test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html \ No newline at end of file | 10 | test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html |
11 | test_url: http://www.newyorker.com/reporting/2013/04/22/130422fa_fact_bilger?currentPage=all&mobify=0 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/next-gen.biz.txt b/inc/3rdparty/site_config/standard/next-gen.biz.txt index 806a3dfd..b8d235db 100644..100755 --- a/inc/3rdparty/site_config/standard/next-gen.biz.txt +++ b/inc/3rdparty/site_config/standard/next-gen.biz.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | # 2011-08-22 [carlo@...] initial version | 1 | # 2011-08-22 [carlo@...] initial version |
2 | # 2011-08-22 [carlo@...] removed comments & social links | 2 | # 2011-08-22 [carlo@...] removed comments & social links |
3 | 3 | ||
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | single_page_link: //a[@class="single active"] | 6 | single_page_link: //a[@class="single active"] |
7 | 7 | ||
8 | body: //div[@id="main"]//div[@class="content-region"]/article | 8 | body: //div[@id="main"]//div[@class="content-region"]/article |
9 | author: //span[@class="author-name"] | 9 | author: //span[@class="author-name"] |
10 | date: //time/text() | 10 | date: //time/text() |
11 | 11 | ||
12 | strip_id_or_class: //aside[@id="related"] | 12 | strip_id_or_class: //aside[@id="related"] |
13 | strip: //footer | 13 | strip: //footer |
14 | 14 | ||
15 | title: //h1 | 15 | title: //h1 |
16 | test_url: http://www.next-gen.biz/reviews/deus-ex-human-revolution-review \ No newline at end of file | 16 | test_url: http://www.next-gen.biz/reviews/deus-ex-human-revolution-review \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nfl.com.txt b/inc/3rdparty/site_config/standard/nfl.com.txt index 70f92473..956b288f 100644..100755 --- a/inc/3rdparty/site_config/standard/nfl.com.txt +++ b/inc/3rdparty/site_config/standard/nfl.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | # doesn't look like selecting an attribute value works? | 1 | # doesn't look like selecting an attribute value works? |
2 | # author: //meta[@id="authorName"]@value | 2 | # author: //meta[@id="authorName"]@value |
3 | 3 | ||
4 | author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ") | 4 | author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ") |
5 | date: //abbr[@id="article-time"] | 5 | date: //abbr[@id="article-time"] |
6 | title: //div[@id="article-hdr"]/h1 | 6 | title: //div[@id="article-hdr"]/h1 |
7 | body: //div[@class="articleText"] | 7 | body: //div[@class="articleText"] |
8 | 8 | ||
9 | # strip miscellaneous teasers & etc | 9 | # strip miscellaneous teasers & etc |
10 | strip: //div[@class="removeformobile"] | 10 | strip: //div[@class="removeformobile"] |
11 | test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream \ No newline at end of file | 11 | test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt b/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt index 60834862..44a82a95 100644..100755 --- a/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt +++ b/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | next_page_link: //div[@class='nextpage_continue']/a | 1 | next_page_link: //div[@class='nextpage_continue']/a |
2 | strip: //div[@class='nextpage_continue'] | 2 | strip: //div[@class='nextpage_continue'] |
3 | strip_id_or_class: nextpage | 3 | strip_id_or_class: nextpage |
4 | title: //div[@class='article_title']//h1 | 4 | title: //div[@class='article_title']//h1 |
5 | body: //div[@class='article_title']/.. | 5 | body: //div[@class='article_title']/.. |
6 | body: //div[@class='content'] | 6 | body: //div[@class='content'] |
7 | test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text \ No newline at end of file | 7 | test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nhk.or.jp.txt b/inc/3rdparty/site_config/standard/nhk.or.jp.txt index 0a3bb913..0a3bb913 100644..100755 --- a/inc/3rdparty/site_config/standard/nhk.or.jp.txt +++ b/inc/3rdparty/site_config/standard/nhk.or.jp.txt | |||
diff --git a/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt b/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt index 409a8977..f0e28afb 100644..100755 --- a/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt +++ b/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | body: //div[@id="main"] | 1 | body: //div[@id="main"] |
2 | title: //div[@id="main"]/h3 | 2 | title: //div[@id="main"]/h3 |
3 | 3 | ||
4 | # Remove ‘Review’ and ‘Wii’. | 4 | # Remove ‘Review’ and ‘Wii’. |
5 | strip: //div[@class="badge"] | 5 | strip: //div[@class="badge"] |
6 | 6 | ||
7 | # Remove duplicate title and country flag. | 7 | # Remove duplicate title and country flag. |
8 | strip: //h3 | 8 | strip: //h3 |
9 | 9 | ||
10 | # Commented out below are attempts to extract the author and date, which did not work. | 10 | # Commented out below are attempts to extract the author and date, which did not work. |
11 | # author: //p[@class="extra "]/a | 11 | # author: //p[@class="extra "]/a |
12 | # date: //p[@class="extra "]/span[@class="when"] | 12 | # date: //p[@class="extra "]/span[@class="when"] |
13 | test_url: http://www.nintendoworldreport.com/review/28400 \ No newline at end of file | 13 | test_url: http://www.nintendoworldreport.com/review/28400 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nojesguiden.se.txt b/inc/3rdparty/site_config/standard/nojesguiden.se.txt index ae2d7e41..b15f0612 100644..100755 --- a/inc/3rdparty/site_config/standard/nojesguiden.se.txt +++ b/inc/3rdparty/site_config/standard/nojesguiden.se.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: //span[@class='meta']/span[@class='username'] | 1 | author: //span[@class='meta']/span[@class='username'] |
2 | body: //div[@class='article-content'] | 2 | body: //div[@class='article-content'] |
3 | 3 | ||
4 | strip_id_or_class: 'article-actions' | 4 | strip_id_or_class: 'article-actions' |
5 | test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i \ No newline at end of file | 5 | test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/northumberlandview.ca.txt b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt index 04a0a34d..88429a78 100644..100755 --- a/inc/3rdparty/site_config/standard/northumberlandview.ca.txt +++ b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='pn-maincontent'] | 2 | body: //div[@id='pn-maincontent'] |
3 | strip_id_or_class: z-menu | 3 | strip_id_or_class: z-menu |
4 | strip_id_or_class: news_category | 4 | strip_id_or_class: news_category |
5 | strip_id_or_class: news_title | 5 | strip_id_or_class: news_title |
6 | strip_id_or_class: news_modify | 6 | strip_id_or_class: news_modify |
7 | strip_id_or_class: news_morearticlesincat | 7 | strip_id_or_class: news_morearticlesincat |
8 | strip_id_or_class: ezc_comments | 8 | strip_id_or_class: ezc_comments |
9 | strip_comments: yes | 9 | strip_comments: yes |
10 | 10 | ||
11 | test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972 \ No newline at end of file | 11 | test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nosalty.hu.txt b/inc/3rdparty/site_config/standard/nosalty.hu.txt new file mode 100755 index 00000000..7e20cadf --- /dev/null +++ b/inc/3rdparty/site_config/standard/nosalty.hu.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@id='tab-recept']//h1 | ||
2 | body: //div[@id='tab-recept']//div[contains(@class, 'column-container')] | ||
3 | strip_id_or_class: ajanlo-box | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.nosalty.hu/recept/szupergyors-fank \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nplusonemag.com.txt b/inc/3rdparty/site_config/standard/nplusonemag.com.txt index 205b1af4..1b817c04 100644..100755 --- a/inc/3rdparty/site_config/standard/nplusonemag.com.txt +++ b/inc/3rdparty/site_config/standard/nplusonemag.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: /html/body/div[3]/div/div/h1 | 1 | title: /html/body/div[3]/div/div/h1 |
2 | 2 | ||
3 | body: //*[@id="article-body"] | 3 | body: //*[@id="article-body"] |
4 | 4 | ||
5 | 5 | ||
6 | test_url: http://nplusonemag.com/the-outskirts-of-progress \ No newline at end of file | 6 | test_url: http://nplusonemag.com/the-outskirts-of-progress \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/npr.org.txt b/inc/3rdparty/site_config/standard/npr.org.txt index afab0eb3..acd73e48 100644..100755 --- a/inc/3rdparty/site_config/standard/npr.org.txt +++ b/inc/3rdparty/site_config/standard/npr.org.txt | |||
@@ -1,32 +1,34 @@ | |||
1 | title: //div[contains(@class, 'storytitle')]//h1 | 1 | title: //div[contains(@class, 'storytitle')]//h1 |
2 | author: //p[@class="byline"]/span | 2 | author: //p[@class="byline"]/span |
3 | body: //div[@id='storyspan02']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext'] | //div[@class='transcript'] | 3 | body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')] |
4 | date: //meta[@name="date"]/@content | 4 | date: //meta[@name="date"]/@content |
5 | 5 | ||
6 | strip: //div[@class='enlarge_measure'] | 6 | strip_id_or_class: enlarge_measure |
7 | strip: //div[@class='enlarge_html'] | 7 | strip_id_or_class: enlarge_html |
8 | strip: //a[@class='enlargeicon'] | 8 | strip: //a[contains(@class, 'enlargeicon')] |
9 | strip: //div[contains(@class, 'bookedition')] | 9 | strip: //div[contains(@class, 'bookedition')] |
10 | strip: //div[@class='textsize'] | 10 | strip: //div[@class='textsize'] |
11 | strip: //ul[@class='genres'] | 11 | strip: //ul[@class='genres'] |
12 | strip: //span[@class='bull'] | 12 | strip: //span[@class='bull'] |
13 | strip_id_or_class: secondary | 13 | strip_id_or_class: secondary |
14 | strip_id_or_class: con1col | 14 | strip_id_or_class: con1col |
15 | strip: //h3[@class='conheader'] | 15 | strip: //h3[@class='conheader'] |
16 | 16 | ||
17 | replace_string(<a name="more"> </a>): <!-- no more --> | 17 | replace_string(<a name="more"> </a>): <!-- no more --> |
18 | replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2> | 18 | replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2> |
19 | 19 | replace_string(<div class="transcript storytext">): <div class="transcript storytext"><h2>Transcript</h2> | |
20 | prune: no | 20 | |
21 | strip://div[@class="ecommercepop"] | 21 | prune: no |
22 | strip://span[@class="bull"] | 22 | strip://div[@class="ecommercepop"] |
23 | strip://span[@class="purchaseLink"] | 23 | strip://span[@class="bull"] |
24 | strip://div[@class="enlarge_html"] | 24 | strip://span[@class="purchaseLink"] |
25 | strip://div[@class="enlarge_measure"] | 25 | strip://div[@class="enlarge_html"] |
26 | strip://div[@class="container con1col small"] | 26 | strip://div[@class="enlarge_measure"] |
27 | strip://a[contains(@class, "enlargebtn")] | 27 | strip://div[@class="container con1col small"] |
28 | strip://div[contains(@class, "bucketwrap internallink")] | 28 | strip://a[contains(@class, "enlargebtn")] |
29 | 29 | strip://div[contains(@class, "bucketwrap internallink")] | |
30 | test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates | 30 | |
31 | test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right | 31 | test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates |
32 | test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres \ No newline at end of file | 32 | test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right |
33 | test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres | ||
34 | test_url: http://www.npr.org/templates/story/story.php?storyId=229103221 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nybooks.com.txt b/inc/3rdparty/site_config/standard/nybooks.com.txt index 8ecb8961..d95ec68e 100644..100755 --- a/inc/3rdparty/site_config/standard/nybooks.com.txt +++ b/inc/3rdparty/site_config/standard/nybooks.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | strip_id_or_class: sIFR-alternate | 1 | strip_id_or_class: sIFR-alternate |
2 | title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2 | 2 | title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2 |
3 | single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))] | 3 | single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))] |
4 | 4 | ||
5 | body: //div[@id = 'article-body'] | 5 | body: //div[@id = 'article-body'] |
6 | strip_id_or_class:article-tools | 6 | strip_id_or_class:article-tools |
7 | strip_id_or_class:js_target | 7 | strip_id_or_class:js_target |
8 | strip_id_or_class:marker | 8 | strip_id_or_class:marker |
9 | author://div[@id = 'page-title']/h3 | 9 | author://div[@id = 'page-title']/h3 |
10 | date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')] | 10 | date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')] |
11 | 11 | ||
12 | 12 | ||
13 | test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/ \ No newline at end of file | 13 | test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nymag.com.txt b/inc/3rdparty/site_config/standard/nymag.com.txt index f664c93d..7a1d62d9 100644..100755 --- a/inc/3rdparty/site_config/standard/nymag.com.txt +++ b/inc/3rdparty/site_config/standard/nymag.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h2[contains(@class, 'primary')] | 1 | title: //h2[contains(@class, 'primary')] |
2 | body: //div[@id='story'] | 2 | body: //div[@id='story'] |
3 | author: //*[@class='by']/a | 3 | author: //*[@class='by']/a |
4 | date: substring-after(//*[@class='date'], 'Published') | 4 | date: substring-after(//*[@class='date'], 'Published') |
5 | 5 | ||
6 | next_page_link: //div[@class='page-navigation']//li[@class='next']/a | 6 | next_page_link: //div[@class='page-navigation']//li[@class='next']/a |
7 | 7 | ||
8 | test_url: http://nymag.com/news/features/wall-street-2012-2/ \ No newline at end of file | 8 | test_url: http://nymag.com/news/features/wall-street-2012-2/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nyteknik.se.txt b/inc/3rdparty/site_config/standard/nyteknik.se.txt index 8c9e37f4..f4bedb6a 100644..100755 --- a/inc/3rdparty/site_config/standard/nyteknik.se.txt +++ b/inc/3rdparty/site_config/standard/nyteknik.se.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@class="article default-article"]/h1 | 1 | title: //div[@class="article default-article"]/h1 |
2 | author: //p[@class="author"]/a[2] | 2 | author: //p[@class="author"]/a[2] |
3 | 3 | ||
4 | # Article introduction: | 4 | # Article introduction: |
5 | #move_into(//div[@class="article-bread"]): //p[@class="lead"] | 5 | #move_into(//div[@class="article-bread"]): //p[@class="lead"] |
6 | 6 | ||
7 | body: //div[@class="article-bread"] | 7 | body: //div[@class="article-bread"] |
8 | test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece \ No newline at end of file | 8 | test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/nytimes.com.txt b/inc/3rdparty/site_config/standard/nytimes.com.txt index 8d9a794a..23c9ad11 100644..100755 --- a/inc/3rdparty/site_config/standard/nytimes.com.txt +++ b/inc/3rdparty/site_config/standard/nytimes.com.txt | |||
@@ -1,36 +1,49 @@ | |||
1 | title://h1[@class="articleHeadline"] | 1 | title://h1[@class="articleHeadline"] |
2 | body://div[@id="article"] | 2 | body://div[@id="article"] |
3 | strip_id_or_class:articleTools | 3 | body://*[@itemprop="articleBody"] |
4 | strip_id_or_class:readerscomment | 4 | strip_id_or_class:articleTools |
5 | #strip://div[contains(@class, "articleInline runaroundLeft")] | 5 | strip_id_or_class:readerscomment |
6 | strip: //div[contains(@class, "doubleRule")] | 6 | #strip://div[contains(@class, "articleInline runaroundLeft")] |
7 | # strip image credit - appears as a bold heading | 7 | strip: //div[contains(@class, "doubleRule")] |
8 | strip: //div[contains(@class, "articleInline")]//h6 | 8 | # strip image credit - appears as a bold heading |
9 | strip_id_or_class:enlargeThis | 9 | strip: //div[contains(@class, "articleInline")]//h6 |
10 | strip_id_or_class:pageLinks | 10 | strip_id_or_class:enlargeThis |
11 | strip_id_or_class:memberTools | 11 | strip_id_or_class:pageLinks |
12 | strip_id_or_class:articleExtras | 12 | strip_id_or_class:memberTools |
13 | strip_id_or_class:singleAd | 13 | strip_id_or_class:articleExtras |
14 | strip_id_or_class:byline | 14 | strip_id_or_class:singleAd |
15 | strip_id_or_class:dateline | 15 | strip_id_or_class:byline |
16 | strip_id_or_class:articleheadline | 16 | strip_id_or_class:dateline |
17 | strip_id_or_class:articleBottomExtra | 17 | strip_id_or_class:articleheadline |
18 | strip://a[contains(@href, 'nytimes.com/adx/')] | 18 | strip_id_or_class:articleBottomExtra |
19 | strip: //nyt_byline | 19 | strip_id_or_class:shareTools |
20 | strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')] | 20 | strip://a[contains(@href, 'nytimes.com/adx/')] |
21 | strip: //p[@class='caption']//a[contains(., 'More Photos')] | 21 | strip: //nyt_byline |
22 | 22 | strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')] | |
23 | prune: no | 23 | strip: //p[@class='caption']//a[contains(., 'More Photos')] |
24 | tidy: no | 24 | |
25 | 25 | prune: no | |
26 | date: substring-after(//*[contains(@class, 'dateline')], 'Published:') | 26 | tidy: no |
27 | 27 | ||
28 | single_page_link: //link[contains(@href, 'pagewanted=all')] | 28 | find_string: <script |
29 | #single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))] | 29 | replace_string: <div style="display:none" |
30 | 30 | find_string: </script> | |
31 | strip://ul[@id = 'toolsList'] | 31 | replace_string: </div> |
32 | strip://h6[@class = 'kicker'] | 32 | |
33 | author:substring-after(//h6[@class='byline'],'By ') | 33 | date: substring-after(//*[contains(@class, 'dateline')], 'Published:') |
34 | 34 | ||
35 | test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html | 35 | single_page_link: //link[contains(@href, 'pagewanted=all')] |
36 | test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html \ No newline at end of file | 36 | single_page_link: //link[@rel='alternate' and contains(@href, 'mobile.nytimes.com')]/@href |
37 | single_page_link: concat(substring-before(//div[@id='pageLinks']//a[contains(@href, 'pagewanted=')]/@href, 'pagewanted='), 'pagewanted=all') | ||
38 | #single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))] | ||
39 | |||
40 | strip://ul[@id = 'toolsList'] | ||
41 | strip://h6[@class = 'kicker'] | ||
42 | author:substring-after(//h6[@class='byline'],'By ') | ||
43 | |||
44 | test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html | ||
45 | test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html | ||
46 | test_url: http://www.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html | ||
47 | test_url: http://www.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html | ||
48 | test_url: http://www.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html | ||
49 | test_url: http://www.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nzz.ch.txt b/inc/3rdparty/site_config/standard/nzz.ch.txt index 81faabae..749f4f2a 100644..100755 --- a/inc/3rdparty/site_config/standard/nzz.ch.txt +++ b/inc/3rdparty/site_config/standard/nzz.ch.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //*[@class='article-full'] | 1 | body: //*[@class='article-full'] |
2 | title: //h3 | 2 | title: //h3 |
3 | strip: //header[@class='group'] | 3 | strip: //header[@class='group'] |
4 | #body: //p[@class='lead'] | 4 | #body: //p[@class='lead'] |
5 | #move_into(//p[@class='lead']): //*[@class='article-full']/figure | 5 | #move_into(//p[@class='lead']): //*[@class='article-full']/figure |
6 | #move_into(//p[@class='lead']): //div[@id='articleBodyText'] | 6 | #move_into(//p[@class='lead']): //div[@id='articleBodyText'] |
7 | strip: //div[@id='social-media-floater'] | 7 | strip: //div[@id='social-media-floater'] |
8 | strip: //div[@class='advertisement'] | 8 | strip: //div[@class='advertisement'] |
9 | strip: //div[@class='infobox'] | 9 | strip: //div[@class='infobox'] |
10 | strip: //div[@id='articleComments'] | 10 | strip: //div[@id='articleComments'] |
11 | 11 | ||
12 | test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213 \ No newline at end of file | 12 | test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/observer.com.txt b/inc/3rdparty/site_config/standard/observer.com.txt index e409ca2e..0b107538 100644..100755 --- a/inc/3rdparty/site_config/standard/observer.com.txt +++ b/inc/3rdparty/site_config/standard/observer.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //article[contains(@class, 'instapaper_body')] | 1 | body: //article[contains(@class, 'instapaper_body')] |
2 | 2 | ||
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | single_page_link: //a[@id='print-button'] | 5 | single_page_link: //a[@id='print-button'] |
6 | 6 | ||
7 | test_url: http://www.observer.com/2008/would-you-take-tumblr-man \ No newline at end of file | 7 | test_url: http://www.observer.com/2008/would-you-take-tumblr-man \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/off.net.mk.txt b/inc/3rdparty/site_config/standard/off.net.mk.txt index a2fb5f21..bf107876 100644..100755 --- a/inc/3rdparty/site_config/standard/off.net.mk.txt +++ b/inc/3rdparty/site_config/standard/off.net.mk.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[(@id = "content")] | 1 | body: //div[(@id = "content")] |
2 | strip: //div[(@class = "links-bar")] | 2 | strip: //div[(@class = "links-bar")] |
3 | strip: //div[(@class = "povrzani")] | 3 | strip: //div[(@class = "povrzani")] |
4 | strip: //div[(@class = "povrzani-dolu")] | 4 | strip: //div[(@class = "povrzani-dolu")] |
5 | strip: //div[(@class = "tags")] | 5 | strip: //div[(@class = "tags")] |
6 | strip: //h1[(@id = "page-title")] | 6 | strip: //h1[(@id = "page-title")] |
7 | test_url: http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi \ No newline at end of file | 7 | test_url: http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/omaha.com.txt b/inc/3rdparty/site_config/standard/omaha.com.txt index 53db061d..53db061d 100644..100755 --- a/inc/3rdparty/site_config/standard/omaha.com.txt +++ b/inc/3rdparty/site_config/standard/omaha.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/omiliya.org.txt b/inc/3rdparty/site_config/standard/omiliya.org.txt index 1b39b625..4b3a7202 100644..100755 --- a/inc/3rdparty/site_config/standard/omiliya.org.txt +++ b/inc/3rdparty/site_config/standard/omiliya.org.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id='squeeze']/h1 | 1 | title: //div[@id='squeeze']/h1 |
2 | strip: //div[@id='squeeze']/h1 | 2 | strip: //div[@id='squeeze']/h1 |
3 | author: //div[@class='submitted']/a | 3 | author: //div[@class='submitted']/a |
4 | strip: //div[@class='submitted']/a | 4 | strip: //div[@class='submitted']/a |
5 | convert_double_br_tags: yes | 5 | convert_double_br_tags: yes |
6 | 6 | ||
7 | 7 | ||
8 | 8 | ||
9 | test_url: http://omiliya.org/content/predchuvstvie.html \ No newline at end of file | 9 | test_url: http://omiliya.org/content/predchuvstvie.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/on.net.mk.txt b/inc/3rdparty/site_config/standard/on.net.mk.txt index be7a17ef..a95c2b0f 100644..100755 --- a/inc/3rdparty/site_config/standard/on.net.mk.txt +++ b/inc/3rdparty/site_config/standard/on.net.mk.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[(@class = "statija")] | 1 | body: //div[(@class = "statija")] |
2 | strip: //div[(@class = "relatedBlock")] | 2 | strip: //div[(@class = "relatedBlock")] |
3 | strip: //div[(@class = "swftools")] | 3 | strip: //div[(@class = "swftools")] |
4 | strip: //table[(@class = "links")] | 4 | strip: //table[(@class = "links")] |
5 | test_url: http://on.net.mk/video/na-trkala/lamborghini-aventador-avionot-shto-ne-leta \ No newline at end of file | 5 | test_url: http://on.net.mk/video/na-trkala/lamborghini-aventador-avionot-shto-ne-leta \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/online.wsj.com.txt b/inc/3rdparty/site_config/standard/online.wsj.com.txt index edb52855..448bb7e1 100644..100755 --- a/inc/3rdparty/site_config/standard/online.wsj.com.txt +++ b/inc/3rdparty/site_config/standard/online.wsj.com.txt | |||
@@ -1,23 +1,25 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | body: //div[@id='article_story_body'] | 2 | body: //div[@id='article_story_body'] |
3 | 3 | ||
4 | author: //h3[@class='byline']/a | 4 | author: //h3[@class='byline']/a |
5 | # for slid show content | 5 | # for slide show content |
6 | body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1] | 6 | body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1] |
7 | date: //li[@class='dateStamp']/small | 7 | date: //li[@class='dateStamp']/small |
8 | 8 | ||
9 | strip_id_or_class: insetFullBracket | 9 | strip_id_or_class: insetFullBracket |
10 | strip_id_or_class: insettipBox | 10 | strip_id_or_class: insettipBox |
11 | #strip_id_or_class: legacyInset | 11 | #strip_id_or_class: legacyInset |
12 | strip_id_or_class: recipeACShopAndBuyText | 12 | strip_id_or_class: recipeACShopAndBuyText |
13 | 13 | ||
14 | strip: //div[contains(@class, 'insetContent')]//cite | 14 | strip: //div[contains(@class, 'insetContent')]//cite |
15 | strip: //*[contains(@style, 'visibility: hidden;')] | 15 | strip: //*[contains(@style, 'visibility: hidden;')] |
16 | strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))] | 16 | strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))] |
17 | 17 | strip: //div[contains(@class, 'carousel')] | |
18 | prune: no | 18 | |
19 | tidy: no | 19 | prune: no |
20 | 20 | tidy: no | |
21 | test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html | 21 | |
22 | # slide show | 22 | test_url: http://online.wsj.com/news/articles/SB10001424052702304626304579509100018004342 |
23 | test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html \ No newline at end of file | 23 | test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html |
24 | # slide show | ||
25 | test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html | ||
diff --git a/inc/3rdparty/site_config/standard/onlinewelten.com.txt b/inc/3rdparty/site_config/standard/onlinewelten.com.txt index 1609fa83..1609fa83 100644..100755 --- a/inc/3rdparty/site_config/standard/onlinewelten.com.txt +++ b/inc/3rdparty/site_config/standard/onlinewelten.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/onstartups.com.txt b/inc/3rdparty/site_config/standard/onstartups.com.txt index cccce8cd..cccce8cd 100644..100755 --- a/inc/3rdparty/site_config/standard/onstartups.com.txt +++ b/inc/3rdparty/site_config/standard/onstartups.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/ontologicalgeek.com.txt b/inc/3rdparty/site_config/standard/ontologicalgeek.com.txt new file mode 100755 index 00000000..a9bf71ef --- /dev/null +++ b/inc/3rdparty/site_config/standard/ontologicalgeek.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | |||
3 | author: //a[@rel='author'] | ||
4 | |||
5 | date: substring-before(//aside[@class='entry-meta'], '|') | ||
6 | |||
7 | body: //div[@class='entry-content'] | ||
8 | test_url: http://ontologicalgeek.com/change-or-live-final-fantasy-x-as-catholic-dystopia/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/opensource.org.txt b/inc/3rdparty/site_config/standard/opensource.org.txt index 2bd3ccdb..2bd3ccdb 100644..100755 --- a/inc/3rdparty/site_config/standard/opensource.org.txt +++ b/inc/3rdparty/site_config/standard/opensource.org.txt | |||
diff --git a/inc/3rdparty/site_config/standard/openthemagazine.com.txt b/inc/3rdparty/site_config/standard/openthemagazine.com.txt index 510eb252..6913eb0e 100644..100755 --- a/inc/3rdparty/site_config/standard/openthemagazine.com.txt +++ b/inc/3rdparty/site_config/standard/openthemagazine.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id = 'content-inner'] | 1 | body: //div[@id = 'content-inner'] |
2 | strip: //div[@id = 'content-bottom'] | 2 | strip: //div[@id = 'content-bottom'] |
3 | strip_id_or_class: print_sharebutton | 3 | strip_id_or_class: print_sharebutton |
4 | test_url: http://openthemagazine.com/article/nation/sania-vs-saina \ No newline at end of file | 4 | test_url: http://openthemagazine.com/article/nation/sania-vs-saina \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/openwebx.org.txt b/inc/3rdparty/site_config/standard/openwebx.org.txt index b7663540..a5dcdb59 100644..100755 --- a/inc/3rdparty/site_config/standard/openwebx.org.txt +++ b/inc/3rdparty/site_config/standard/openwebx.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class="chapter"] | 1 | body: //div[@class="chapter"] |
2 | prune: no | 2 | prune: no |
3 | tidy: no | 3 | tidy: no |
4 | test_url: http://openwebx.org/docs/springext.html \ No newline at end of file | 4 | test_url: http://openwebx.org/docs/springext.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/orf.at.txt b/inc/3rdparty/site_config/standard/orf.at.txt index ff16ca79..fb4f2181 100644..100755 --- a/inc/3rdparty/site_config/standard/orf.at.txt +++ b/inc/3rdparty/site_config/standard/orf.at.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | 1 | single_page_link: //div[@id='content']//p[@class='readMore']/a |
2 | 2 | ||
3 | title: //div[@class='hidden offscreen']/h2 | 3 | title: //div[@class='hidden offscreen']/h2 |
4 | body: //div[@id="storyText"] | 4 | body: //div[@id="storyText"] |
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | 5 | move_into(//div[@id='storyText']): //div[@class='fact'] |
6 | strip: //small[@class='credit'] | 6 | strip: //small[@class='credit'] |
7 | strip: //small[@class='caption'] | 7 | strip: //small[@class='caption'] |
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | 8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') |
9 | strip: //p[@class='toplink'] | 9 | strip: //p[@class='toplink'] |
10 | 10 | ||
11 | test_url: http://orf.at/stories/2084731/ \ No newline at end of file | 11 | test_url: http://orf.at/stories/2084731/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/origo.hu.txt b/inc/3rdparty/site_config/standard/origo.hu.txt index 0dedac3d..50717f25 100644..100755 --- a/inc/3rdparty/site_config/standard/origo.hu.txt +++ b/inc/3rdparty/site_config/standard/origo.hu.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | title: /html/body/div[5]/div[2]/h1 | 1 | title: /html/body/div[5]/div[2]/h1 |
2 | body: /html/body/div[5]/div[2]/div[6]/div/div | 2 | body: /html/body/div[5]/div[2]/div[6]/div/div |
3 | body: //*[@id="cikk"] | 3 | body: //*[@id="cikk"] |
4 | strip: /html/body/div[5]/div[2]/h1 | 4 | strip: /html/body/div[5]/div[2]/h1 |
5 | strip: /html/body/div[5]/div[2]/div[4] | 5 | strip: /html/body/div[5]/div[2]/div[4] |
6 | strip: //*[@id="multidoboz"] | 6 | strip: //*[@id="multidoboz"] |
7 | strip: /html/body/div[5]/div[2]/div[6]/div[2] | 7 | strip: /html/body/div[5]/div[2]/div[6]/div[2] |
8 | strip: //*[@id="comments"] | 8 | strip: //*[@id="comments"] |
9 | strip: //*[@id="rating-doboz"] | 9 | strip: //*[@id="rating-doboz"] |
10 | strip: /html/body/div[5]/div[2]/div[10] | 10 | strip: /html/body/div[5]/div[2]/div[10] |
11 | strip: /html/body/div[5]/div[2]/a | 11 | strip: /html/body/div[5]/div[2]/a |
12 | strip: /html/body/div[5]/div[2]/span | 12 | strip: /html/body/div[5]/div[2]/span |
13 | strip: /html/body/div[5]/div[2]/span[2] | 13 | strip: /html/body/div[5]/div[2]/span[2] |
14 | strip: /html/body/div[5]/div[2]/span[3] | 14 | strip: /html/body/div[5]/div[2]/span[3] |
15 | strip: /html/body/div[5]/div[2]/span[4] | 15 | strip: /html/body/div[5]/div[2]/span[4] |
16 | strip: /html/body/div[5]/div[2]/span[5] | 16 | strip: /html/body/div[5]/div[2]/span[5] |
17 | strip: //*[@id="kommentszam"] | 17 | strip: //*[@id="kommentszam"] |
18 | test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html \ No newline at end of file | 18 | test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/oschina.net.txt b/inc/3rdparty/site_config/standard/oschina.net.txt new file mode 100755 index 00000000..56451539 --- /dev/null +++ b/inc/3rdparty/site_config/standard/oschina.net.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1 | ||
2 | strip_id_or_class: syntaxhighlighter | ||
3 | test_url: http://www.oschina.net/translate/event-based-programming-what-async-has-over-sync?print \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt b/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt index f03c9551..7e2985e0 100644..100755 --- a/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt +++ b/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | 1 | #body: (//div[@class='ftr-yt-vid'])[1] |
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | 2 | body: (//blockquote[contains(@class, 'postcontent')])[1] |
3 | body: (//div[starts-with(@id, 'post_message')])[1] | 3 | body: (//div[starts-with(@id, 'post_message')])[1] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | 7 | ||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | 8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" |
9 | #replace_string(</iframe>): </iframe> </div> | 9 | #replace_string(</iframe>): </iframe> </div> |
10 | 10 | ||
11 | test_url: http://pakistantvdekho.com/showthread.php?647741-Sitam-Gar-by-HUM-TV-Episode-07&p=659080#post659080 \ No newline at end of file | 11 | test_url: http://pakistantvdekho.com/showthread.php?647741-Sitam-Gar-by-HUM-TV-Episode-07&p=659080#post659080 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pakmedia.tv.txt b/inc/3rdparty/site_config/standard/pakmedia.tv.txt new file mode 100755 index 00000000..5d6e4c8c --- /dev/null +++ b/inc/3rdparty/site_config/standard/pakmedia.tv.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | body: //article//div[@class='entry'] | ||
3 | strip_id_or_class: addthis | ||
4 | strip_id_or_class: gdsrcacheloader | ||
5 | strip_id_or_class: entry-meta | ||
6 | strip_id_or_class: entry-tags | ||
7 | strip_id_or_class: authorbox | ||
8 | strip: //div[@class='entry']/p[1] | ||
9 | strip: //img[@width='600' and @height='70'] | ||
10 | # related posts | ||
11 | strip: //h3[contains(., 'Related posts')] | ||
12 | strip: //div[contains(@style, 'border: 0pt none ; margin: 0pt; padding: 0pt;')] | ||
13 | |||
14 | prune: no | ||
15 | tidy: no | ||
16 | |||
17 | test_url: http://pakmedia.tv/tv-one/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pandagon.net.txt b/inc/3rdparty/site_config/standard/pandagon.net.txt index d0d2a5d0..35121e14 100644..100755 --- a/inc/3rdparty/site_config/standard/pandagon.net.txt +++ b/inc/3rdparty/site_config/standard/pandagon.net.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title://h2 | 1 | title://h2 |
2 | author://div[@class="posted"]/a | 2 | author://div[@class="posted"]/a |
3 | date://div[@class="date"] | 3 | date://div[@class="date"] |
4 | body://div[@class="entry"] | 4 | body://div[@class="entry"] |
5 | test_url: http://pandagon.net/index.php/site/its-okay-to-admit-that-mass-hysteria-is-real \ No newline at end of file | 5 | test_url: http://pandagon.net/index.php/site/its-okay-to-admit-that-mass-hysteria-is-real \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pandodaily.com.txt b/inc/3rdparty/site_config/standard/pandodaily.com.txt index 7d1c2183..a5d427af 100644..100755 --- a/inc/3rdparty/site_config/standard/pandodaily.com.txt +++ b/inc/3rdparty/site_config/standard/pandodaily.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | body: //article | 2 | body: //article |
3 | date: //time/@datetime | 3 | date: //time/@datetime |
4 | strip_id_or_class: sharedaddy | 4 | strip_id_or_class: sharedaddy |
5 | test_url: http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/ \ No newline at end of file | 5 | test_url: http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/panic.com.txt b/inc/3rdparty/site_config/standard/panic.com.txt index 0361f06d..e0e2595c 100644..100755 --- a/inc/3rdparty/site_config/standard/panic.com.txt +++ b/inc/3rdparty/site_config/standard/panic.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='entry'] | 1 | body: //div[@class='entry'] |
2 | date: //h3[@class='postDate'] | 2 | date: //h3[@class='postDate'] |
3 | test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/ \ No newline at end of file | 3 | test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/papodehomem.com.br.txt b/inc/3rdparty/site_config/standard/papodehomem.com.br.txt new file mode 100755 index 00000000..2c522da4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/papodehomem.com.br.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2[@class="page_title"] | ||
2 | body: //div[@class="entry arquivo"] | ||
3 | author: //span[@class="author"] | ||
4 | footnotes: yes | ||
5 | prune: yes | ||
6 | test_url: http://papodehomem.com.br/um-relato-confessional-sobre-a-maioridade-penal/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/parislemon.com.txt b/inc/3rdparty/site_config/standard/parislemon.com.txt index a3bd4b0f..cd9bd55d 100644..100755 --- a/inc/3rdparty/site_config/standard/parislemon.com.txt +++ b/inc/3rdparty/site_config/standard/parislemon.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2[@class="post-title"] | 1 | title: //h2[@class="post-title"] |
2 | author: substring-after(//div[@class="description"],'Words by ') | 2 | author: substring-after(//div[@class="description"],'Words by ') |
3 | date: //li[@class="date"] | 3 | date: //li[@class="date"] |
4 | strip: //h2[@class="post-title"] | 4 | strip: //h2[@class="post-title"] |
5 | body: //div[@class="copy"] | 5 | body: //div[@class="copy"] |
6 | test_url: http://parislemon.com/post/13462682469/the-15-inch-air \ No newline at end of file | 6 | test_url: http://parislemon.com/post/13462682469/the-15-inch-air \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/parliament.uk.txt b/inc/3rdparty/site_config/standard/parliament.uk.txt index 478a669f..caaa2e94 100644..100755 --- a/inc/3rdparty/site_config/standard/parliament.uk.txt +++ b/inc/3rdparty/site_config/standard/parliament.uk.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='news-article'] | 2 | body: //div[@id='news-article'] |
3 | test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/ \ No newline at end of file | 3 | test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pastebin.com.txt b/inc/3rdparty/site_config/standard/pastebin.com.txt index 89d13b2a..03b67b7e 100644..100755 --- a/inc/3rdparty/site_config/standard/pastebin.com.txt +++ b/inc/3rdparty/site_config/standard/pastebin.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title://div[@class="paste_box_line1"]/h1 | 1 | title://div[@class="paste_box_line1"]/h1 |
2 | author://div[@class="paste_box_line2"]/a | 2 | author://div[@class="paste_box_line2"]/a |
3 | body://div[@class="text"] | 3 | body://div[@class="text"] |
4 | date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|') | 4 | date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|') |
5 | dissolve://li | 5 | dissolve://li |
6 | test_url: http://pastebin.com/LAykd1es \ No newline at end of file | 6 | test_url: http://pastebin.com/LAykd1es \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt b/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt index 40a049e0..c535158d 100644..100755 --- a/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt +++ b/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='ff-pastepad-content'] | 2 | body: //div[@id='ff-pastepad-content'] |
3 | prune: no | 3 | prune: no |
4 | # todo: add test file | 4 | # todo: add test file |
5 | test_url: http://pastepad.fivefilters.org/test.html \ No newline at end of file | 5 | test_url: http://pastepad.fivefilters.org/test.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pathawks.com.txt b/inc/3rdparty/site_config/standard/pathawks.com.txt index 1a4cd25b..25042224 100644..100755 --- a/inc/3rdparty/site_config/standard/pathawks.com.txt +++ b/inc/3rdparty/site_config/standard/pathawks.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title://*[contains(@class,'post-title')] | 1 | title://*[contains(@class,'post-title')] |
2 | body://div[contains(@class,'post-body')] | 2 | body://div[contains(@class,'post-body')] |
3 | body://div[contains(@class,'entry-content')] | 3 | body://div[contains(@class,'entry-content')] |
4 | strip_comments:no | 4 | strip_comments:no |
5 | prune:no | 5 | prune:no |
6 | convert_double_br_tags:yes | 6 | convert_double_br_tags:yes |
7 | tidy:yes | 7 | tidy:yes |
8 | test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html \ No newline at end of file | 8 | test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pcast.me.txt b/inc/3rdparty/site_config/standard/pcast.me.txt index ae38e8e1..ae38e8e1 100644..100755 --- a/inc/3rdparty/site_config/standard/pcast.me.txt +++ b/inc/3rdparty/site_config/standard/pcast.me.txt | |||
diff --git a/inc/3rdparty/site_config/standard/pcmag.com.txt b/inc/3rdparty/site_config/standard/pcmag.com.txt index cebea4d7..96bdd95a 100644..100755 --- a/inc/3rdparty/site_config/standard/pcmag.com.txt +++ b/inc/3rdparty/site_config/standard/pcmag.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | prune:yes | 1 | prune:yes |
2 | 2 | ||
3 | date://*[contains(@class,'date')] | 3 | date://*[contains(@class,'date')] |
4 | 4 | ||
5 | body://div[contains(@id,'content')] | 5 | body://div[contains(@id,'content')] |
6 | 6 | ||
7 | next_page_link://a[contains(.,'Next >')] | 7 | next_page_link://a[contains(.,'Next >')] |
8 | 8 | ||
9 | strip_id_or_class:sponsors | 9 | strip_id_or_class:sponsors |
10 | test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp \ No newline at end of file | 10 | test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pcworld.com.txt b/inc/3rdparty/site_config/standard/pcworld.com.txt index 30ccbb5f..7193f87e 100644..100755 --- a/inc/3rdparty/site_config/standard/pcworld.com.txt +++ b/inc/3rdparty/site_config/standard/pcworld.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //div[@class='articleHead']//h1 | 1 | title: //div[@class='articleHead']//h1 |
2 | author: //div[@class="author-name"]/a[1] | 2 | author: //div[@class="author-name"]/a[1] |
3 | body: //div[@class="main"] | 3 | body: //div[@class="main"] |
4 | 4 | ||
5 | # remove 'From the Lab' and 'Recent posts' text | 5 | # remove 'From the Lab' and 'Recent posts' text |
6 | strip: //div[@class='blogLabel'] | 6 | strip: //div[@class='blogLabel'] |
7 | 7 | ||
8 | # remove byline and meta info | 8 | # remove byline and meta info |
9 | strip: //h1 | 9 | strip: //h1 |
10 | strip: //div[@class="article-meta"] | 10 | strip: //div[@class="article-meta"] |
11 | strip: //div[@class="author-info"] | 11 | strip: //div[@class="author-info"] |
12 | 12 | ||
13 | #strip tags and categories | 13 | #strip tags and categories |
14 | strip: //div[@class="department"] | 14 | strip: //div[@class="department"] |
15 | 15 | ||
16 | #strip product cap links | 16 | #strip product cap links |
17 | strip: //div[@class="cap-main"] | 17 | strip: //div[@class="cap-main"] |
18 | strip: //div[@id="compare-lede"] | 18 | strip: //div[@id="compare-lede"] |
19 | test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html \ No newline at end of file | 19 | test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/penny-arcade.com.txt b/inc/3rdparty/site_config/standard/penny-arcade.com.txt index f97615f1..a0d5099e 100644..100755 --- a/inc/3rdparty/site_config/standard/penny-arcade.com.txt +++ b/inc/3rdparty/site_config/standard/penny-arcade.com.txt | |||
@@ -1,23 +1,23 @@ | |||
1 | # 2012-01-14 carlo@... - fixed title, body; added author, date | 1 | # 2012-01-14 carlo@... - fixed title, body; added author, date |
2 | 2 | ||
3 | title: //div[@class="title"]/h2/a | 3 | title: //div[@class="title"]/h2/a |
4 | # body: //div[@class="post"] | 4 | # body: //div[@class="post"] |
5 | # author: //p[@class="iconEmail"]/a | 5 | # author: //p[@class="iconEmail"]/a |
6 | # date: //p[@class="iconDate"] | 6 | # date: //p[@class="iconDate"] |
7 | 7 | ||
8 | # 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report | 8 | # 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report |
9 | 9 | ||
10 | # Penny Arcade | 10 | # Penny Arcade |
11 | 11 | ||
12 | author: //li[@class="iconEmail"]/a | 12 | author: //li[@class="iconEmail"]/a |
13 | date: //li[@class="iconDate"] | 13 | date: //li[@class="iconDate"] |
14 | body: //div[@class="body"] | 14 | body: //div[@class="body"] |
15 | 15 | ||
16 | # PA Report | 16 | # PA Report |
17 | 17 | ||
18 | author: //div[@class="meta"]/p/a | 18 | author: //div[@class="meta"]/p/a |
19 | date: substring-after(//div[@class="meta"]/p, '/ ') | 19 | date: substring-after(//div[@class="meta"]/p, '/ ') |
20 | title: substring-after(//title, '- ') | 20 | title: substring-after(//title, '- ') |
21 | 21 | ||
22 | test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news | 22 | test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news |
23 | test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech \ No newline at end of file | 23 | test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pentaxforums.com.txt b/inc/3rdparty/site_config/standard/pentaxforums.com.txt index 00f61a48..00f61a48 100644..100755 --- a/inc/3rdparty/site_config/standard/pentaxforums.com.txt +++ b/inc/3rdparty/site_config/standard/pentaxforums.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt b/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt index a369fd65..5ba5f772 100644..100755 --- a/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt +++ b/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | prune: no | 1 | prune: no |
2 | tidy: no | 2 | tidy: no |
3 | body: //div[@class='article-content'] | 3 | body: //div[@class='article-content'] |
4 | dissolve: //nobr/a | 4 | dissolve: //nobr/a |
5 | dissolve: //nobr | 5 | dissolve: //nobr |
6 | test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7 \ No newline at end of file | 6 | test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/philly.com.txt b/inc/3rdparty/site_config/standard/philly.com.txt index 41318f63..accbd60b 100644..100755 --- a/inc/3rdparty/site_config/standard/philly.com.txt +++ b/inc/3rdparty/site_config/standard/philly.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //h1[@class='entry-title'] | 1 | title: //h1[@class='entry-title'] |
2 | author: //p[@class='byline']/span | 2 | author: //p[@class='byline']/span |
3 | body: //*[@id='body-content'] | 3 | body: //*[@id='body-content'] |
4 | date: //div[@class='article_timestamp']/span | 4 | date: //div[@class='article_timestamp']/span |
5 | 5 | ||
6 | strip: //*[@class='b-group'] | 6 | strip: //*[@class='b-group'] |
7 | strip: //*[contains(@style, 'none')] | 7 | strip: //*[contains(@style, 'none')] |
8 | strip: //a[contains(@href, 'comments')] | 8 | strip: //a[contains(@href, 'comments')] |
9 | strip: //*[contains(@class, 'comment')] | 9 | strip: //*[contains(@class, 'comment')] |
10 | test_url: http://www.philly.com/philly/sports/eagles/20120127_Ohio_State_s_Posey_didn_t_waste_time_lost_to_suspension.html \ No newline at end of file | 10 | test_url: http://www.philly.com/philly/sports/eagles/20120127_Ohio_State_s_Posey_didn_t_waste_time_lost_to_suspension.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt b/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt index 4e2ccb01..7f7e3830 100644..100755 --- a/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt +++ b/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | author: substring-before(//div[@class='post_meta'],' on') | 1 | author: substring-before(//div[@class='post_meta'],' on') |
2 | date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on') | 2 | date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on') |
3 | title: //h1[@class='post_title'] | 3 | title: //h1[@class='post_title'] |
4 | body: //div[@class='article'] | 4 | body: //div[@class='article'] |
5 | 5 | ||
6 | test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/ \ No newline at end of file | 6 | test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/php.net.txt b/inc/3rdparty/site_config/standard/php.net.txt index 7c57a84d..cc643f05 100644..100755 --- a/inc/3rdparty/site_config/standard/php.net.txt +++ b/inc/3rdparty/site_config/standard/php.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | strip_id_or_class: manualnavbar | 2 | strip_id_or_class: manualnavbar |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://www.php.net/manual/en/migration5.incompatible.php \ No newline at end of file | 6 | test_url: http://www.php.net/manual/en/migration5.incompatible.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/physicstoday.org.txt b/inc/3rdparty/site_config/standard/physicstoday.org.txt index a8163995..624055b7 100644..100755 --- a/inc/3rdparty/site_config/standard/physicstoday.org.txt +++ b/inc/3rdparty/site_config/standard/physicstoday.org.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //div[@class='abstitle']//h1 | 1 | title: //div[@class='abstitle']//h1 |
2 | author: //div[@class='authorList'] | 2 | author: //div[@class='authorList'] |
3 | body: //div[@id='fulltext_body'] | 3 | body: //div[@id='fulltext_body'] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.physicstoday.org/resource/1/phtoad/v64/i10/p48_s1?bypassSSO=1 \ No newline at end of file | 7 | test_url: http://www.physicstoday.org/resource/1/phtoad/v64/i10/p48_s1?bypassSSO=1 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pinterest.com.txt b/inc/3rdparty/site_config/standard/pinterest.com.txt new file mode 100755 index 00000000..01b6df41 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pinterest.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //title | ||
2 | body: //div[contains(@class, 'imageContainer')] | ||
3 | |||
4 | test_url: http://pinterest.com/pin/380906080954441188/ | ||
5 | test_url: http://pinterest.com/michaelsorm/architecture/rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pitchfork.com.txt b/inc/3rdparty/site_config/standard/pitchfork.com.txt index 3decc538..eee96a9c 100644..100755 --- a/inc/3rdparty/site_config/standard/pitchfork.com.txt +++ b/inc/3rdparty/site_config/standard/pitchfork.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title:concat(//h1,' - ',//h2,' - ',//h3) | 1 | title:concat(//h1,' - ',//h2,' - ',//h3) |
2 | author://address | 2 | author://address |
3 | date://span[@class='pub-date'] | 3 | date://span[@class='pub-date'] |
4 | body://div[@id='main'] | 4 | body://div[@id='main'] |
5 | single_page_link://link[@rel='canonical'] | 5 | single_page_link://link[@rel='canonical'] |
6 | strip://div[@class='info'] | 6 | strip://div[@class='info'] |
7 | strip_id_or_class:'object-grid related-content' | 7 | strip_id_or_class:'object-grid related-content' |
8 | strip_id_or_class:'object-prevnext' | 8 | strip_id_or_class:'object-prevnext' |
9 | strip_id_or_class:'object-header' | 9 | strip_id_or_class:'object-header' |
10 | strip_id_or_class:'source' | 10 | strip_id_or_class:'source' |
11 | strip_id_or_class:'label' | 11 | strip_id_or_class:'label' |
12 | strip_id_or_class:'title' | 12 | strip_id_or_class:'title' |
13 | dissolve://ul | 13 | dissolve://ul |
14 | strip://li[@class='next'] | 14 | strip://li[@class='next'] |
15 | strip://li[@class='prev'] | 15 | strip://li[@class='prev'] |
16 | test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/ \ No newline at end of file | 16 | test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittnews.com.txt b/inc/3rdparty/site_config/standard/pittnews.com.txt index 92777073..c302526d 100644..100755 --- a/inc/3rdparty/site_config/standard/pittnews.com.txt +++ b/inc/3rdparty/site_config/standard/pittnews.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h2[@class='post-title'] | 1 | title: //h2[@class='post-title'] |
2 | author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/') | 2 | author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/') |
3 | date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in') | 3 | date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in') |
4 | strip: //h2[@class='post-title'] | 4 | strip: //h2[@class='post-title'] |
5 | strip: //p[@class='post-details'] | 5 | strip: //p[@class='post-details'] |
6 | strip: //h3[@class='post-byline'] | 6 | strip: //h3[@class='post-byline'] |
7 | body: //div[@id='content'] | 7 | body: //div[@id='content'] |
8 | test_url: http://pittnews.com/newsstory/mens-basketball-pitt-recruit-robinson-to-bring-leadership/ \ No newline at end of file | 8 | test_url: http://pittnews.com/newsstory/mens-basketball-pitt-recruit-robinson-to-bring-leadership/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt b/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt index 824cb064..f2948528 100644..100755 --- a/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt +++ b/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: substring-before(//title,'pirates.com') | 1 | title: substring-before(//title,'pirates.com') |
2 | date: //span[@class='timeStamp'] | 2 | date: //span[@class='timeStamp'] |
3 | author: substring-before(substring-after(//div[@class='byLine'],'By'),'/') | 3 | author: substring-before(substring-after(//div[@class='byLine'],'By'),'/') |
4 | body: //div[@id='article'] | 4 | body: //div[@id='article'] |
5 | #strip: //div[@class='inner'] | 5 | #strip: //div[@class='inner'] |
6 | strip: //div[@id='article_head'] | 6 | strip: //div[@id='article_head'] |
7 | strip: //p[@class='tagLine'] | 7 | strip: //p[@class='tagLine'] |
8 | strip: //div[@id='article_related_links'] | 8 | strip: //div[@id='article_related_links'] |
9 | strip: //div[@id='article_related_mlb'] | 9 | strip: //div[@id='article_related_mlb'] |
10 | strip: //div[@id='article_related_club'] | 10 | strip: //div[@id='article_related_club'] |
11 | strip: //span[@class='more'] | 11 | strip: //span[@class='more'] |
12 | strip: //div[@class='article_component'] | 12 | strip: //div[@class='article_component'] |
13 | strip: //span[@class='screen_reader'] | 13 | strip: //span[@class='screen_reader'] |
14 | strip: //ul[@class='columnists_blurb'] | 14 | strip: //ul[@class='columnists_blurb'] |
15 | test_url: http://pittsburgh.pirates.mlb.com/news/article.jsp?ymd=20120330&content_id=27759040&vkey=news_pit&c_id=pit \ No newline at end of file | 15 | test_url: http://pittsburgh.pirates.mlb.com/news/article.jsp?ymd=20120330&content_id=27759040&vkey=news_pit&c_id=pit \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittsburghlive.com.txt b/inc/3rdparty/site_config/standard/pittsburghlive.com.txt index b3e66166..cc7891f3 100644..100755 --- a/inc/3rdparty/site_config/standard/pittsburghlive.com.txt +++ b/inc/3rdparty/site_config/standard/pittsburghlive.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: substring-before(//title,'- Pittsburgh Tribune') | 1 | title: substring-before(//title,'- Pittsburgh Tribune') |
2 | author: substring-before(substring-after(//div[@class='byline'],'By '),',') | 2 | author: substring-before(substring-after(//div[@class='byline'],'By '),',') |
3 | date: substring-after(substring-after(//div[@class='byline'],','),',') | 3 | date: substring-after(substring-after(//div[@class='byline'],','),',') |
4 | body: //div[@id='storyBody'] | 4 | body: //div[@id='storyBody'] |
5 | strip: //div[@class='morestories'] | 5 | strip: //div[@class='morestories'] |
6 | dissolve: //p[@class='subheader'] | 6 | dissolve: //p[@class='subheader'] |
7 | test_url: http://www.pittsburghlive.com/x/pittsburghtrib/sports/columnists/s_785654.html \ No newline at end of file | 7 | test_url: http://www.pittsburghlive.com/x/pittsburghtrib/sports/columnists/s_785654.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt b/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt index dd715d8f..4d02f6bb 100644..100755 --- a/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt +++ b/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //title | 1 | title: //title |
2 | author: substring-after(//div[@class='by-line'],'BY') | 2 | author: substring-after(//div[@class='by-line'],'BY') |
3 | 3 | ||
4 | body: //div[@id='article-body'] | 4 | body: //div[@id='article-body'] |
5 | 5 | ||
6 | strip: //div[@class='by-line'] | 6 | strip: //div[@class='by-line'] |
7 | strip: //div[@id='article-body']/h1 | 7 | strip: //div[@id='article-body']/h1 |
8 | test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/ \ No newline at end of file | 8 | test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt b/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt index 6113b96e..c372284a 100644..100755 --- a/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt +++ b/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //span[@class='StoryHeadline'] | 1 | title: //span[@class='StoryHeadline'] |
2 | strip: //div[@class='fivevert'] | 2 | strip: //div[@class='fivevert'] |
3 | body: //div[@id='Content'] | 3 | body: //div[@id='Content'] |
4 | test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html \ No newline at end of file | 4 | test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pittscriptblog.com.txt b/inc/3rdparty/site_config/standard/pittscriptblog.com.txt index 3936310d..571874a4 100644..100755 --- a/inc/3rdparty/site_config/standard/pittscriptblog.com.txt +++ b/inc/3rdparty/site_config/standard/pittscriptblog.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@class='articletitle'] | 1 | title: //h1[@class='articletitle'] |
2 | author: substring-after(//span[@class='author'],'by') | 2 | author: substring-after(//span[@class='author'],'by') |
3 | date: //span[@class='created'] | 3 | date: //span[@class='created'] |
4 | body: //div[@class='article'] | 4 | body: //div[@class='article'] |
5 | strip: //div[@class='headline'] | 5 | strip: //div[@class='headline'] |
6 | strip: //p[@class='articleinfo'] | 6 | strip: //p[@class='articleinfo'] |
7 | #dissolve: //p[@class='subheader'] | 7 | #dissolve: //p[@class='subheader'] |
8 | test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html \ No newline at end of file | 8 | test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/planetvita.de.txt b/inc/3rdparty/site_config/standard/planetvita.de.txt new file mode 100755 index 00000000..bfc3342d --- /dev/null +++ b/inc/3rdparty/site_config/standard/planetvita.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@id='frnRahmen']/div/div[@id='content']/div[2]/h2 | ||
2 | author: //div[@id='content']/div[1]/div/a | ||
3 | body: //div[@id='content']/div[2]/span | ||
4 | strip: //div[@id='commenthead'] | ||
5 | test_url: http://www.planetvita.de/news/10389-psn-store-update-vom-03-april-neue-inhalte-fuer-psvita.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/playboy.com.txt b/inc/3rdparty/site_config/standard/playboy.com.txt index 07b347a0..92834947 100644..100755 --- a/inc/3rdparty/site_config/standard/playboy.com.txt +++ b/inc/3rdparty/site_config/standard/playboy.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | author: //article//*[@class="author"] | 1 | author: //article//*[@class="author"] |
2 | date: //article//*[@class="publication-date"] | 2 | date: //article//*[@class="publication-date"] |
3 | body: //article | 3 | body: //article |
4 | strip: //article/header | 4 | strip: //article/header |
5 | strip: //article/section | 5 | strip: //article/section |
6 | test_url: http://www.playboy.com/playground/view/playboy-interview-jon-hamm \ No newline at end of file | 6 | test_url: http://www.playboy.com/playground/view/playboy-interview-jon-hamm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/plus.google.com.txt b/inc/3rdparty/site_config/standard/plus.google.com.txt index 50a5dbf5..4a7ea126 100644..100755 --- a/inc/3rdparty/site_config/standard/plus.google.com.txt +++ b/inc/3rdparty/site_config/standard/plus.google.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | body: //div[@id='contentPane']//div[@class='vg'] | 1 | body: //div[@id='contentPane']//div[@class='vg'] |
2 | body: //div[@id='contentPane'] | 2 | body: //div[@id='contentPane'] |
3 | 3 | ||
4 | # Grab the author by finding the first profile pic, then backing up a node and getting the title of <a> tag which will be the author hopefully. Sorry can't test this due to parser errors, thanks google :( | 4 | # Grab the author by finding the first profile pic, then backing up a node and getting the title of <a> tag which will be the author hopefully. Sorry can't test this due to parser errors, thanks google :( |
5 | 5 | ||
6 | author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title | 6 | author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title |
7 | 7 | ||
8 | 8 | ||
9 | strip: //*[@title="People who +1'd this"]/../.. | 9 | strip: //*[@title="People who +1'd this"]/../.. |
10 | strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')] | 10 | strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')] |
11 | strip: //*[@role='menu'] | 11 | strip: //*[@role='menu'] |
12 | strip: //img[contains(@alt, 'profile photo')] | 12 | strip: //img[contains(@alt, 'profile photo')] |
13 | strip: //*[@class='a-f-i-Ad'] | 13 | strip: //*[@class='a-f-i-Ad'] |
14 | 14 | ||
15 | tidy: no | 15 | tidy: no |
16 | 16 | ||
17 | test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp \ No newline at end of file | 17 | test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/plzkthxbai.com.txt b/inc/3rdparty/site_config/standard/plzkthxbai.com.txt index bb9be0a9..ec151b42 100644..100755 --- a/inc/3rdparty/site_config/standard/plzkthxbai.com.txt +++ b/inc/3rdparty/site_config/standard/plzkthxbai.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h2[@class='jcw-pagetitle'] | 1 | title: //h2[@class='jcw-pagetitle'] |
2 | date: //p[@class='postinfo'] | 2 | date: //p[@class='postinfo'] |
3 | body: //div[@class='contenttext'] | 3 | body: //div[@class='contenttext'] |
4 | test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/ \ No newline at end of file | 4 | test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt b/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt index 880311d3..65ddba54 100644..100755 --- a/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt +++ b/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id="content"]/div[1] | 1 | body: //div[@id="content"]/div[1] |
2 | 2 | ||
3 | title: //h1[@class="entry-title"] | 3 | title: //h1[@class="entry-title"] |
4 | test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/ \ No newline at end of file | 4 | test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt index c5302d1b..d8f5e575 100755 --- a/inc/3rdparty/site_config/standard/politico.com.txt +++ b/inc/3rdparty/site_config/standard/politico.com.txt | |||
@@ -1,17 +1,13 @@ | |||
1 | title://div[contains(@class, "article")]/h1 | 1 | title://div[contains(@class, "article")]/h1 |
2 | body://div[contains(@class,"story-text")] | 2 | body://div[contains(@class,"story-text")] |
3 | 3 | ||
4 | # Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] | 4 | # Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] |
5 | 5 | ||
6 | next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a | 6 | next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a |
7 | next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a | 7 | date://meta[@name="publish_date"]/@content |
8 | date://meta[@name="publish_date"]/@content | 8 | |
9 | 9 | strip://div[contains(@class, "breadcrumbs")] | |
10 | strip://div[contains(@class, "breadcrumbs")] | 10 | strip://a[contains(@class, "hidden")] |
11 | strip://a[contains(@class, "hidden")] | 11 | strip://div[contains(@class, "story-embed")] |
12 | strip://div[contains(@class, "story-embed")] | ||
13 | strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. | 12 | strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. |
14 | strip://div[contains(@class, "story-interrupt")] | ||
15 | strip://footer[contains(@class, "author-bio")] | ||
16 | |||
17 | test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file | 13 | test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/politifact.com.txt b/inc/3rdparty/site_config/standard/politifact.com.txt index fd247b5b..65a8fc57 100644..100755 --- a/inc/3rdparty/site_config/standard/politifact.com.txt +++ b/inc/3rdparty/site_config/standard/politifact.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id="content"] | 1 | body: //div[@id="content"] |
2 | 2 | ||
3 | strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"] | 3 | strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"] |
4 | test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/ \ No newline at end of file | 4 | test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/politiken.dk.txt b/inc/3rdparty/site_config/standard/politiken.dk.txt index 8deecbca..b13f8f87 100644..100755 --- a/inc/3rdparty/site_config/standard/politiken.dk.txt +++ b/inc/3rdparty/site_config/standard/politiken.dk.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | # 21/10-2011: | 1 | # 21/10-2011: |
2 | # Added Author+Date | 2 | # Added Author+Date |
3 | # Remove fakta-boks if found | 3 | # Remove fakta-boks if found |
4 | # Deleted 'Læs også...' filter | 4 | # Deleted 'Læs også...' filter |
5 | # - Change in markup caused it to strip too much. | 5 | # - Change in markup caused it to strip too much. |
6 | 6 | ||
7 | author://span[@class='autor-name'] | 7 | author://span[@class='autor-name'] |
8 | date:substring-after(//div[@class='art-created'], ' ') | 8 | date:substring-after(//div[@class='art-created'], ' ') |
9 | title: //h1[contains(@class, 'stor-type')] | 9 | title: //h1[contains(@class, 'stor-type')] |
10 | body: //div[@id='art-body'] | 10 | body: //div[@id='art-body'] |
11 | strip: //div[@class='art-fakta article-box'] | 11 | strip: //div[@class='art-fakta article-box'] |
12 | 12 | ||
13 | test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/ \ No newline at end of file | 13 | test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/polygon.com.txt b/inc/3rdparty/site_config/standard/polygon.com.txt new file mode 100755 index 00000000..8fe9b1be --- /dev/null +++ b/inc/3rdparty/site_config/standard/polygon.com.txt | |||
@@ -0,0 +1,34 @@ | |||
1 | body: //div[@id='article-content'] | ||
2 | body: //article[@id='entry-top']/div[@class='float_wrapper'] | ||
3 | author: //header/p[@class='byline']/em/a | ||
4 | date: //header/p[@class='byline']/span[@class='timestamp'] | ||
5 | |||
6 | strip: //div[@id='article-content']//header | ||
7 | strip: //label | ||
8 | |||
9 | #photos on left column (delete all) | ||
10 | strip: //div[@class='big_photo'] | ||
11 | |||
12 | #photos on left column (remove extras used for scroll effect) | ||
13 | #strip: //div[@class='big_photo']/div[./img] | ||
14 | #strip: //div[@class='big_photo']/img[position()>1] | ||
15 | |||
16 | strip_id_or_class: vox-lazy-load | ||
17 | strip_id_or_class: social_buttons | ||
18 | strip_id_or_class: feature_toc | ||
19 | |||
20 | prune: no | ||
21 | |||
22 | find_string: <noscript> | ||
23 | replace_string: <div> | ||
24 | find_string: </noscript> | ||
25 | replace_string: </div> | ||
26 | |||
27 | #find_string: <script | ||
28 | #replace_string: <div style="display:none" | ||
29 | #find_string: </script> | ||
30 | #replace_string: </div> | ||
31 | |||
32 | strip: //div[@class='float_wrapper']/header | ||
33 | test_url: http://www.polygon.com/2013/4/5/4189028/donkey-kong-country-returns-3d-new-content | ||
34 | test_url: http://www.polygon.com/features/2013/8/22/4602568/30-years-xbox-360-playstation-3-wii \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/popularmechanics.com.txt b/inc/3rdparty/site_config/standard/popularmechanics.com.txt index 85b7656b..2582e6fb 100644..100755 --- a/inc/3rdparty/site_config/standard/popularmechanics.com.txt +++ b/inc/3rdparty/site_config/standard/popularmechanics.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | next_page_link: //div[@id='longPagination']/a[@class='next'] | 1 | next_page_link: //div[@id='longPagination']/a[@class='next'] |
2 | 2 | ||
3 | title: //div[@id='contentHeader']//h1 | 3 | title: //div[@id='contentHeader']//h1 |
4 | 4 | ||
5 | body: //div[@id='articleBody'] | 5 | body: //div[@id='articleBody'] |
6 | # this is so sad | 6 | # this is so sad |
7 | body: //div[@id='intelliTXT'] | 7 | body: //div[@id='intelliTXT'] |
8 | test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877 \ No newline at end of file | 8 | test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/portertech.ca.txt b/inc/3rdparty/site_config/standard/portertech.ca.txt new file mode 100755 index 00000000..2897cb57 --- /dev/null +++ b/inc/3rdparty/site_config/standard/portertech.ca.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: //*[(@class = "author")] | ||
2 | date: //*[(@class = "date")] | ||
3 | test_url: http://portertech.ca/2012/12/10/iac-morning-market/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/positioningmag.com.txt b/inc/3rdparty/site_config/standard/positioningmag.com.txt index 21cd833c..f8eeb0a3 100644..100755 --- a/inc/3rdparty/site_config/standard/positioningmag.com.txt +++ b/inc/3rdparty/site_config/standard/positioningmag.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //div[@id="newsDetailTitle"] | 1 | title: //div[@id="newsDetailTitle"] |
2 | author: //span[@id="showAuthor"] | 2 | author: //span[@id="showAuthor"] |
3 | date: //span[@id="showRefDate"] | 3 | date: //span[@id="showRefDate"] |
4 | 4 | ||
5 | strip: //div[@id="breadcrumbs"] | 5 | strip: //div[@id="breadcrumbs"] |
6 | strip: //span[@id="PageTitle"] | 6 | strip: //span[@id="PageTitle"] |
7 | strip: //div[@id="newsDetailAuthorPublish"] | 7 | strip: //div[@id="newsDetailAuthorPublish"] |
8 | 8 | ||
9 | strip: //div[@class="leadPix"] | 9 | strip: //div[@class="leadPix"] |
10 | 10 | ||
11 | strip: //span[@id="ctl00_PageTitle"] | 11 | strip: //span[@id="ctl00_PageTitle"] |
12 | strip: //div[@id="newsDetailTitle"] | 12 | strip: //div[@id="newsDetailTitle"] |
13 | convert_double_br_tags:yes | 13 | convert_double_br_tags:yes |
14 | 14 | ||
15 | strip: //div[@id="newsDetailCredential"] | 15 | strip: //div[@id="newsDetailCredential"] |
16 | strip: //div[@id="sidebar2"] | 16 | strip: //div[@id="sidebar2"] |
17 | strip: //div[@id="footer"] | 17 | strip: //div[@id="footer"] |
18 | 18 | ||
19 | test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083 \ No newline at end of file | 19 | test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/post-gazette.com.txt b/inc/3rdparty/site_config/standard/post-gazette.com.txt index 1ea945a0..baa9d69d 100644..100755 --- a/inc/3rdparty/site_config/standard/post-gazette.com.txt +++ b/inc/3rdparty/site_config/standard/post-gazette.com.txt | |||
@@ -1,26 +1,26 @@ | |||
1 | title: //div[@class='story_headline'] | 1 | title: //div[@class='story_headline'] |
2 | author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/') | 2 | author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/') |
3 | date: //div[@class='story_lastupdate'] | 3 | date: //div[@class='story_lastupdate'] |
4 | body: //div[@id='story'] | 4 | body: //div[@id='story'] |
5 | strip: //div[@class='story_byline'] | 5 | strip: //div[@class='story_byline'] |
6 | strip: //div[@class='story_lastupdate'] | 6 | strip: //div[@class='story_lastupdate'] |
7 | strip: //div[@class='story_headline'] | 7 | strip: //div[@class='story_headline'] |
8 | strip: //div[@id='abuse'] | 8 | strip: //div[@id='abuse'] |
9 | strip: //h2 | 9 | strip: //h2 |
10 | strip: //div[@class='pagenumbers_wrap'] | 10 | strip: //div[@class='pagenumbers_wrap'] |
11 | strip: //ul[@class='pagenumbers'] | 11 | strip: //ul[@class='pagenumbers'] |
12 | strip: //div[starts-with(., 'To report inappropriate comments')] | 12 | strip: //div[starts-with(., 'To report inappropriate comments')] |
13 | 13 | ||
14 | strip_id_or_class: story_share | 14 | strip_id_or_class: story_share |
15 | strip_id_or_class: OUTBRAIN | 15 | strip_id_or_class: OUTBRAIN |
16 | strip_id_or_class: story_box_right | 16 | strip_id_or_class: story_box_right |
17 | strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']] | 17 | strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']] |
18 | strip: //ul[@id='pikame']/li[position()>1] | 18 | strip: //ul[@id='pikame']/li[position()>1] |
19 | 19 | ||
20 | prune: no | 20 | prune: no |
21 | tidy: no | 21 | tidy: no |
22 | 22 | ||
23 | single_page_link: //a[contains(@href, '?p=0')] | 23 | single_page_link: //a[contains(@href, '?p=0')] |
24 | 24 | ||
25 | test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/ | 25 | test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/ |
26 | test_url: http://www.post-gazette.com/stories/sports/pirates/pirates-fork-over-changes-for-fans-at-pnc-park-629789 \ No newline at end of file | 26 | test_url: http://www.post-gazette.com/stories/sports/pirates/pirates-fork-over-changes-for-fans-at-pnc-park-629789 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/posta.com.tr.txt b/inc/3rdparty/site_config/standard/posta.com.tr.txt index 86cb5d0b..0f01149c 100644..100755 --- a/inc/3rdparty/site_config/standard/posta.com.tr.txt +++ b/inc/3rdparty/site_config/standard/posta.com.tr.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: //div[@id='divAdnetKeyword']/h1 | 1 | title: //div[@id='divAdnetKeyword']/h1 |
2 | body: //div[@id='_middle_content_bottom'] | 2 | body: //div[@id='_middle_content_bottom'] |
3 | 3 | ||
4 | wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img | 4 | wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img |
5 | 5 | ||
6 | strip: //div[@id='_middle_content_bottom_child1'] | 6 | strip: //div[@id='_middle_content_bottom_child1'] |
7 | strip: //div[@id='_middle_content_bottom_child4'] | 7 | strip: //div[@id='_middle_content_bottom_child4'] |
8 | strip: //div[@class='cls'] | 8 | strip: //div[@class='cls'] |
9 | strip: //div[@class='iphoneBox'] | 9 | strip: //div[@class='iphoneBox'] |
10 | strip: //ul[@class='ilgiliHaber'] | 10 | strip: //ul[@class='ilgiliHaber'] |
11 | strip: //div[@class='yorumlar'] | 11 | strip: //div[@class='yorumlar'] |
12 | strip: //div[@class='kategoriler'] | 12 | strip: //div[@class='kategoriler'] |
13 | strip: //div[@class='textSize'] | 13 | strip: //div[@class='textSize'] |
14 | strip: //span[@class='tarih'] | 14 | strip: //span[@class='tarih'] |
15 | test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044 \ No newline at end of file | 15 | test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/prb.org.txt b/inc/3rdparty/site_config/standard/prb.org.txt index 7f7a5031..3952ea99 100644..100755 --- a/inc/3rdparty/site_config/standard/prb.org.txt +++ b/inc/3rdparty/site_config/standard/prb.org.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | date: /html/head/meta[@name="date"]/@content | 2 | date: /html/head/meta[@name="date"]/@content |
3 | body: //div[@id="featuredlinksbox"] | 3 | body: //div[@id="featuredlinksbox"] |
4 | strip: //div[@class="relatedbox"] | 4 | strip: //div[@class="relatedbox"] |
5 | strip: //h1 | 5 | strip: //h1 |
6 | strip: //br | 6 | strip: //br |
7 | strip_image_src: "/images" | 7 | strip_image_src: "/images" |
8 | test_url: http://www.prb.org/Journalists/Webcasts/2011/military-families.aspx \ No newline at end of file | 8 | test_url: http://www.prb.org/Journalists/Webcasts/2011/military-families.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt b/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt index 906c27a0..9a49557e 100644..100755 --- a/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt +++ b/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='left'] | 2 | body: //div[@id='left'] |
3 | strip: //h1 | 3 | strip: //h1 |
4 | convert_double_br_tags: yes | 4 | convert_double_br_tags: yes |
5 | strip_id_or_class: entry-footer | 5 | strip_id_or_class: entry-footer |
6 | strip: //h1[. = 'Previously']/following::* | 6 | strip: //h1[. = 'Previously']/following::* |
7 | author: string('James Hague') | 7 | author: string('James Hague') |
8 | date: //div[@class = 'entry-footer']/text() | 8 | date: //div[@class = 'entry-footer']/text() |
9 | test_url: http://prog21.dadgum.com/105.html \ No newline at end of file | 9 | test_url: http://prog21.dadgum.com/105.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/prolost.com.txt b/inc/3rdparty/site_config/standard/prolost.com.txt index cef811d4..82ebf6bb 100644..100755 --- a/inc/3rdparty/site_config/standard/prolost.com.txt +++ b/inc/3rdparty/site_config/standard/prolost.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class='body'] | 1 | body: //div[@class='body'] |
2 | title: //h2[@class='title'] | 2 | title: //h2[@class='title'] |
3 | date: //span[@class='posted-on'] | 3 | date: //span[@class='posted-on'] |
4 | test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html \ No newline at end of file | 4 | test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/propublica.org.txt b/inc/3rdparty/site_config/standard/propublica.org.txt index 11e63bd0..d141ac90 100644..100755 --- a/inc/3rdparty/site_config/standard/propublica.org.txt +++ b/inc/3rdparty/site_config/standard/propublica.org.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1[@class="article-title"] | 1 | title: //h1[@class="article-title"] |
2 | author: //meta[@name="author"]/@content | 2 | author: //meta[@name="author"]/@content |
3 | body: //div[@class="article-full"] | 3 | body: //div[@class="article-full"] |
4 | strip_id_or_class: sidebar_inject | 4 | strip_id_or_class: sidebar_inject |
5 | strip_id_or_class: callout | 5 | strip_id_or_class: callout |
6 | strip_id_or_class: content-inset | 6 | strip_id_or_class: content-inset |
7 | strip_id_or_class: byline-block | 7 | strip_id_or_class: byline-block |
8 | strip_id_or_class: photo-caption | 8 | strip_id_or_class: photo-caption |
9 | strip_id_or_class: foot-tools | 9 | strip_id_or_class: foot-tools |
10 | 10 | ||
11 | test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places \ No newline at end of file | 11 | test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/prosa.dk.txt b/inc/3rdparty/site_config/standard/prosa.dk.txt index dedd33d3..ba9ce8b8 100644..100755 --- a/inc/3rdparty/site_config/standard/prosa.dk.txt +++ b/inc/3rdparty/site_config/standard/prosa.dk.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author: //p[@class='name'] | 1 | author: //p[@class='name'] |
2 | date: substring-before(//p[@class='date'], ' | ') | 2 | date: substring-before(//p[@class='date'], ' | ') |
3 | body: //div[@class='news_single_item'] | 3 | body: //div[@class='news_single_item'] |
4 | test_url: http://www.prosa.dk/aktuelt/nyhed/artikel/internetaktivisten-uden-maske/ \ No newline at end of file | 4 | test_url: http://www.prosa.dk/aktuelt/nyhed/artikel/internetaktivisten-uden-maske/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt b/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt index 19059c4a..739d1b9e 100644..100755 --- a/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt +++ b/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt | |||
@@ -1,26 +1,26 @@ | |||
1 | #basics | 1 | #basics |
2 | author: (//div[contains(@class,'author')])[1] | 2 | author: (//div[contains(@class,'author')])[1] |
3 | date: substring-before(//a[@class='issue'], '—') | 3 | date: substring-before(//a[@class='issue'], '—') |
4 | #body://div[@class = 'entry'] | 4 | #body://div[@class = 'entry'] |
5 | # use this until move_into support is ready | 5 | # use this until move_into support is ready |
6 | body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image'] | 6 | body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image'] |
7 | 7 | ||
8 | #moves header image and tagline into body | 8 | #moves header image and tagline into body |
9 | move_into(//div[@class='entry']/div)://div[@class = 'lead_image'] | 9 | move_into(//div[@class='entry']/div)://div[@class = 'lead_image'] |
10 | move_into(//div[@class='entry']/div)://div[@class = 'standfirst'] | 10 | move_into(//div[@class='entry']/div)://div[@class = 'standfirst'] |
11 | 11 | ||
12 | 12 | ||
13 | # moves author info to end of text | 13 | # moves author info to end of text |
14 | move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em | 14 | move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em |
15 | 15 | ||
16 | prune: no | 16 | prune: no |
17 | 17 | ||
18 | # strips social links | 18 | # strips social links |
19 | strip_id_or_class:login-status | 19 | strip_id_or_class:login-status |
20 | strip_id_or_class:shareinpost | 20 | strip_id_or_class:shareinpost |
21 | strip_id_or_class:content_subscribe | 21 | strip_id_or_class:content_subscribe |
22 | strip_id_or_class:postinfo | 22 | strip_id_or_class:postinfo |
23 | strip_id_or_class:postutils | 23 | strip_id_or_class:postutils |
24 | strip_id_or_class:comments | 24 | strip_id_or_class:comments |
25 | strip://strong[string(.) = 'Follow Prospect on Twitter'] | 25 | strip://strong[string(.) = 'Follow Prospect on Twitter'] |
26 | test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/ \ No newline at end of file | 26 | test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/protothema.gr.txt b/inc/3rdparty/site_config/standard/protothema.gr.txt new file mode 100755 index 00000000..fae261b0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/protothema.gr.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //a[contains(@rel, 'mainphotos')] | //div[contains(@class, 'article-content')] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.protothema.gr//politics/article/326464/diamadopoulou-floridis-kaminis-kai-boutaris-se-ekdilosi-ton-europaion-fileleutheron/ | ||
6 | test_url: http://www.protothema.gr/rss/news/politics/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/psychologytoday.com.txt b/inc/3rdparty/site_config/standard/psychologytoday.com.txt index 3da3cea3..1bb63c29 100644..100755 --- a/inc/3rdparty/site_config/standard/psychologytoday.com.txt +++ b/inc/3rdparty/site_config/standard/psychologytoday.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@class="page-title"]/h1 | 1 | title: //div[@class="page-title"]/h1 |
2 | author: //a[@title="View Bio"] | 2 | author: //a[@title="View Bio"] |
3 | date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by') | 3 | date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by') |
4 | strip://div[@class="page-title"]/h1 | 4 | strip://div[@class="page-title"]/h1 |
5 | strip://div[@class="article-abstract"] | 5 | strip://div[@class="article-abstract"] |
6 | strip://div[@class="article-meta"] | 6 | strip://div[@class="article-meta"] |
7 | strip://div[@id="rightColumn"] | 7 | strip://div[@id="rightColumn"] |
8 | strip://div[@id="inline-content-bottom-left"] | 8 | strip://div[@id="inline-content-bottom-left"] |
9 | test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook \ No newline at end of file | 9 | test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/publications.parliament.uk.txt b/inc/3rdparty/site_config/standard/publications.parliament.uk.txt index fa099473..8f32d7a4 100644..100755 --- a/inc/3rdparty/site_config/standard/publications.parliament.uk.txt +++ b/inc/3rdparty/site_config/standard/publications.parliament.uk.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author: //meta[@name="Author"] | 1 | author: //meta[@name="Author"] |
2 | date: //meta[@name="Date"] | 2 | date: //meta[@name="Date"] |
3 | strip: //h5 | 3 | strip: //h5 |
4 | test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm \ No newline at end of file | 4 | test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/publico.pt.txt b/inc/3rdparty/site_config/standard/publico.pt.txt new file mode 100755 index 00000000..bb6a05e1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/publico.pt.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | author: //span[@class="author"] | ||
3 | body: //article[@itemtype="http://schema.org/Article"] | ||
4 | date: //time[@itemprop="dateCreated"] | ||
5 | |||
6 | strip: //header[@class="entry-header single-header"] | ||
7 | strip: //aside[@class="entry-assets"] | ||
8 | strip: //div[@class="entry-options entry-options-above group"] | ||
9 | strip: //div[@class="entry-options entry-options-below group"] | ||
10 | |||
11 | convert_double_br_tags: yes | ||
12 | test_url: http://www.publico.pt/politica/noticia/passos-diz-que-se-limitacao-de-mandatos-fosse-para-todos-os-concelhos-estaria-claro-na-lei-1577691 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt b/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt index 126f9e27..0f1392a4 100644..100755 --- a/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt +++ b/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //div[@class='title'] | 1 | title: //div[@class='title'] |
2 | body: //div[@class='body'] | 2 | body: //div[@class='body'] |
3 | next_page_link: //div[@class='source']/text()[contains(., 'page')]/following-sibling::a | 3 | next_page_link: //div[@class='source']/text()[contains(., 'page')]/following-sibling::a |
4 | test_url: http://purpleplanetmedia.com/eye/inte/ngaiman.php \ No newline at end of file | 4 | test_url: http://purpleplanetmedia.com/eye/inte/ngaiman.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/qctimes.com.txt b/inc/3rdparty/site_config/standard/qctimes.com.txt new file mode 100755 index 00000000..3c3edfeb --- /dev/null +++ b/inc/3rdparty/site_config/standard/qctimes.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | # this site seems to work OK in the web view, but only occasionally in the instapaper app itself. | ||
2 | |||
3 | body: //div[@class='entry-content'] | ||
4 | author: //span[@class='byline'] | ||
5 | test_url: http://qctimes.com/news/local/woman-faces-perjury-charges-in-meth-case/article_83f4c470-956a-11e2-a921-001a4bcf887a.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/quantumdiaries.org.txt b/inc/3rdparty/site_config/standard/quantumdiaries.org.txt index a366c1b3..c17fb312 100644..100755 --- a/inc/3rdparty/site_config/standard/quantumdiaries.org.txt +++ b/inc/3rdparty/site_config/standard/quantumdiaries.org.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //div[contains(@class, "hentry")]/h3 | 1 | title: //div[contains(@class, "hentry")]/h3 |
2 | 2 | ||
3 | author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")] | 3 | author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")] |
4 | 4 | ||
5 | date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under") | 5 | date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under") |
6 | 6 | ||
7 | body: //div[contains(@class, "entry")] | 7 | body: //div[contains(@class, "entry")] |
8 | 8 | ||
9 | strip_id_or_class: addtoany_share_save_container | 9 | strip_id_or_class: addtoany_share_save_container |
10 | strip_id_or_class: postmetadata | 10 | strip_id_or_class: postmetadata |
11 | strip_id_or_class: author_bio | 11 | strip_id_or_class: author_bio |
12 | strip_id_or_class: author_bio_2 | 12 | strip_id_or_class: author_bio_2 |
13 | strip: //div[contains(@class, "hentry")]/h3 | 13 | strip: //div[contains(@class, "hentry")]/h3 |
14 | test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/ \ No newline at end of file | 14 | test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/queerty.com.txt b/inc/3rdparty/site_config/standard/queerty.com.txt index 655f8b80..fc7ab37f 100644..100755 --- a/inc/3rdparty/site_config/standard/queerty.com.txt +++ b/inc/3rdparty/site_config/standard/queerty.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='copy'] | 1 | body: //div[@class='copy'] |
2 | title: //h1[@class='hed'] | 2 | title: //h1[@class='hed'] |
3 | test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/ \ No newline at end of file | 3 | test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/quepasa.cl.txt b/inc/3rdparty/site_config/standard/quepasa.cl.txt index fae4e6a3..fb09a8f3 100644..100755 --- a/inc/3rdparty/site_config/standard/quepasa.cl.txt +++ b/inc/3rdparty/site_config/standard/quepasa.cl.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | body: //div[@class="cuerpoArticulo"] | 3 | body: //div[@class="cuerpoArticulo"] |
4 | 4 | ||
5 | 5 | ||
6 | test_url: http://www.quepasa.cl/magazine/articulo/print.html?id=5299 \ No newline at end of file | 6 | test_url: http://www.quepasa.cl/magazine/articulo/print.html?id=5299 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/quora.com.txt b/inc/3rdparty/site_config/standard/quora.com.txt index 3d34f2f8..732d12d7 100644..100755 --- a/inc/3rdparty/site_config/standard/quora.com.txt +++ b/inc/3rdparty/site_config/standard/quora.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | prune: no | 2 | prune: no |
3 | body: //div[contains(@class, 'main_col')] | 3 | body: //div[contains(@class, 'main_col')] |
4 | title: //h1 | 4 | title: //h1 |
5 | 5 | ||
6 | strip_id_or_class: hidden | 6 | strip_id_or_class: hidden |
7 | strip_id_or_class: item_action_bar | 7 | strip_id_or_class: item_action_bar |
8 | strip_id_or_class: answer_voters | 8 | strip_id_or_class: answer_voters |
9 | strip_id_or_class: question_topics | 9 | strip_id_or_class: question_topics |
10 | strip_id_or_class: answer_header_text | 10 | strip_id_or_class: answer_header_text |
11 | strip_id_or_class: editor_link | 11 | strip_id_or_class: editor_link |
12 | strip_id_or_class: view_tag | 12 | strip_id_or_class: view_tag |
13 | strip_id_or_class: include_details | 13 | strip_id_or_class: include_details |
14 | strip_id_or_class: sig_edit | 14 | strip_id_or_class: sig_edit |
15 | strip_id_or_class: profile_photo_img | 15 | strip_id_or_class: profile_photo_img |
16 | 16 | ||
17 | test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life \ No newline at end of file | 17 | test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/racjonalista.pl.txt b/inc/3rdparty/site_config/standard/racjonalista.pl.txt new file mode 100755 index 00000000..19c719d4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/racjonalista.pl.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: /html/body/center/b | ||
2 | date: /html/body/table/tr[2]/td/i | ||
3 | single_page_link: //*[@id='oTxt']/table[3]/tr[2]/td/a[1] | ||
4 | |||
5 | test_url: http://www.racjonalista.pl/kk.php/s,7214/q,Geneza.szubrawstwa \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/radar.oreilly.com.txt b/inc/3rdparty/site_config/standard/radar.oreilly.com.txt index 99ab4bb1..fa66b815 100644..100755 --- a/inc/3rdparty/site_config/standard/radar.oreilly.com.txt +++ b/inc/3rdparty/site_config/standard/radar.oreilly.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | date://span[@class='date'] | 1 | date://span[@class='date'] |
2 | body://div[@class='entry-body'] | 2 | body://div[@class='entry-body'] |
3 | test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html \ No newline at end of file | 3 | test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/radionz.co.nz.txt b/inc/3rdparty/site_config/standard/radionz.co.nz.txt index e2617dc5..2496ddab 100644..100755 --- a/inc/3rdparty/site_config/standard/radionz.co.nz.txt +++ b/inc/3rdparty/site_config/standard/radionz.co.nz.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='body'] | 1 | body: //div[@class='body'] |
2 | title: //div[@class='newsstory']/h2 | 2 | title: //div[@class='newsstory']/h2 |
3 | test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d \ No newline at end of file | 3 | test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/randsinrepose.com.txt b/inc/3rdparty/site_config/standard/randsinrepose.com.txt index f0c91c51..6970a744 100644..100755 --- a/inc/3rdparty/site_config/standard/randsinrepose.com.txt +++ b/inc/3rdparty/site_config/standard/randsinrepose.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //div[@id='center-col']/h4 | 1 | title: //div[@id='center-col']/h4 |
2 | author: substring-before(//title,'In') | 2 | author: substring-before(//title,'In') |
3 | date: substring-after(//div[@class='commenttext']/span,'#') | 3 | date: substring-after(//div[@class='commenttext']/span,'#') |
4 | body: //div[@id='center-col'] | 4 | body: //div[@id='center-col'] |
5 | strip: //div[@id='center-col']/h4 | 5 | strip: //div[@id='center-col']/h4 |
6 | strip: //div[@class='graytext'] | 6 | strip: //div[@class='graytext'] |
7 | 7 | ||
8 | # Anthony Perez-Sanz 2012.3.14 | 8 | # Anthony Perez-Sanz 2012.3.14 |
9 | # Removed long gif from the end | 9 | # Removed long gif from the end |
10 | strip: //img[@src='http://www.randsinrepose.com/spreader.gif'] | 10 | strip: //img[@src='http://www.randsinrepose.com/spreader.gif'] |
11 | test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html \ No newline at end of file | 11 | test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/readability.com.txt b/inc/3rdparty/site_config/standard/readability.com.txt index 80337291..2d5aba76 100644..100755 --- a/inc/3rdparty/site_config/standard/readability.com.txt +++ b/inc/3rdparty/site_config/standard/readability.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //link[@rel='canonical']/@href | 1 | single_page_link: //link[@rel='canonical']/@href |
2 | 2 | ||
3 | test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler \ No newline at end of file | 3 | test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/readwriteweb.com.txt b/inc/3rdparty/site_config/standard/readwriteweb.com.txt index ff799aa0..e2aabda9 100644..100755 --- a/inc/3rdparty/site_config/standard/readwriteweb.com.txt +++ b/inc/3rdparty/site_config/standard/readwriteweb.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1[@class="titlelink"] | 1 | title: //h1[@class="titlelink"] |
2 | date: //span[@class="timestamp"]/@data-published | 2 | date: //span[@class="timestamp"]/@data-published |
3 | body: //div[@class="asset-content"] | 3 | body: //div[@class="asset-content"] |
4 | strip_id_or_class: related-entries | 4 | strip_id_or_class: related-entries |
5 | strip_id_or_class: like-and-retweet | 5 | strip_id_or_class: like-and-retweet |
6 | 6 | ||
7 | author: //div[@id="submeta"]/a[1] | 7 | author: //div[@id="submeta"]/a[1] |
8 | test_url: http://www.readwriteweb.com/archives/why_facebook_terrifies_google.php \ No newline at end of file | 8 | test_url: http://www.readwriteweb.com/archives/why_facebook_terrifies_google.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/real.gr.txt b/inc/3rdparty/site_config/standard/real.gr.txt index fe5ab672..1a33610d 100644..100755 --- a/inc/3rdparty/site_config/standard/real.gr.txt +++ b/inc/3rdparty/site_config/standard/real.gr.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='_ctl12__ctl0_Article'] | 1 | body: //div[@id='_ctl12__ctl0_Article'] |
2 | prune: no | 2 | prune: no |
3 | autodetect_on_failure: no \ No newline at end of file | 3 | autodetect_on_failure: no \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/recipe.com.txt b/inc/3rdparty/site_config/standard/recipe.com.txt index 8c8f0e0c..a01aaef4 100644..100755 --- a/inc/3rdparty/site_config/standard/recipe.com.txt +++ b/inc/3rdparty/site_config/standard/recipe.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients'] | 1 | body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients'] |
2 | 2 | ||
3 | strip_id_or_class: location | 3 | strip_id_or_class: location |
4 | strip_id_or_class: savings | 4 | strip_id_or_class: savings |
5 | strip_id_or_class: recipeDetailDescButton | 5 | strip_id_or_class: recipeDetailDescButton |
6 | 6 | ||
7 | prune: no | 7 | prune: no |
8 | tidy: no | 8 | tidy: no |
9 | 9 | ||
10 | test_url: http://www.recipe.com/avocado-basil-pasta/ \ No newline at end of file | 10 | test_url: http://www.recipe.com/avocado-basil-pasta/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/red-hot-girls.com.txt b/inc/3rdparty/site_config/standard/red-hot-girls.com.txt index 3ae959b1..0403ee86 100644..100755 --- a/inc/3rdparty/site_config/standard/red-hot-girls.com.txt +++ b/inc/3rdparty/site_config/standard/red-hot-girls.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='short-text' or starts-with(@id, 'news-id-')] | 1 | body: //div[@class='short-text' or starts-with(@id, 'news-id-')] |
2 | prune: no | 2 | prune: no |
3 | tidy: no | 3 | tidy: no |
4 | 4 | ||
5 | test_url: http://red-hot-girls.com/2011/06/10/the_red_hot_natalia_maria_53_pics.html \ No newline at end of file | 5 | test_url: http://red-hot-girls.com/2011/06/10/the_red_hot_natalia_maria_53_pics.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/reddit.com.txt b/inc/3rdparty/site_config/standard/reddit.com.txt index 58ca9ece..8871f564 100644..100755 --- a/inc/3rdparty/site_config/standard/reddit.com.txt +++ b/inc/3rdparty/site_config/standard/reddit.com.txt | |||
@@ -1,16 +1,20 @@ | |||
1 | # This setup grabs the text from a Reddit self post. It ignores all comments etc. | 1 | # This setup grabs the text from a Reddit self post. It ignores all comments etc. |
2 | 2 | ||
3 | title: //p[@class="title"]/a/text() | 3 | title: //p[@class="title"]/a/text() |
4 | 4 | ||
5 | author: //p[@class="tagline"]/a | 5 | author: //p[@class="tagline"]/a |
6 | 6 | ||
7 | # this doesn't work for some reason...? | 7 | # this doesn't work for some reason...? |
8 | date: //p[@class="tagline"]//@datetime | 8 | date: //p[@class="tagline"]//@datetime |
9 | 9 | ||
10 | body: //div[@class="expando"]//div[@class="usertext-body"] | 10 | body: //div[@class="expando"]//div[@class="usertext-body"] |
11 | 11 | ||
12 | strip_id_or_class: tagline | 12 | strip_id_or_class: tagline |
13 | strip_id_or_class: unvotable-message | 13 | strip_id_or_class: unvotable-message |
14 | strip_id_or_class: buttons | 14 | strip_id_or_class: buttons |
15 | 15 | ||
16 | test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/ \ No newline at end of file | 16 | # follow the posted link (unless it's a self post - relative URL, no http://) |
17 | single_page_link: //p[@class="title"]/a[contains(@href, 'http://')] | ||
18 | |||
19 | test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/ | ||
20 | test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/redmondpie.com.txt b/inc/3rdparty/site_config/standard/redmondpie.com.txt index 12a96187..66cc1707 100644..100755 --- a/inc/3rdparty/site_config/standard/redmondpie.com.txt +++ b/inc/3rdparty/site_config/standard/redmondpie.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //div[@class='posthead']//h2 | 1 | title: //div[@class='posthead']//h2 |
2 | body: //div[contains(@class, 'postcontent') or @class='posthead'] | 2 | body: //div[contains(@class, 'postcontent') or @class='posthead'] |
3 | author: //div[@class='posthead']//a[@rel='author'] | 3 | author: //div[@class='posthead']//a[@rel='author'] |
4 | 4 | ||
5 | strip: //div[@class='posthead']//h2 | 5 | strip: //div[@class='posthead']//h2 |
6 | replace_string(>Advertisements</div>): ></div> | 6 | replace_string(>Advertisements</div>): ></div> |
7 | replace_string(<p>You can follow us on): <p style="display:none;"> | 7 | replace_string(<p>You can follow us on): <p style="display:none;"> |
8 | strip_id_or_class: likeThisPost | 8 | strip_id_or_class: likeThisPost |
9 | 9 | ||
10 | prune: no | 10 | prune: no |
11 | tidy: no | 11 | tidy: no |
12 | 12 | ||
13 | test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/ \ No newline at end of file | 13 | test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt b/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt index 4f195a06..8541a0d4 100644..100755 --- a/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt +++ b/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | # Think there might be something up with your parser that it strips out 'print' from the title :) | 1 | # Think there might be something up with your parser that it strips out 'print' from the title :) |
2 | 2 | ||
3 | title: //meta[@name='title']/@content | 3 | title: //meta[@name='title']/@content |
4 | author: //meta[@name='author']/@content | 4 | author: //meta[@name='author']/@content |
5 | date: //meta[@name='date']/@content | 5 | date: //meta[@name='date']/@content |
6 | 6 | ||
7 | body: //div[@class='articleText'] | 7 | body: //div[@class='articleText'] |
8 | 8 | ||
9 | strip: //div[contains(@class, 'day')] | 9 | strip: //div[contains(@class, 'day')] |
10 | strip: //div[contains(@class, 'month')] | 10 | strip: //div[contains(@class, 'month')] |
11 | strip: //div[contains(@class, 'year')] | 11 | strip: //div[contains(@class, 'year')] |
12 | strip: //div[contains(@class, 'time')] | 12 | strip: //div[contains(@class, 'time')] |
13 | strip: //h1[@class='gl_headline'] | 13 | strip: //h1[@class='gl_headline'] |
14 | strip: //div[@class='byline'] | 14 | strip: //div[@class='byline'] |
15 | strip: //div[@id='left_ear'] | 15 | strip: //div[@id='left_ear'] |
16 | strip: //div[@id='right_ear'] | 16 | strip: //div[@id='right_ear'] |
17 | strip: //div[contains(@class, 'PopularPosts')] | 17 | strip: //div[contains(@class, 'PopularPosts')] |
18 | strip ://div[@class='discuss_page_break'] | 18 | strip ://div[@class='discuss_page_break'] |
19 | strip ://div[contains(@class, 'p-content_TagList')] | 19 | strip ://div[contains(@class, 'p-content_TagList')] |
20 | test_url: http://redtape.msnbc.msn.com/_news/2011/09/28/8020661-sprint-raises-fee-but-wont-free-users-from-two-year-contracts?preview=true \ No newline at end of file | 20 | test_url: http://redtape.msnbc.msn.com/_news/2011/09/28/8020661-sprint-raises-fee-but-wont-free-users-from-two-year-contracts?preview=true \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/reflets.info.txt b/inc/3rdparty/site_config/standard/reflets.info.txt index 4a9fab67..98a2bbfc 100644..100755 --- a/inc/3rdparty/site_config/standard/reflets.info.txt +++ b/inc/3rdparty/site_config/standard/reflets.info.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body://div[@class='storycontent'] | 1 | body://div[@class='storycontent'] |
2 | date://div[@class='date'] | 2 | date://div[@class='date'] |
3 | strip://li[@class='sharing_label'] | 3 | strip://li[@class='sharing_label'] |
4 | strip://a[@class='FlattrButton'] | 4 | strip://a[@class='FlattrButton'] |
5 | test_url: http://reflets.info/orange-nokia-siemens-deep-packet-inspection/ \ No newline at end of file | 5 | test_url: http://reflets.info/orange-nokia-siemens-deep-packet-inspection/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/renenekuda.cz.txt b/inc/3rdparty/site_config/standard/renenekuda.cz.txt index 0b3dee1d..a5361fd0 100644..100755 --- a/inc/3rdparty/site_config/standard/renenekuda.cz.txt +++ b/inc/3rdparty/site_config/standard/renenekuda.cz.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //*[@class='entry-title'] | 1 | title: //*[@class='entry-title'] |
2 | body: //div[@class='entry-content'] | 2 | body: //div[@class='entry-content'] |
3 | test_url: http://www.renenekuda.cz/recept-na-produktivitu/ \ No newline at end of file | 3 | test_url: http://www.renenekuda.cz/recept-na-produktivitu/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/resume.se.txt b/inc/3rdparty/site_config/standard/resume.se.txt new file mode 100755 index 00000000..17122a9b --- /dev/null +++ b/inc/3rdparty/site_config/standard/resume.se.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | date: //meta[@name='bi3dPubDate']/@content | ||
2 | body: //div[contains(@class, 'articleBody')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.resume.se/nyheter/media/2013/09/18/kvallspress-och-tv-slass-om-playtittarna-men-youtube-ohotat-storst/ | ||
7 | test_url: http://www.resume.se/nyheter/media/2013/09/18/cecilia-blankens-lamnar-mama-for-konkurrent/ | ||
8 | test_url: http://www.resume.se/nyheter/reklam/2013/09/18/ravelli-trodde-jag-var-med-i-blasningen/ | ||
9 | test_url: http://www.resume.se/rss-nyheter \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/retrieverweekly.com.txt b/inc/3rdparty/site_config/standard/retrieverweekly.com.txt index 1264ee3f..a0a23940 100644..100755 --- a/inc/3rdparty/site_config/standard/retrieverweekly.com.txt +++ b/inc/3rdparty/site_config/standard/retrieverweekly.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | single_page_link://a[contains(@href, 'print')] | 1 | single_page_link://a[contains(@href, 'print')] |
2 | 2 | ||
3 | # Grab metadata from the "printer-friendly" page, after specifying single_page_link | 3 | # Grab metadata from the "printer-friendly" page, after specifying single_page_link |
4 | title://h2 | 4 | title://h2 |
5 | date://cite | 5 | date://cite |
6 | test_url: http://www.retrieverweekly.com/?cmd=displaystory&story_id=7548&format=html \ No newline at end of file | 6 | test_url: http://www.retrieverweekly.com/?cmd=displaystory&story_id=7548&format=html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/reuters.com.txt b/inc/3rdparty/site_config/standard/reuters.com.txt index c5c94a4f..7411e62b 100644..100755 --- a/inc/3rdparty/site_config/standard/reuters.com.txt +++ b/inc/3rdparty/site_config/standard/reuters.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //h1[@class='headline3'] | 1 | title: //h1[@class='headline3'] |
2 | author: substring-after(//p[@class="byline"], 'By ') | 2 | author: substring-after(//p[@class="byline"], 'By ') |
3 | date: //meta[@name="REVISION_DATE"]/@content | 3 | date: //meta[@name="REVISION_DATE"]/@content |
4 | body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation'] | 4 | body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation'] |
5 | strip: //li[@class='next'] | 5 | strip: //li[@class='next'] |
6 | strip: //span[@class='articleLocation'] | 6 | strip: //span[@class='articleLocation'] |
7 | prune: no | 7 | prune: no |
8 | tidy: no | 8 | tidy: no |
9 | 9 | ||
10 | test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408 \ No newline at end of file | 10 | test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt b/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt index dbe42932..30e627dc 100644..100755 --- a/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt +++ b/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //div[@class="article_header"]/h3 | 1 | title: //div[@class="article_header"]/h3 |
2 | author: //div[@class="autor"]/p/* | 2 | author: //div[@class="autor"]/p/* |
3 | date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ") | 3 | date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ") |
4 | 4 | ||
5 | move_into(//div[@class="new_article"]): //div[@class="img_article"]/img | 5 | move_into(//div[@class="new_article"]): //div[@class="img_article"]/img |
6 | 6 | ||
7 | body: //div[@class="article_content"] | 7 | body: //div[@class="article_content"] |
8 | convert_double_br_tags: yes | 8 | convert_double_br_tags: yes |
9 | 9 | ||
10 | test_url: http://revistapiaui.estadao.com.br/edicao-68/questoes-latino-americanas/filhos-da-guerra-suja \ No newline at end of file | 10 | test_url: http://revistapiaui.estadao.com.br/edicao-68/questoes-latino-americanas/filhos-da-guerra-suja \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rezeptwelt.de.txt b/inc/3rdparty/site_config/standard/rezeptwelt.de.txt new file mode 100644 index 00000000..2093573b --- /dev/null +++ b/inc/3rdparty/site_config/standard/rezeptwelt.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='step-content'] | //div[@class='global-active ingredients-box'] | ||
2 | title: //div[@class='step-1-container'] | ||
3 | |||
4 | tidy: no | ||
5 | test_url: http://www.rezeptwelt.de/backen-herzhaft-rezepte/w%C3%BCrstchen-schlangen/530372 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt b/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt index 904a11dd..b0ee92dc 100644..100755 --- a/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt +++ b/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@id="post"] | 1 | body: //div[@id="post"] |
2 | strip: //div[@id="author-description"] | 2 | strip: //div[@id="author-description"] |
3 | date: //span[@class="entry-date"] | 3 | date: //span[@class="entry-date"] |
4 | author: //span[@class="author vcard"] | 4 | author: //span[@class="author vcard"] |
5 | test_url: http://richardmuscat.wordpress.com/2011/06/20/the-price-of-free/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+TheBrooksReview+%28The+Brooks+Review%29 \ No newline at end of file | 5 | test_url: http://richardmuscat.wordpress.com/2011/06/20/the-price-of-free/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+TheBrooksReview+%28The+Brooks+Review%29 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt b/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt index 82cfaf27..ed72915c 100644..100755 --- a/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt +++ b/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='post-body entry-content'] | 1 | body: //div[@class='post-body entry-content'] |
2 | strip: //div[@id='lws_0'] | 2 | strip: //div[@id='lws_0'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://ritemail.blogspot.com/2011/06/hayden-panettiere-candids-in-los.html \ No newline at end of file | 5 | test_url: http://ritemail.blogspot.com/2011/06/hayden-panettiere-candids-in-los.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ritholtz.com.txt b/inc/3rdparty/site_config/standard/ritholtz.com.txt new file mode 100755 index 00000000..d598479e --- /dev/null +++ b/inc/3rdparty/site_config/standard/ritholtz.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='post']/h2 | ||
2 | author: substring-before(substring-after(//div[@class='alignright']/small, 'By '),'-') | ||
3 | date: substring-after(//div[@class='alignright']/small, '-') | ||
4 | strip: //div[@class='alignleft'] | ||
5 | test_url: http://www.ritholtz.com/blog/2012/09/situational-awareness/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/robertsspaceindustries.com.txt b/inc/3rdparty/site_config/standard/robertsspaceindustries.com.txt new file mode 100755 index 00000000..b0b90fb7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/robertsspaceindustries.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | strip_id_or_class: 'sharedaddy' | ||
2 | strip_id_or_class: 'respond' | ||
3 | strip_id_or_class: 'meta' | ||
4 | test_url: http://www.robertsspaceindustries.com/news-update-ai-pilots/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/robots.thoughtbot.com.txt b/inc/3rdparty/site_config/standard/robots.thoughtbot.com.txt new file mode 100755 index 00000000..da5b7bd8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/robots.thoughtbot.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //section[@class='post text'] | ||
2 | title: //h1[@class='title'] | ||
3 | date: //p[@class='post-date'] | ||
4 | strip: //section[@class='meta-info'] | ||
5 | test_url: http://robots.thoughtbot.com/post/32455387133/four-phase-test \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt b/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt index 3035527c..f8c9541f 100644..100755 --- a/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt +++ b/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | 2 | ||
3 | strip: //div[ contains(@class, 'respond') ] | //h2 | //h1 | 3 | strip: //div[ contains(@class, 'respond') ] | //h2 | //h1 |
4 | 4 | ||
5 | date: substring-after(//p[@class='info'], ' on ') | 5 | date: substring-after(//p[@class='info'], ' on ') |
6 | 6 | ||
7 | author: //p[@class='info']//a | 7 | author: //p[@class='info']//a |
8 | test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/ \ No newline at end of file | 8 | test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt b/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt index abe70351..eef8b11c 100644..100755 --- a/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt +++ b/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | author: //article/header/span[@class='author'] | 1 | author: //article/header/span[@class='author'] |
2 | title://article/header/h1 | 2 | title://article/header/h1 |
3 | body: //article | 3 | body: //article |
4 | strip: //article/header | 4 | strip: //article/header |
5 | strip: //article/p[@class='metadata'] | 5 | strip: //article/p[@class='metadata'] |
6 | footnotes: yes | 6 | footnotes: yes |
7 | test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/ \ No newline at end of file | 7 | test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rogerebert.com.txt b/inc/3rdparty/site_config/standard/rogerebert.com.txt index 26792330..da215109 100644..100755 --- a/inc/3rdparty/site_config/standard/rogerebert.com.txt +++ b/inc/3rdparty/site_config/standard/rogerebert.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: substring-before(//title,':') | 1 | title: substring-before(//title,':') |
2 | author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY') | 2 | author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY') |
3 | 3 | ||
4 | body: //div[@class='text'] | 4 | body: //div[@class='text'] |
5 | 5 | ||
6 | strip: //a[contains(@href,'printart')] | 6 | strip: //a[contains(@href,'printart')] |
7 | strip_id_or_class: enlarge_photo | 7 | strip_id_or_class: enlarge_photo |
8 | test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY \ No newline at end of file | 8 | test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt b/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt index d618c23f..2365c42a 100644..100755 --- a/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt +++ b/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[contains(@class, 'inhoud')] | 1 | body: //div[contains(@class, 'inhoud')] |
2 | date: //span[@class ='published'] | 2 | date: //span[@class ='published'] |
3 | author: //span[@class ='author'] | 3 | author: //span[@class ='author'] |
4 | strip: //div[@class = 'grid_2'] | 4 | strip: //div[@class = 'grid_2'] |
5 | strip: //div[@class = 'block-citation-text'] | 5 | strip: //div[@class = 'block-citation-text'] |
6 | test_url: http://www.rolfinjapan.nl/2011/06/duizend-kraanvogels/ \ No newline at end of file | 6 | test_url: http://www.rolfinjapan.nl/2011/06/duizend-kraanvogels/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rollingstone.com.txt b/inc/3rdparty/site_config/standard/rollingstone.com.txt index 9a10a69e..9a10a69e 100644..100755 --- a/inc/3rdparty/site_config/standard/rollingstone.com.txt +++ b/inc/3rdparty/site_config/standard/rollingstone.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/rottentomatoes.com.txt b/inc/3rdparty/site_config/standard/rottentomatoes.com.txt index b5b29fe4..ef327691 100644..100755 --- a/inc/3rdparty/site_config/standard/rottentomatoes.com.txt +++ b/inc/3rdparty/site_config/standard/rottentomatoes.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class='movie_content_area'] | 1 | body: //div[@class='movie_content_area'] |
2 | strip_id_or_class: tomatometer_bar_help | 2 | strip_id_or_class: tomatometer_bar_help |
3 | strip_id_or_class: critic-links | 3 | strip_id_or_class: critic-links |
4 | strip_id_or_class: top-critics-numbers | 4 | strip_id_or_class: top-critics-numbers |
5 | strip_id_or_class: fan_side | 5 | strip_id_or_class: fan_side |
6 | strip_id_or_class: fblike | 6 | strip_id_or_class: fblike |
7 | strip_id_or_class: rating_widget | 7 | strip_id_or_class: rating_widget |
8 | strip_id_or_class: friend_reviews | 8 | strip_id_or_class: friend_reviews |
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | test_url: http://www.rottentomatoes.com/m/thor/ \ No newline at end of file | 11 | test_url: http://www.rottentomatoes.com/m/thor/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/roughtype.com.txt b/inc/3rdparty/site_config/standard/roughtype.com.txt index f2f00392..a012a67d 100644..100755 --- a/inc/3rdparty/site_config/standard/roughtype.com.txt +++ b/inc/3rdparty/site_config/standard/roughtype.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='content'] | 1 | body: //div[@class='content'] |
2 | strip: //p[@class='postmeta']/following::* | 2 | strip: //p[@class='postmeta']/following::* |
3 | strip: //p[@class='postmeta'] | 3 | strip: //p[@class='postmeta'] |
4 | strip: //p[@align='left'] | 4 | strip: //p[@align='left'] |
5 | test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php \ No newline at end of file | 5 | test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/roy.gbiv.com.txt b/inc/3rdparty/site_config/standard/roy.gbiv.com.txt index 6ff03de8..6ff03de8 100644..100755 --- a/inc/3rdparty/site_config/standard/roy.gbiv.com.txt +++ b/inc/3rdparty/site_config/standard/roy.gbiv.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/rpgsite.net.txt b/inc/3rdparty/site_config/standard/rpgsite.net.txt index e7f29bbe..9ddbf0f2 100644..100755 --- a/inc/3rdparty/site_config/standard/rpgsite.net.txt +++ b/inc/3rdparty/site_config/standard/rpgsite.net.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id='news-text'] | 1 | body: //div[@id='news-text'] |
2 | prune: no | 2 | prune: no |
3 | test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy | 3 | test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy |
4 | test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork \ No newline at end of file | 4 | test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/rubysfera.pl.txt b/inc/3rdparty/site_config/standard/rubysfera.pl.txt index d9df7684..d9d9a431 100644..100755 --- a/inc/3rdparty/site_config/standard/rubysfera.pl.txt +++ b/inc/3rdparty/site_config/standard/rubysfera.pl.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | author: //div[contains(@class, 'author_text')]/h4/text() | 1 | author: //div[contains(@class, 'author_text')]/h4/text() |
2 | date: //li[@class='date'] | 2 | date: //li[@class='date'] |
3 | 3 | ||
4 | # stripping excessive tags | 4 | # stripping excessive tags |
5 | strip: //div[contains(@class, 'entry_meta')] | 5 | strip: //div[contains(@class, 'entry_meta')] |
6 | strip: //div[contains(@class, 'single_meta')] | 6 | strip: //div[contains(@class, 'single_meta')] |
7 | strip: //br[contains(@class, 'clear')] | 7 | strip: //br[contains(@class, 'clear')] |
8 | strip: //h3[contains(., 'Komentarz')] | 8 | strip: //h3[contains(., 'Komentarz')] |
9 | test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/ \ No newline at end of file | 9 | test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ruhlman.com.txt b/inc/3rdparty/site_config/standard/ruhlman.com.txt index 7a21c4af..e54b0f0e 100644..100755 --- a/inc/3rdparty/site_config/standard/ruhlman.com.txt +++ b/inc/3rdparty/site_config/standard/ruhlman.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class='entry-title'] | 1 | title: //h1[@class='entry-title'] |
2 | author: //span[@class='author vcard'] | 2 | author: //span[@class='author vcard'] |
3 | date: //abbr[@class='published'] | 3 | date: //abbr[@class='published'] |
4 | body: //div[@class='entry-content'] | 4 | body: //div[@class='entry-content'] |
5 | 5 | ||
6 | test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/ \ No newline at end of file | 6 | test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/ruttloff.org.txt b/inc/3rdparty/site_config/standard/ruttloff.org.txt index c036dcf8..43e130a4 100644..100755 --- a/inc/3rdparty/site_config/standard/ruttloff.org.txt +++ b/inc/3rdparty/site_config/standard/ruttloff.org.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | author: //a[@class='author'] | 1 | author: //a[@class='author'] |
2 | tidy: no | 2 | tidy: no |
3 | test_url: http://ruttloff.org/2012/06/13/intervention \ No newline at end of file | 3 | test_url: http://ruttloff.org/2012/06/13/intervention \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/salon.com.txt b/inc/3rdparty/site_config/standard/salon.com.txt index 04f8afd5..2b47f744 100644..100755 --- a/inc/3rdparty/site_config/standard/salon.com.txt +++ b/inc/3rdparty/site_config/standard/salon.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | author: (//span[@class="byline"]/a)[1] | 2 | author: (//span[@class="byline"]/a)[1] |
3 | date: //span[contains(@class, "toLocalTime")] | 3 | date: //span[contains(@class, "toLocalTime")] |
4 | body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")] | 4 | body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")] |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | 7 | ||
8 | # deal with singleton links | 8 | # deal with singleton links |
9 | single_page_link: (//h1/a[contains(@href, '/singleton')])[1] | 9 | single_page_link: (//h1/a[contains(@href, '/singleton')])[1] |
10 | 10 | ||
11 | test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/singleton/ \ No newline at end of file | 11 | test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/singleton/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/salzburg.com.txt b/inc/3rdparty/site_config/standard/salzburg.com.txt index 31067481..464f99f1 100644..100755 --- a/inc/3rdparty/site_config/standard/salzburg.com.txt +++ b/inc/3rdparty/site_config/standard/salzburg.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //p[@class='teaser1 darkgrey myriad'] | 1 | body: //p[@class='teaser1 darkgrey myriad'] |
2 | move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear'] | 2 | move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear'] |
3 | strip: //div[@class='hidden'] | 3 | strip: //div[@class='hidden'] |
4 | strip: //div[@id='article_related_source'] | 4 | strip: //div[@id='article_related_source'] |
5 | 5 | ||
6 | test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/ \ No newline at end of file | 6 | test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sanpedrosun.com.txt b/inc/3rdparty/site_config/standard/sanpedrosun.com.txt new file mode 100755 index 00000000..3f19cced --- /dev/null +++ b/inc/3rdparty/site_config/standard/sanpedrosun.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[contains(@class, 'post')]//h1 | ||
2 | date: //div[contains(@class, 'post')]//h6 | ||
3 | body: //div[contains(@class, 'entry')] | ||
4 | strip_id_or_class: post_stats | ||
5 | strip_id_or_class: related-posts | ||
6 | strip_id_or_class: after_story | ||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.sanpedrosun.com/community-and-society/2013/06/05/little-angelspre-school-talent-show/ | ||
10 | test_url: http://www.sanpedrosun.com/feed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/saveyourself.ca.txt b/inc/3rdparty/site_config/standard/saveyourself.ca.txt index 354f5911..5a5605d9 100644..100755 --- a/inc/3rdparty/site_config/standard/saveyourself.ca.txt +++ b/inc/3rdparty/site_config/standard/saveyourself.ca.txt | |||
@@ -1,25 +1,25 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | 2 | ||
3 | # my section divs seem to interfere with the Instapaper parser, so I ditch 'em | 3 | # my section divs seem to interfere with the Instapaper parser, so I ditch 'em |
4 | dissolve://div[contains(@class, 'section')] | 4 | dissolve://div[contains(@class, 'section')] |
5 | 5 | ||
6 | #these don't seem to be necessary, but just in case | 6 | #these don't seem to be necessary, but just in case |
7 | strip_id_or_class:'masthead' | 7 | strip_id_or_class:'masthead' |
8 | strip_id_or_class:'footer' | 8 | strip_id_or_class:'footer' |
9 | 9 | ||
10 | #again, Instapaper seems to understand where my content is, but just in case | 10 | #again, Instapaper seems to understand where my content is, but just in case |
11 | body://div[@id='content'] | 11 | body://div[@id='content'] |
12 | 12 | ||
13 | # in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing | 13 | # in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing |
14 | strip_id_or_class:'screen-only' | 14 | strip_id_or_class:'screen-only' |
15 | strip_id_or_class:'no-print' | 15 | strip_id_or_class:'no-print' |
16 | 16 | ||
17 | #other misc removals and simplifications | 17 | #other misc removals and simplifications |
18 | strip_id_or_class:'popup' | 18 | strip_id_or_class:'popup' |
19 | strip_id_or_class:'ZoomSpin' | 19 | strip_id_or_class:'ZoomSpin' |
20 | 20 | ||
21 | #I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes | 21 | #I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes |
22 | wrap_in(blockquote)://div[contains(@class, 'sidebar')] | 22 | wrap_in(blockquote)://div[contains(@class, 'sidebar')] |
23 | wrap_in(blockquote)://div[contains(@class, 'meta')] | 23 | wrap_in(blockquote)://div[contains(@class, 'meta')] |
24 | wrap_in(blockquote)://p[contains(@class, 'meta')] | 24 | wrap_in(blockquote)://p[contains(@class, 'meta')] |
25 | test_url: http://saveyourself.ca/tutorials/low-back-pain.php \ No newline at end of file | 25 | test_url: http://saveyourself.ca/tutorials/low-back-pain.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sayidaty.net.txt b/inc/3rdparty/site_config/standard/sayidaty.net.txt new file mode 100755 index 00000000..2d9f1884 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sayidaty.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | date: //meta[@property='article:published_time']/@content | ||
2 | body: (//div[contains(@class, 'article-slider')]//img)[1] | //div[contains(@class, 'bottom-article-con')] | ||
3 | |||
4 | test_url: http://www.sayidaty.net/taxonomy/term/10/all/feed \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sbnation.com.txt b/inc/3rdparty/site_config/standard/sbnation.com.txt index c213843c..41b36755 100644..100755 --- a/inc/3rdparty/site_config/standard/sbnation.com.txt +++ b/inc/3rdparty/site_config/standard/sbnation.com.txt | |||
@@ -1,28 +1,28 @@ | |||
1 | title: //h1[@id='stream_title'] | 1 | title: //h1[@id='stream_title'] |
2 | 2 | ||
3 | # Author and date don't work | 3 | # Author and date don't work |
4 | author: //div[@class='byline'] | 4 | author: //div[@class='byline'] |
5 | date: //div[@class='date-stamp'] | 5 | date: //div[@class='date-stamp'] |
6 | 6 | ||
7 | body: //div[@class='node-article'] | 7 | body: //div[@class='node-article'] |
8 | 8 | ||
9 | strip_id_or_class: fb-like-box | 9 | strip_id_or_class: fb-like-box |
10 | strip_id_or_class: stream-fb-like | 10 | strip_id_or_class: stream-fb-like |
11 | strip_id_or_class: social-meta | 11 | strip_id_or_class: social-meta |
12 | strip_id_or_class: social-spoken | 12 | strip_id_or_class: social-spoken |
13 | strip_id_or_class: twitter-share-button | 13 | strip_id_or_class: twitter-share-button |
14 | strip_id_or_class: twitter-follow-button | 14 | strip_id_or_class: twitter-follow-button |
15 | strip_id_or_class: spinner_node_list | 15 | strip_id_or_class: spinner_node_list |
16 | strip_id_or_class: node-sort-link | 16 | strip_id_or_class: node-sort-link |
17 | strip_id_or_class: stream_title | 17 | strip_id_or_class: stream_title |
18 | strip_id_or_class: stream_summary | 18 | strip_id_or_class: stream_summary |
19 | strip_id_or_class: update-count-container | 19 | strip_id_or_class: update-count-container |
20 | strip_id_or_class: major-updates | 20 | strip_id_or_class: major-updates |
21 | strip_id_or_class: newsletter-slide | 21 | strip_id_or_class: newsletter-slide |
22 | strip_id_or_class: author-mini-profile | 22 | strip_id_or_class: author-mini-profile |
23 | strip_id_or_class: byline | 23 | strip_id_or_class: byline |
24 | strip_id_or_class: header | 24 | strip_id_or_class: header |
25 | strip_id_or_class: footer | 25 | strip_id_or_class: footer |
26 | 26 | ||
27 | # Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns | 27 | # Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns |
28 | test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic \ No newline at end of file | 28 | test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/schneier.com.txt b/inc/3rdparty/site_config/standard/schneier.com.txt index 67181b65..0074a86a 100644..100755 --- a/inc/3rdparty/site_config/standard/schneier.com.txt +++ b/inc/3rdparty/site_config/standard/schneier.com.txt | |||
@@ -1,25 +1,25 @@ | |||
1 | author: //p[@class='mastname'] | 1 | author: //p[@class='mastname'] |
2 | 2 | ||
3 | body: //div[@class='indivbody'] | 3 | body: //div[@class='indivbody'] |
4 | date: //div[@class='indivbody']/h2[1] | 4 | date: //div[@class='indivbody']/h2[1] |
5 | 5 | ||
6 | # Remove blog title. Specify first occurrence in case h1 is used in article | 6 | # Remove blog title. Specify first occurrence in case h1 is used in article |
7 | strip: //div[@class='indivbody']/h1[1] | 7 | strip: //div[@class='indivbody']/h1[1] |
8 | 8 | ||
9 | # Remove blog description (the first p element) | 9 | # Remove blog description (the first p element) |
10 | strip: //div[@class='indivbody']/p[1] | 10 | strip: //div[@class='indivbody']/p[1] |
11 | 11 | ||
12 | # Remove navigation (second p element) | 12 | # Remove navigation (second p element) |
13 | strip: //div[@class='indivbody']/p[2] | 13 | strip: //div[@class='indivbody']/p[2] |
14 | 14 | ||
15 | # Remove duplicate of article title. Specify first occurrence in case h3 is used in article | 15 | # Remove duplicate of article title. Specify first occurrence in case h3 is used in article |
16 | strip: //div[@class='indivbody']/h3[1] | 16 | strip: //div[@class='indivbody']/h3[1] |
17 | 17 | ||
18 | # Remove publishing date, it's extracted by rule above | 18 | # Remove publishing date, it's extracted by rule above |
19 | strip: //div[@class='indivbody']/h2[1] | 19 | strip: //div[@class='indivbody']/h2[1] |
20 | 20 | ||
21 | # Remove duplicate of date at end, and newsletter signup | 21 | # Remove duplicate of date at end, and newsletter signup |
22 | strip: //p[@class='posted'] | 22 | strip: //p[@class='posted'] |
23 | 23 | ||
24 | # Leave date at top | 24 | # Leave date at top |
25 | test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html \ No newline at end of file | 25 | test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/science.orf.at.txt b/inc/3rdparty/site_config/standard/science.orf.at.txt index 89ebfe08..c4b21834 100644..100755 --- a/inc/3rdparty/site_config/standard/science.orf.at.txt +++ b/inc/3rdparty/site_config/standard/science.orf.at.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class="storybox"] | 1 | body: //div[@class="storybox"] |
2 | title: //div[@class="storybox"]//h1 | 2 | title: //div[@class="storybox"]//h1 |
3 | strip: //p[@class='metaline'] | 3 | strip: //p[@class='metaline'] |
4 | date: substring-after(//*[@class='time'],'Erstellt am') | 4 | date: substring-after(//*[@class='time'],'Erstellt am') |
5 | strip: //div[@class='fact'] | 5 | strip: //div[@class='fact'] |
6 | strip: //p[@class='backlink'] | 6 | strip: //p[@class='backlink'] |
7 | strip: //div[@class='mailto'] | 7 | strip: //div[@class='mailto'] |
8 | strip: //div[@id='forumDisclaimer'] | 8 | strip: //div[@id='forumDisclaimer'] |
9 | strip: //div[@class='forum'] | 9 | strip: //div[@class='forum'] |
10 | 10 | ||
11 | test_url: http://science.orf.at/stories/1700900/ \ No newline at end of file | 11 | test_url: http://science.orf.at/stories/1700900/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scienceblogs.de.txt b/inc/3rdparty/site_config/standard/scienceblogs.de.txt index 08c16842..b0dec3d2 100644..100755 --- a/inc/3rdparty/site_config/standard/scienceblogs.de.txt +++ b/inc/3rdparty/site_config/standard/scienceblogs.de.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a | 1 | single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a |
2 | 2 | ||
3 | author: //div[@class='details clear']//a[@class='hi'] | 3 | author: //div[@class='details clear']//a[@class='hi'] |
4 | body: //div[@class='title'] | 4 | body: //div[@class='title'] |
5 | strip: //p[@class='entrypagination'] | 5 | strip: //p[@class='entrypagination'] |
6 | strip: //p[@class='details_top'] | 6 | strip: //p[@class='details_top'] |
7 | date: //p[@class='details_top'] | 7 | date: //p[@class='details_top'] |
8 | title: //div[@class='title']/h1 | 8 | title: //div[@class='title']/h1 |
9 | strip: //p[@class='details'] | 9 | strip: //p[@class='details'] |
10 | strip: //p[@class='details_bottom'] | 10 | strip: //p[@class='details_bottom'] |
11 | 11 | ||
12 | test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php \ No newline at end of file | 12 | test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scienceticker.info.txt b/inc/3rdparty/site_config/standard/scienceticker.info.txt index 75a52824..2a06f734 100644..100755 --- a/inc/3rdparty/site_config/standard/scienceticker.info.txt +++ b/inc/3rdparty/site_config/standard/scienceticker.info.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body: //div[@class='post'] | 1 | body: //div[@class='post'] |
2 | title: //h1[@id='singlePageTitle'] | 2 | title: //h1[@id='singlePageTitle'] |
3 | date: substring-before(//small,'• Rubrik') | 3 | date: substring-before(//small,'• Rubrik') |
4 | 4 | ||
5 | strip: //div[@class='post-ratings'] | 5 | strip: //div[@class='post-ratings'] |
6 | strip: //div[@class='post-ratings-loading'] | 6 | strip: //div[@class='post-ratings-loading'] |
7 | strip: //a[@title='Empfehlen Sie den Text weiter!'] | 7 | strip: //a[@title='Empfehlen Sie den Text weiter!'] |
8 | strip: //a[@title='Drucken'] | 8 | strip: //a[@title='Drucken'] |
9 | strip: //div[@class='share'] | 9 | strip: //div[@class='share'] |
10 | 10 | ||
11 | test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/ \ No newline at end of file | 11 | test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scientificamerican.com.txt b/inc/3rdparty/site_config/standard/scientificamerican.com.txt index d510407d..1b3f31cf 100644..100755 --- a/inc/3rdparty/site_config/standard/scientificamerican.com.txt +++ b/inc/3rdparty/site_config/standard/scientificamerican.com.txt | |||
@@ -1,25 +1,25 @@ | |||
1 | # | 1 | # |
2 | # After site revisions at SciAm, this configuration does | 2 | # After site revisions at SciAm, this configuration does |
3 | # not work, especially for multi-page articles. For | 3 | # not work, especially for multi-page articles. For |
4 | # every article there is now a "Print" link which | 4 | # every article there is now a "Print" link which |
5 | # is far more reliable. So this configuration should be | 5 | # is far more reliable. So this configuration should be |
6 | # removed or disabled. | 6 | # removed or disabled. |
7 | # 2/3/13 | 7 | # 2/3/13 |
8 | # | 8 | # |
9 | 9 | ||
10 | # meta data | 10 | # meta data |
11 | title://h1[@class = 'articleTitle'] | 11 | title://h1[@class = 'articleTitle'] |
12 | author:substring-after(//span[@class = 'byline'],'By ') | 12 | author:substring-after(//span[@class = 'byline'],'By ') |
13 | date:substring-before(//span[@class = 'datestamp'],'|') | 13 | date:substring-before(//span[@class = 'datestamp'],'|') |
14 | 14 | ||
15 | #body content | 15 | #body content |
16 | body://div[@id = 'articleContent'] | 16 | body://div[@id = 'articleContent'] |
17 | #next_page_link://li[@id = 'flairPagination']/a[last()] | 17 | #next_page_link://li[@id = 'flairPagination']/a[last()] |
18 | 18 | ||
19 | single_page_link: //a[contains(@href, 'print=true')] | 19 | single_page_link: //a[contains(@href, 'print=true')] |
20 | 20 | ||
21 | #cleanup | 21 | #cleanup |
22 | strip://div[@class = 'fsgBooks'] | 22 | strip://div[@class = 'fsgBooks'] |
23 | 23 | ||
24 | test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state | 24 | test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state |
25 | test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet \ No newline at end of file | 25 | test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scilogs.de.txt b/inc/3rdparty/site_config/standard/scilogs.de.txt new file mode 100755 index 00000000..b24d7844 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scilogs.de.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //h1 | ||
2 | author: //div[@class='date']/a | ||
3 | date: substring-after(//div[@class='date'], ',') | ||
4 | body: //div[@class='entrybody'] | ||
5 | |||
6 | strip_id_or_class: socialshareprivacy | ||
7 | strip: //div[@class='entrybody']/br[1] | ||
8 | |||
9 | # Strip related articles | ||
10 | # 'p'-Tag strips 'Ähnliche Artikel: ' (<br> tags become <p>) | ||
11 | strip: //div[@class='entrybody']/p[last()] | ||
12 | strip: //div[@class='entrybody']/ul[last()] | ||
13 | |||
14 | convert_double_br_tags: yes | ||
15 | test_url: http://www.scilogs.de/wblogs/blog/formbar/fusion/2012-10-08/rundgang-durch-deutschlands-gr-tes-fusionsexperiment \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scotusblog.com.txt b/inc/3rdparty/site_config/standard/scotusblog.com.txt index f29e37f9..8881bb45 100644..100755 --- a/inc/3rdparty/site_config/standard/scotusblog.com.txt +++ b/inc/3rdparty/site_config/standard/scotusblog.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //title | 1 | title: //title |
2 | author: //p[@id='author-name-role']/a | 2 | author: //p[@id='author-name-role']/a |
3 | date: substring-after(//p[@class='time'],'Posted') | 3 | date: substring-after(//p[@class='time'],'Posted') |
4 | body: //div[@id='main'] | 4 | body: //div[@id='main'] |
5 | strip: //div[@id='author-info'] | 5 | strip: //div[@id='author-info'] |
6 | strip: //div[@id='author-links'] | 6 | strip: //div[@id='author-links'] |
7 | strip: //h1 | 7 | strip: //h1 |
8 | test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/ \ No newline at end of file | 8 | test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scraplab.net.txt b/inc/3rdparty/site_config/standard/scraplab.net.txt index 84be27f9..ca7ec195 100644..100755 --- a/inc/3rdparty/site_config/standard/scraplab.net.txt +++ b/inc/3rdparty/site_config/standard/scraplab.net.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | body: //div[@class='body'] | 2 | body: //div[@class='body'] |
3 | test_url: http://scraplab.net/2010/10/26/please-keep-your-belongings-with-you-at-all-times/ \ No newline at end of file | 3 | test_url: http://scraplab.net/2010/10/26/please-keep-your-belongings-with-you-at-all-times/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/scripting.com.txt b/inc/3rdparty/site_config/standard/scripting.com.txt index d8b969b1..5fb0ee79 100644..100755 --- a/inc/3rdparty/site_config/standard/scripting.com.txt +++ b/inc/3rdparty/site_config/standard/scripting.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | strip: //a[starts-with(@href, '#')] | 1 | strip: //a[starts-with(@href, '#')] |
2 | strip: //*[@class='storyByline'] | 2 | strip: //*[@class='storyByline'] |
3 | body: //*[@class='storyPageText']/.. | 3 | body: //*[@class='storyPageText']/.. |
4 | author: string('Dave Winer') | 4 | author: string('Dave Winer') |
5 | date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at') | 5 | date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at') |
6 | title: //h1 | 6 | title: //h1 |
7 | footnotes: no | 7 | footnotes: no |
8 | test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html \ No newline at end of file | 8 | test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sct.temple.edu.txt b/inc/3rdparty/site_config/standard/sct.temple.edu.txt index 9927675b..55f24173 100644..100755 --- a/inc/3rdparty/site_config/standard/sct.temple.edu.txt +++ b/inc/3rdparty/site_config/standard/sct.temple.edu.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //*[@class="entry-content"] | 1 | body: //*[@class="entry-content"] |
2 | title: //h1[@class="entry-title"] | 2 | title: //h1[@class="entry-title"] |
3 | date: //*[@class="entry-date"] | 3 | date: //*[@class="entry-date"] |
4 | author: //*[@class="author vcard"] | 4 | author: //*[@class="author vcard"] |
5 | test_url: http://sct.temple.edu/blogs/news-events/2011/05/congratulations-sct-class-of-2011/ \ No newline at end of file | 5 | test_url: http://sct.temple.edu/blogs/news-events/2011/05/congratulations-sct-class-of-2011/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/searchenginejournal.com.txt b/inc/3rdparty/site_config/standard/searchenginejournal.com.txt new file mode 100755 index 00000000..dc98af3c --- /dev/null +++ b/inc/3rdparty/site_config/standard/searchenginejournal.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | strip: //ul[contains(@id, "social")] | ||
2 | strip: //div[contains(@class, "ts-fab-wrapper")] | ||
3 | strip: //div[contains(@id, 'gpt-ad')] | ||
4 | |||
5 | test_url: http://www.searchenginejournal.com/web-design-vs-seo-it-doesnt-make-much-sense/62294/ | ||
diff --git a/inc/3rdparty/site_config/standard/searchengineland.com.txt b/inc/3rdparty/site_config/standard/searchengineland.com.txt index f176d7c7..fb6a1074 100644..100755 --- a/inc/3rdparty/site_config/standard/searchengineland.com.txt +++ b/inc/3rdparty/site_config/standard/searchengineland.com.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | body: //div[@class="storyBox"] | 1 | body: //div[@class="storyBox"] |
2 | title: //div[@class="storyBox"]/h1 | 2 | title: //div[@class="storyBox"]/h1 |
3 | author: //a[@rel="author"] | 3 | author: //a[@rel="author"] |
4 | date: substring-before(//span[@class="dateline"], 'by') | 4 | date: substring-before(//span[@class="dateline"], 'by') |
5 | 5 | ||
6 | #Removes related content but cleans up article text | 6 | #Removes related content but cleans up article text |
7 | strip: //h1 | 7 | strip: //h1 |
8 | strip: //p[@class="homeStory tdmSideInfo"] | 8 | strip: //p[@class="homeStory tdmSideInfo"] |
9 | strip: //div[@id="bylineShare"] | 9 | strip: //div[@id="bylineShare"] |
10 | strip: //script | 10 | strip: //script |
11 | strip: //hr | 11 | strip: //hr |
12 | 12 | ||
13 | strip_id_or_class: homeStory | 13 | strip_id_or_class: homeStory |
14 | strip_id_or_class: authorpic | 14 | strip_id_or_class: authorpic |
15 | strip_id_or_class: insideComments | 15 | strip_id_or_class: insideComments |
16 | strip_id_or_class: authorbio | 16 | strip_id_or_class: authorbio |
17 | strip_id_or_class: gpt-ad-sel-cube | 17 | strip_id_or_class: gpt-ad-sel-cube |
18 | strip_id_or_class: smxTextAd | 18 | strip_id_or_class: smxTextAd |
19 | 19 | ||
20 | test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348 \ No newline at end of file | 20 | test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/seattletransitblog.com.txt b/inc/3rdparty/site_config/standard/seattletransitblog.com.txt index 5129c069..5129c069 100644..100755 --- a/inc/3rdparty/site_config/standard/seattletransitblog.com.txt +++ b/inc/3rdparty/site_config/standard/seattletransitblog.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/sebbo.net.txt b/inc/3rdparty/site_config/standard/sebbo.net.txt index 3e800a16..b6d9c92d 100644..100755 --- a/inc/3rdparty/site_config/standard/sebbo.net.txt +++ b/inc/3rdparty/site_config/standard/sebbo.net.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: substring-before(//title, '«') | 1 | title: substring-before(//title, '«') |
2 | body: //div[@class = 'entry'] | 2 | body: //div[@class = 'entry'] |
3 | strip_id_or_class: 'postmetabox' | 3 | strip_id_or_class: 'postmetabox' |
4 | test_url: http://sebbo.net/2010/12/akkus/ \ No newline at end of file | 4 | test_url: http://sebbo.net/2010/12/akkus/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/select.yeeyan.org.txt b/inc/3rdparty/site_config/standard/select.yeeyan.org.txt new file mode 100755 index 00000000..6e98b149 --- /dev/null +++ b/inc/3rdparty/site_config/standard/select.yeeyan.org.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://select.yeeyan.org/view/18312/332365 | ||
3 | # http://select.yeeyan.org/view/365295/333788 | ||
4 | # http://select.yeeyan.org/view/174464/332336 | ||
5 | |||
6 | tidy:no | ||
7 | prune:no | ||
8 | title://h1 | ||
9 | author: //div[@class='sa_author']/span/a | ||
10 | date: substring-after(//div[@class='sa_author']/span/following-sibling::span, ':') | ||
11 | body: //div[@class='sa_left closetag'] | ||
12 | wrap_in(b)://div[@class='sa_abstract'] | ||
13 | |||
14 | strip://ul[@class='sa_next clearfix'] | ||
15 | strip: //div[@class='sa_author'] | ||
16 | strip: //div[@class='sa_title_box'] | ||
17 | |||
18 | test_url: http://select.yeeyan.org/view/258033/333481 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/seriouseats.com.txt b/inc/3rdparty/site_config/standard/seriouseats.com.txt index d7b4788c..5e633470 100644..100755 --- a/inc/3rdparty/site_config/standard/seriouseats.com.txt +++ b/inc/3rdparty/site_config/standard/seriouseats.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | 2 | ||
3 | # clean up recipe pages | 3 | # clean up recipe pages |
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | 4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] |
5 | 5 | ||
6 | #recipe pages | 6 | #recipe pages |
7 | strip_id_or_class: "recipe-feedback" | 7 | strip_id_or_class: "recipe-feedback" |
8 | strip_id_or_class: "comments" | 8 | strip_id_or_class: "comments" |
9 | strip_id_or_class: "procedure-number" | 9 | strip_id_or_class: "procedure-number" |
10 | strip_id_or_class: "more-with-author" | 10 | strip_id_or_class: "more-with-author" |
11 | 11 | ||
12 | #slice | 12 | #slice |
13 | strip_id_or_class: "inner" | 13 | strip_id_or_class: "inner" |
14 | 14 | ||
15 | test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html \ No newline at end of file | 15 | test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sf.curbed.com.txt b/inc/3rdparty/site_config/standard/sf.curbed.com.txt index 9f443d5c..4c10e9c7 100644..100755 --- a/inc/3rdparty/site_config/standard/sf.curbed.com.txt +++ b/inc/3rdparty/site_config/standard/sf.curbed.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1[@class='post-title'] | 1 | title: //h1[@class='post-title'] |
2 | author: //div[@class='post-byline']/a | 2 | author: //div[@class='post-byline']/a |
3 | date: substring-before(//div[@class='post-byline'], ', by') | 3 | date: substring-before(//div[@class='post-byline'], ', by') |
4 | 4 | ||
5 | body: //div[@class='post-body'] | 5 | body: //div[@class='post-body'] |
6 | dissolve: //noscript | 6 | dissolve: //noscript |
7 | test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php \ No newline at end of file | 7 | test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sf.eater.com.txt b/inc/3rdparty/site_config/standard/sf.eater.com.txt index fca656d2..1e7c85a0 100644..100755 --- a/inc/3rdparty/site_config/standard/sf.eater.com.txt +++ b/inc/3rdparty/site_config/standard/sf.eater.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1[@class="post-title"] | 1 | title: //h1[@class="post-title"] |
2 | author: //div[@class="post-byline"]/a | 2 | author: //div[@class="post-byline"]/a |
3 | date: substring-before(//div[@class='post-byline'], ', by') | 3 | date: substring-before(//div[@class='post-byline'], ', by') |
4 | 4 | ||
5 | body: //div[@class='post-body'] | 5 | body: //div[@class='post-body'] |
6 | strip_id_or_class: post-kicker | 6 | strip_id_or_class: post-kicker |
7 | test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php \ No newline at end of file | 7 | test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sfgate.com.txt b/inc/3rdparty/site_config/standard/sfgate.com.txt index 5f73fbcb..54691122 100644..100755 --- a/inc/3rdparty/site_config/standard/sfgate.com.txt +++ b/inc/3rdparty/site_config/standard/sfgate.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: /html/head/title | 1 | title: /html/head/title |
2 | 2 | ||
3 | body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')] | 3 | body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')] |
4 | author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn'] | 4 | author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn'] |
5 | date: //div[@class = 'articleheadings']/span[@class = 'updated'] | 5 | date: //div[@class = 'articleheadings']/span[@class = 'updated'] |
6 | strip: //div[div[contains(@class, 'imgbox')]] | 6 | strip: //div[div[contains(@class, 'imgbox')]] |
7 | 7 | ||
8 | body: //div[@class = 'blogitem'] | 8 | body: //div[@class = 'blogitem'] |
9 | author: //p[@class="credit"]/span[@class="author"]/a[position() = 1] | 9 | author: //p[@class="credit"]/span[@class="author"]/a[position() = 1] |
10 | date: //span[@class = 'pubdate'] | 10 | date: //span[@class = 'pubdate'] |
11 | 11 | ||
12 | test_url: http://www.sfgate.com/columnists/garchik/ \ No newline at end of file | 12 | test_url: http://www.sfgate.com/columnists/garchik/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sfweekly.com.txt b/inc/3rdparty/site_config/standard/sfweekly.com.txt index a11fe4cb..73c3017e 100644..100755 --- a/inc/3rdparty/site_config/standard/sfweekly.com.txt +++ b/inc/3rdparty/site_config/standard/sfweekly.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[contains(@class, 'content_body')] | 1 | body: //div[contains(@class, 'content_body')] |
2 | strip_id_or_class: det_rel | 2 | strip_id_or_class: det_rel |
3 | test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/ \ No newline at end of file | 3 | test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/shabayek.com.txt b/inc/3rdparty/site_config/standard/shabayek.com.txt index b175720e..9a0d60ae 100644..100755 --- a/inc/3rdparty/site_config/standard/shabayek.com.txt +++ b/inc/3rdparty/site_config/standard/shabayek.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | date: //span[@class='date'] | 1 | date: //span[@class='date'] |
2 | body: //div[@class='post_content'] | 2 | body: //div[@class='post_content'] |
3 | test_url: http://www.shabayek.com/blog/2011/10/16/%D8%AF%D8%B1%D9%88%D8%B3-%D9%85%D9%86-%D9%82%D8%B5%D8%A9-%D8%AA%D8%A3%D8%B3%D9%8A%D8%B3-%D8%AA%D9%88%D9%8A%D8%AA%D8%B1-%E2%80%93%D8%AC3/ \ No newline at end of file | 3 | test_url: http://www.shabayek.com/blog/2011/10/16/%D8%AF%D8%B1%D9%88%D8%B3-%D9%85%D9%86-%D9%82%D8%B5%D8%A9-%D8%AA%D8%A3%D8%B3%D9%8A%D8%B3-%D8%AA%D9%88%D9%8A%D8%AA%D8%B1-%E2%80%93%D8%AC3/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/shawnblanc.net.txt b/inc/3rdparty/site_config/standard/shawnblanc.net.txt index b536fc3a..bd8438f7 100644..100755 --- a/inc/3rdparty/site_config/standard/shawnblanc.net.txt +++ b/inc/3rdparty/site_config/standard/shawnblanc.net.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title://*[@class='primary']/h1 | 1 | title://*[@class='primary']/h1 |
2 | date: //*[@class='articledate'] | 2 | date: //*[@class='articledate'] |
3 | author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.') | 3 | author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.') |
4 | body: //div[@class='primary'] | 4 | body: //div[@class='primary'] |
5 | footnotes: yes | 5 | footnotes: yes |
6 | strip: //*[@class='primary']/h1 | 6 | strip: //*[@class='primary']/h1 |
7 | strip: //*[@class='articledate'] | 7 | strip: //*[@class='articledate'] |
8 | strip: //*[@class='detailsarticle'] | 8 | strip: //*[@class='detailsarticle'] |
9 | strip: //*[@class='endnav'] | 9 | strip: //*[@class='endnav'] |
10 | strip: //*[@class='endmeta'] | 10 | strip: //*[@class='endmeta'] |
11 | test_url: http://shawnblanc.net/2011/11/kindle-touch-review/ \ No newline at end of file | 11 | test_url: http://shawnblanc.net/2011/11/kindle-touch-review/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/shifteleven.com.txt b/inc/3rdparty/site_config/standard/shifteleven.com.txt index 68059ae1..43fd871d 100644..100755 --- a/inc/3rdparty/site_config/standard/shifteleven.com.txt +++ b/inc/3rdparty/site_config/standard/shifteleven.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[ @class='entry-content' ] | 1 | body: //div[ @class='entry-content' ] |
2 | 2 | ||
3 | strip: //div[ contains(@class, 'sharing') ] | 3 | strip: //div[ contains(@class, 'sharing') ] |
4 | 4 | ||
5 | date: //div[ @class='entry-meta' ]/a | 5 | date: //div[ @class='entry-meta' ]/a |
6 | test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit \ No newline at end of file | 6 | test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/siasat.pk.txt b/inc/3rdparty/site_config/standard/siasat.pk.txt index a82ce69c..b10e12de 100644..100755 --- a/inc/3rdparty/site_config/standard/siasat.pk.txt +++ b/inc/3rdparty/site_config/standard/siasat.pk.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | 1 | #body: (//div[@class='ftr-yt-vid'])[1] |
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | 2 | body: (//blockquote[contains(@class, 'postcontent')])[1] |
3 | body: (//div[starts-with(@id, 'post_message')])[1] | 3 | body: (//div[starts-with(@id, 'post_message')])[1] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | 7 | ||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | 8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" |
9 | #replace_string(</iframe>): </iframe> </div> | 9 | #replace_string(</iframe>): </iframe> </div> |
10 | 10 | ||
11 | test_url: http://www.siasat.pk/forum/showthread.php?107668-Policy-Matters-17th-March-2012-Dr-Shahid-Masood-Gen-Hameed-gul-amp-Fawad-Chudhary-Pak-US-Relationship&p=787733 \ No newline at end of file | 11 | test_url: http://www.siasat.pk/forum/showthread.php?107668-Policy-Matters-17th-March-2012-Dr-Shahid-Masood-Gen-Hameed-gul-amp-Fawad-Chudhary-Pak-US-Relationship&p=787733 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/signalscv.com.txt b/inc/3rdparty/site_config/standard/signalscv.com.txt new file mode 100755 index 00000000..2d3c388e --- /dev/null +++ b/inc/3rdparty/site_config/standard/signalscv.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | author: //span[contains(@class, 'byline_1')] | ||
2 | date: //span[@class='posted_date'] | ||
3 | body: //*[contains(@class, 'bigimage_container') or contains(@class, 'overlay_text') or contains(@id, 'articlebody')] | ||
4 | |||
5 | strip_id_or_class: leftWrapper | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.signalscv.com/section/46/article/102948/ | ||
10 | test_url: http://www.signalscv.com/syndication/feeds/rss/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/simonwillison.net.txt b/inc/3rdparty/site_config/standard/simonwillison.net.txt index e3ad6e41..69999698 100644..100755 --- a/inc/3rdparty/site_config/standard/simonwillison.net.txt +++ b/inc/3rdparty/site_config/standard/simonwillison.net.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[contains(@class, "entry")] | 1 | body: //div[contains(@class, "entry")] |
2 | 2 | ||
3 | date: //div[contains(@class, "entryFooter")]/a | 3 | date: //div[contains(@class, "entryFooter")]/a |
4 | 4 | ||
5 | test_url: http://simonwillison.net/2009/Oct/22/redis/ \ No newline at end of file | 5 | test_url: http://simonwillison.net/2009/Oct/22/redis/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt b/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt index a1b6b673..46e2d5f2 100644..100755 --- a/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt +++ b/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@class='post-body'] | 1 | body: //div[@class='post-body'] |
2 | strip: //div[@id='lws_0'] | 2 | strip: //div[@id='lws_0'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html \ No newline at end of file | 5 | test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/singularityhub.com.txt b/inc/3rdparty/site_config/standard/singularityhub.com.txt index 3999d4d4..3999d4d4 100644..100755 --- a/inc/3rdparty/site_config/standard/singularityhub.com.txt +++ b/inc/3rdparty/site_config/standard/singularityhub.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/sintagoulis.gr.txt b/inc/3rdparty/site_config/standard/sintagoulis.gr.txt index 822bbeb0..0d05c40e 100644..100755 --- a/inc/3rdparty/site_config/standard/sintagoulis.gr.txt +++ b/inc/3rdparty/site_config/standard/sintagoulis.gr.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@class='headline']//h2 | 1 | title: //div[@class='headline']//h2 |
2 | body: //div[contains(@class, 'storycontent')] | 2 | body: //div[contains(@class, 'storycontent')] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://sintagoulis.gr/sokolatenia/sokolatenia-mpompa-me-amaretti- \ No newline at end of file | 6 | test_url: http://sintagoulis.gr/sokolatenia/sokolatenia-mpompa-me-amaretti- \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sivers.org.txt b/inc/3rdparty/site_config/standard/sivers.org.txt new file mode 100755 index 00000000..a88f30d7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sivers.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //article[@class='post']/header[@class='wrapper']//h1/a | ||
2 | author: //header[@id='masthead']//h1/a | ||
3 | date: //article[@class='post']/header[@class='wrapper']//p[@class='postdate'] | ||
4 | body: //div[@id='body-content'] | ||
5 | |||
6 | test_url: http://sivers.org/delegate/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/skanesfria.se.txt b/inc/3rdparty/site_config/standard/skanesfria.se.txt new file mode 100755 index 00000000..a0ddac79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/skanesfria.se.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.skanesfria.se/artikel/112045 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/slashfilm.com.txt b/inc/3rdparty/site_config/standard/slashfilm.com.txt index 78d38ecf..4d17176a 100644..100755 --- a/inc/3rdparty/site_config/standard/slashfilm.com.txt +++ b/inc/3rdparty/site_config/standard/slashfilm.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: substring-before(//title,'| /Film') | 1 | title: substring-before(//title,'| /Film') |
2 | date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by') | 2 | date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by') |
3 | strip: //div[@class='pm-left'] | 3 | strip: //div[@class='pm-left'] |
4 | strip: //div[@class='pm-right'] | 4 | strip: //div[@class='pm-right'] |
5 | strip: //h2/span | 5 | strip: //h2/span |
6 | next_page_link: //h2/strong/a | 6 | next_page_link: //h2/strong/a |
7 | strip: //h2/strong/a | 7 | strip: //h2/strong/a |
8 | strip: //p[contains(text(),'we have to split this post over')] | 8 | strip: //p[contains(text(),'we have to split this post over')] |
9 | strip: //p[@class='post-info'] | 9 | strip: //p[@class='post-info'] |
10 | strip: //h1/a | 10 | strip: //h1/a |
11 | strip: //img[contains(@src,'siteimages/authors')] | 11 | strip: //img[contains(@src,'siteimages/authors')] |
12 | strip: //div[@id='header'] | 12 | strip: //div[@id='header'] |
13 | strip: //div[@class='topad-right'] | 13 | strip: //div[@class='topad-right'] |
14 | strip: //strong[contains(text(),'Cool Posts From Around the Web:')] | 14 | strip: //strong[contains(text(),'Cool Posts From Around the Web:')] |
15 | test_url: http://www.slashfilm.com/superhero-bits-206/ \ No newline at end of file | 15 | test_url: http://www.slashfilm.com/superhero-bits-206/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/slate.com.txt b/inc/3rdparty/site_config/standard/slate.com.txt index e92f6a06..d5798e01 100644..100755 --- a/inc/3rdparty/site_config/standard/slate.com.txt +++ b/inc/3rdparty/site_config/standard/slate.com.txt | |||
@@ -1,19 +1,19 @@ | |||
1 | title: //h1[@class="sl-art-head-dek"] | 1 | title: //h1[@class="sl-art-head-dek"] |
2 | body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')] | 2 | body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')] |
3 | strip: //div[@class="department_kicker"] | 3 | strip: //div[@class="department_kicker"] |
4 | strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"] | 4 | strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"] |
5 | strip: //div[@id="bottom_sponsored_links"] | 5 | strip: //div[@id="bottom_sponsored_links"] |
6 | strip: //div[@class="sl-art-ad-midflex"] | 6 | strip: //div[@class="sl-art-ad-midflex"] |
7 | #strip: //dl | 7 | #strip: //dl |
8 | #strip: //p[em/a[contains(@href, 'facebook.com')]] | 8 | #strip: //p[em/a[contains(@href, 'facebook.com')]] |
9 | prune: no | 9 | prune: no |
10 | 10 | ||
11 | author: //div[@id='author_bio']//a[contains(@href, '/author/')] | 11 | author: //div[@id='author_bio']//a[contains(@href, '/author/')] |
12 | author: //a[contains(@href, '/authors.')] | 12 | author: //a[contains(@href, '/authors.')] |
13 | 13 | ||
14 | date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ') | 14 | date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ') |
15 | 15 | ||
16 | single_page_link: //a[@class='sl-art-sinpage'] | 16 | single_page_link: //a[@class='sl-art-sinpage'] |
17 | 17 | ||
18 | test_url: http://www.slate.com/id/2274583/pagenum/all/ | 18 | test_url: http://www.slate.com/id/2274583/pagenum/all/ |
19 | test_url: http://www.slate.com/id/2293116/ \ No newline at end of file | 19 | test_url: http://www.slate.com/id/2293116/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt b/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt index 1a902b96..e62a3966 100644..100755 --- a/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt +++ b/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | 2 | ||
3 | # clean up recipe pages | 3 | # clean up recipe pages |
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | 4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] |
5 | 5 | ||
6 | #recipe pages | 6 | #recipe pages |
7 | strip_id_or_class: "recipe-feedback" | 7 | strip_id_or_class: "recipe-feedback" |
8 | strip_id_or_class: "comments" | 8 | strip_id_or_class: "comments" |
9 | strip_id_or_class: "procedure-number" | 9 | strip_id_or_class: "procedure-number" |
10 | strip_id_or_class: "more-with-author" | 10 | strip_id_or_class: "more-with-author" |
11 | 11 | ||
12 | #slice | 12 | #slice |
13 | strip_id_or_class: "inner" | 13 | strip_id_or_class: "inner" |
14 | 14 | ||
15 | test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html \ No newline at end of file | 15 | test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/slog.thestranger.com.txt b/inc/3rdparty/site_config/standard/slog.thestranger.com.txt index daa5e31b..f9526945 100644..100755 --- a/inc/3rdparty/site_config/standard/slog.thestranger.com.txt +++ b/inc/3rdparty/site_config/standard/slog.thestranger.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | strip_id_or_class: postCategory | 1 | strip_id_or_class: postCategory |
2 | title: //h3[@class='postTitle'] | 2 | title: //h3[@class='postTitle'] |
3 | body: //div[@class='postBody'] | 3 | body: //div[@class='postBody'] |
4 | test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone \ No newline at end of file | 4 | test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/smartinvestor.de.txt b/inc/3rdparty/site_config/standard/smartinvestor.de.txt index ec6c55c8..85ca46de 100644..100755 --- a/inc/3rdparty/site_config/standard/smartinvestor.de.txt +++ b/inc/3rdparty/site_config/standard/smartinvestor.de.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //td[@class='hweissblau2'] | 1 | title: //td[@class='hweissblau2'] |
2 | body: //p[@class='copy'] | //div[@class='Section1'] | 2 | body: //p[@class='copy'] | //div[@class='Section1'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://www.smartinvestor.de/news/smartinvestor/detail.hbs?itemid=item949496655&recnr=14593 \ No newline at end of file | 5 | test_url: http://www.smartinvestor.de/news/smartinvestor/detail.hbs?itemid=item949496655&recnr=14593 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sme.sk.txt b/inc/3rdparty/site_config/standard/sme.sk.txt index c3d01ffb..d41612cc 100644..100755 --- a/inc/3rdparty/site_config/standard/sme.sk.txt +++ b/inc/3rdparty/site_config/standard/sme.sk.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //meta[@property='og:title']/@content | 1 | title: //meta[@property='og:title']/@content |
2 | date: //p[@class='autor_line']/b/text() | 2 | date: //p[@class='autor_line']/b/text() |
3 | test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html \ No newline at end of file | 3 | test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/smithsonianmag.com.txt b/inc/3rdparty/site_config/standard/smithsonianmag.com.txt index 10a3f717..3e8fee95 100644..100755 --- a/inc/3rdparty/site_config/standard/smithsonianmag.com.txt +++ b/inc/3rdparty/site_config/standard/smithsonianmag.com.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | # meta data | 1 | # meta data |
2 | title://h1[@id = 'articleTitle'] | 2 | title://h1[@id = 'articleTitle'] |
3 | author:substring-after(//ul[@id = 'byLine']/li[1],'By ') | 3 | author:substring-after(//ul[@id = 'byLine']/li[1],'By ') |
4 | date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',') | 4 | date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',') |
5 | body://div[@id = 'article-body'] | 5 | body://div[@id = 'article-body'] |
6 | 6 | ||
7 | # full content | 7 | # full content |
8 | single_page_link://td/li[@class = 'article-singlepage']/a | 8 | single_page_link://td/li[@class = 'article-singlepage']/a |
9 | 9 | ||
10 | # caption clean up | 10 | # caption clean up |
11 | wrap_in(i)://span[@class='articleImageCaptionwide'] | 11 | wrap_in(i)://span[@class='articleImageCaptionwide'] |
12 | move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p | 12 | move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p |
13 | 13 | ||
14 | 14 | ||
15 | # clean up | 15 | # clean up |
16 | strip://p[@id = 'articlePaginationWrapper'] | 16 | strip://p[@id = 'articlePaginationWrapper'] |
17 | strip://ul[contains(@class, 'cat-breadcrumb')] | 17 | strip://ul[contains(@class, 'cat-breadcrumb')] |
18 | strip://div [@class= 'viewMorePhotos'] | 18 | strip://div [@class= 'viewMorePhotos'] |
19 | 19 | ||
20 | test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html \ No newline at end of file | 20 | test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/smokingapples.com.txt b/inc/3rdparty/site_config/standard/smokingapples.com.txt index e22af7a9..c68c1321 100644..100755 --- a/inc/3rdparty/site_config/standard/smokingapples.com.txt +++ b/inc/3rdparty/site_config/standard/smokingapples.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h2[@class='custom-entry-title'] | 1 | title: //h2[@class='custom-entry-title'] |
2 | author: substring-after(//span[@class='author vcard'],'by ') | 2 | author: substring-after(//span[@class='author vcard'],'by ') |
3 | date: substring-after(//span[@class='publ'],'Published on ') | 3 | date: substring-after(//span[@class='publ'],'Published on ') |
4 | body: //div[@class='postentry-content'] | 4 | body: //div[@class='postentry-content'] |
5 | test_url: http://smokingapples.com/software/popclip-for-mac/ \ No newline at end of file | 5 | test_url: http://smokingapples.com/software/popclip-for-mac/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/somethingawful.com.txt b/inc/3rdparty/site_config/standard/somethingawful.com.txt new file mode 100755 index 00000000..48547948 --- /dev/null +++ b/inc/3rdparty/site_config/standard/somethingawful.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id = 'content-area'] | ||
3 | author: //p[contains(@class, 'byline')]/a | ||
4 | autodetect_next_page: yes | ||
5 | tidy: no | ||
6 | |||
7 | strip_id_or_class: articleid | ||
8 | strip_id_or_class: logo | ||
9 | strip_id_or_class: pagebar | ||
10 | strip_id_or_class: featurenavlinks | ||
11 | strip_id_or_class: featured_frontpage | ||
12 | strip_id_or_class: sidebar | ||
13 | strip_id_or_class: footer | ||
14 | strip_id_or_class: byline | ||
15 | strip_id_or_class: logo | ||
16 | strip_id_or_class: nav_network | ||
17 | test_url: http://www.somethingawful.com/d/dungeons-and-dragons/wtf-monster-manual.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/songshuhui.net.txt b/inc/3rdparty/site_config/standard/songshuhui.net.txt new file mode 100755 index 00000000..a9233593 --- /dev/null +++ b/inc/3rdparty/site_config/standard/songshuhui.net.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://songshuhui.net/archives/65522 | ||
3 | # http://songshuhui.net/archives/75760 | ||
4 | title://h2/span/a | ||
5 | date:substring-before(substring-after(//div[@class='atrctitle']/div, '发表于'),' |') | ||
6 | body://div[@class='entry'] | ||
7 | test_url: http://songshuhui.net/archives/74819 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sourcebooks.com.txt b/inc/3rdparty/site_config/standard/sourcebooks.com.txt index 668fc44a..b52169da 100644..100755 --- a/inc/3rdparty/site_config/standard/sourcebooks.com.txt +++ b/inc/3rdparty/site_config/standard/sourcebooks.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | #grab the actual content div | 1 | #grab the actual content div |
2 | body: //div[@class='rt-article'] | 2 | body: //div[@class='rt-article'] |
3 | 3 | ||
4 | test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html \ No newline at end of file | 4 | test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/spectator.co.uk.txt b/inc/3rdparty/site_config/standard/spectator.co.uk.txt index a05c8395..d0605ed2 100644..100755 --- a/inc/3rdparty/site_config/standard/spectator.co.uk.txt +++ b/inc/3rdparty/site_config/standard/spectator.co.uk.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text() | 1 | author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text() |
2 | 2 | ||
3 | body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body'] | 3 | body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body'] |
4 | 4 | ||
5 | # Not very helpful, the title and author are contained by the same element that contains the body | 5 | # Not very helpful, the title and author are contained by the same element that contains the body |
6 | strip: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/h2 | /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link'] | 6 | strip: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/h2 | /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link'] |
7 | test_url: http://www.spectator.co.uk/arts-and-culture/night-and-day/7449683/spotify-sunday-my-personal-soundtrack.thtml \ No newline at end of file | 7 | test_url: http://www.spectator.co.uk/arts-and-culture/night-and-day/7449683/spotify-sunday-my-personal-soundtrack.thtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt b/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt index 4b0704a8..aea3627e 100644..100755 --- a/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt +++ b/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body://div[@class="articleBody"] | 1 | body://div[@class="articleBody"] |
2 | author://p[@class="articleBodyTtl"] | 2 | author://p[@class="articleBodyTtl"] |
3 | test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/ \ No newline at end of file | 3 | test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/speirs.org.txt b/inc/3rdparty/site_config/standard/speirs.org.txt index 3bf859e3..3bf859e3 100644..100755 --- a/inc/3rdparty/site_config/standard/speirs.org.txt +++ b/inc/3rdparty/site_config/standard/speirs.org.txt | |||
diff --git a/inc/3rdparty/site_config/standard/spiegel.de.txt b/inc/3rdparty/site_config/standard/spiegel.de.txt index 390c075c..413e0155 100644..100755 --- a/inc/3rdparty/site_config/standard/spiegel.de.txt +++ b/inc/3rdparty/site_config/standard/spiegel.de.txt | |||
@@ -1,75 +1,75 @@ | |||
1 | # A. Niepel, narya.de@... | 1 | # A. Niepel, narya.de@... |
2 | # - added single_page_link | 2 | # - added single_page_link |
3 | # - added author for default and single page view | 3 | # - added author for default and single page view |
4 | # - added date for single page view | 4 | # - added date for single page view |
5 | # fforst@... | 5 | # fforst@... |
6 | # - Fixed it | 6 | # - Fixed it |
7 | # bode2104@... | 7 | # bode2104@... |
8 | # - Fixed single_page_link | 8 | # - Fixed single_page_link |
9 | # - Included intro text in single page view | 9 | # - Included intro text in single page view |
10 | # - Added body in default view | 10 | # - Added body in default view |
11 | 11 | ||
12 | # set body | 12 | # set body |
13 | tidy: no | 13 | tidy: no |
14 | # body in single page view | 14 | # body in single page view |
15 | body: //div[@id="spArticleContent"] | 15 | body: //div[@id="spArticleContent"] |
16 | # body in default view | 16 | # body in default view |
17 | body: //div[@id="spArticleSection"] | 17 | body: //div[@id="spArticleSection"] |
18 | # body in "Fotostrecke" | 18 | # body in "Fotostrecke" |
19 | body: //div[@id="spBigaContent"] | 19 | body: //div[@id="spBigaContent"] |
20 | 20 | ||
21 | # set date in single page view | 21 | # set date in single page view |
22 | date: //div[@id="spArticleContent"]/h3 | 22 | date: //div[@id="spArticleContent"]/h3 |
23 | # strip date | 23 | # strip date |
24 | strip: //div[@id="spArticleContent"]/h3 | 24 | strip: //div[@id="spArticleContent"]/h3 |
25 | # set date in "Fotostrecke" | 25 | # set date in "Fotostrecke" |
26 | date: //div[@id="spBigaDatum"] | 26 | date: //div[@id="spBigaDatum"] |
27 | 27 | ||
28 | #set title in single page view | 28 | #set title in single page view |
29 | title: //div[@id='spArticleContent']/h2 | 29 | title: //div[@id='spArticleContent']/h2 |
30 | # strip title | 30 | # strip title |
31 | strip: //div[@id='spArticleContent']/h1 | 31 | strip: //div[@id='spArticleContent']/h1 |
32 | strip: //div[@id='spArticleContent']/h2 | 32 | strip: //div[@id='spArticleContent']/h2 |
33 | #set title in "Fotostrecke" | 33 | #set title in "Fotostrecke" |
34 | title: //div[@class='spBigaHeadline'] | 34 | title: //div[@class='spBigaHeadline'] |
35 | 35 | ||
36 | # set author | 36 | # set author |
37 | author: //p[@class="spAuthor"]/a | 37 | author: //p[@class="spAuthor"]/a |
38 | author: substring-after(//p[@class="spAuthor"], 'Von ') | 38 | author: substring-after(//p[@class="spAuthor"], 'Von ') |
39 | # strip author | 39 | # strip author |
40 | strip: //p[@class='spAuthor'] | 40 | strip: //p[@class='spAuthor'] |
41 | 41 | ||
42 | # remove captions | 42 | # remove captions |
43 | strip: //*/span[@class='spPicLayerText'] | 43 | strip: //*/span[@class='spPicLayerText'] |
44 | strip: //*/div[@class='spPanoPlayerPaneControl'] | 44 | strip: //*/div[@class='spPanoPlayerPaneControl'] |
45 | strip: //*/div[@class='spCredit'] | 45 | strip: //*/div[@class='spCredit'] |
46 | strip: //*/div[@class='spCredit']/following-sibling::p | 46 | strip: //*/div[@class='spCredit']/following-sibling::p |
47 | 47 | ||
48 | # remove ads | 48 | # remove ads |
49 | strip: //div[@class='spMInline'] | 49 | strip: //div[@class='spMInline'] |
50 | 50 | ||
51 | # remove photogalleries and extras | 51 | # remove photogalleries and extras |
52 | strip: //div[@class='spPhotoGallery'] | 52 | strip: //div[@class='spPhotoGallery'] |
53 | strip: //div[@class='spPhotoGallery']/following-sibling::br | 53 | strip: //div[@class='spPhotoGallery']/following-sibling::br |
54 | strip: //div[@class='spAssetAlignleft'] | 54 | strip: //div[@class='spAssetAlignleft'] |
55 | strip: //div[contains(@class,'spAsset')] | 55 | strip: //div[contains(@class,'spAsset')] |
56 | strip: //br[@clear='all'] | 56 | strip: //br[@clear='all'] |
57 | 57 | ||
58 | # remove community functions | 58 | # remove community functions |
59 | strip: //div[@id='spSocialBookmark'] | 59 | strip: //div[@id='spSocialBookmark'] |
60 | strip: //div[contains(@class, 'spCommunityBox')] | 60 | strip: //div[contains(@class, 'spCommunityBox')] |
61 | strip: //div[contains(@class, 'spArticleNewsfeedBox')] | 61 | strip: //div[contains(@class, 'spArticleNewsfeedBox')] |
62 | strip: //div[@class='spArticleCredit'] | 62 | strip: //div[@class='spArticleCredit'] |
63 | 63 | ||
64 | # remove clutter in "Fotostrecke" | 64 | # remove clutter in "Fotostrecke" |
65 | strip: //div[@id='spBreadcrumb'] | 65 | strip: //div[@id='spBreadcrumb'] |
66 | strip: //div[@id='spBigaLatestEntries'] | 66 | strip: //div[@id='spBigaLatestEntries'] |
67 | strip: //div[contains(@class, 'spBigaNavi')] | 67 | strip: //div[contains(@class, 'spBigaNavi')] |
68 | strip: //div[@class='spDottedLine'] | 68 | strip: //div[@class='spDottedLine'] |
69 | 69 | ||
70 | # Use link to print article for single page view | 70 | # Use link to print article for single page view |
71 | single_page_link: //a[contains(@href, '-druck')] | 71 | single_page_link: //a[contains(@href, '-druck')] |
72 | 72 | ||
73 | # use next link in "Fotostrecke" | 73 | # use next link in "Fotostrecke" |
74 | next_page_link: //a[@class='spBigaControlForw'] | 74 | next_page_link: //a[@class='spBigaControlForw'] |
75 | test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html \ No newline at end of file | 75 | test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/spiked-online.com.txt b/inc/3rdparty/site_config/standard/spiked-online.com.txt new file mode 100755 index 00000000..7ec39c2b --- /dev/null +++ b/inc/3rdparty/site_config/standard/spiked-online.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@id='articleTitleWrapper' or @id='mainFeature']//h1 | ||
2 | author: //*[@id='authorNameJob']//a | ||
3 | date: //div[@id='articleMeta']/p | ||
4 | body: //div[@id='mainFeature']//img | //div[contains(@class, 'fullText')] | ||
5 | |||
6 | test_url: http://www.spiked-online.com/newsite/article/standing_up_to_the_white-coated_gods_of_fortune/13785 | ||
7 | test_url: http://www.spiked-online.com/newsite/article/sex_box_and_the_crisis_of_intimacy/14168 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/spin.com.txt b/inc/3rdparty/site_config/standard/spin.com.txt index 66f6192b..88eb454c 100644..100755 --- a/inc/3rdparty/site_config/standard/spin.com.txt +++ b/inc/3rdparty/site_config/standard/spin.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | body: //section[contains(@class, 'main')] | 2 | body: //section[contains(@class, 'main')] |
3 | strip: //footer | 3 | strip: //footer |
4 | strip: //a[@class='paginated'] | 4 | strip: //a[@class='paginated'] |
5 | test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare \ No newline at end of file | 5 | test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/splatf.com.txt b/inc/3rdparty/site_config/standard/splatf.com.txt index d5671652..3e05a225 100644..100755 --- a/inc/3rdparty/site_config/standard/splatf.com.txt +++ b/inc/3rdparty/site_config/standard/splatf.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author:string('Dan Frommer/SplatF') | 1 | author:string('Dan Frommer/SplatF') |
2 | date://div[@class='postdate'] | 2 | date://div[@class='postdate'] |
3 | body://div[@class='entry'] | 3 | body://div[@class='entry'] |
4 | title://div[@class='post']/h1 | 4 | title://div[@class='post']/h1 |
5 | test_url: http://www.splatf.com/2012/02/month-six/ \ No newline at end of file | 5 | test_url: http://www.splatf.com/2012/02/month-six/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/splitsider.com.txt b/inc/3rdparty/site_config/standard/splitsider.com.txt index d1d392e7..4bbc7aac 100644..100755 --- a/inc/3rdparty/site_config/standard/splitsider.com.txt +++ b/inc/3rdparty/site_config/standard/splitsider.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author: //div[@class='byline']/a | 1 | author: //div[@class='byline']/a |
2 | date: //div[@id='date'] | 2 | date: //div[@id='date'] |
3 | body: //div[@class='entry'] | 3 | body: //div[@class='entry'] |
4 | test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/ \ No newline at end of file | 4 | test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sport.detik.com.txt b/inc/3rdparty/site_config/standard/sport.detik.com.txt index b404b829..18552d1e 100644..100755 --- a/inc/3rdparty/site_config/standard/sport.detik.com.txt +++ b/inc/3rdparty/site_config/standard/sport.detik.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title://div[@class="content_detail"]/h1 | 1 | title://div[@class="content_detail"]/h1 |
2 | 2 | ||
3 | author://div[@class="author"]/strong | 3 | author://div[@class="author"]/strong |
4 | 4 | ||
5 | date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB') | 5 | date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB') |
6 | 6 | ||
7 | body://div[@class='text_detail'] | 7 | body://div[@class='text_detail'] |
8 | test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270 \ No newline at end of file | 8 | test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sport.orf.at.txt b/inc/3rdparty/site_config/standard/sport.orf.at.txt index a794ded9..f0be85c7 100644..100755 --- a/inc/3rdparty/site_config/standard/sport.orf.at.txt +++ b/inc/3rdparty/site_config/standard/sport.orf.at.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | 1 | single_page_link: //div[@id='content']//p[@class='readMore']/a |
2 | 2 | ||
3 | title: //div[@class='hidden offscreen']/h2 | 3 | title: //div[@class='hidden offscreen']/h2 |
4 | body: //div[@id="storyText"] | 4 | body: //div[@id="storyText"] |
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | 5 | move_into(//div[@id='storyText']): //div[@class='fact'] |
6 | strip: //small[@class='credit'] | 6 | strip: //small[@class='credit'] |
7 | strip: //small[@class='caption'] | 7 | strip: //small[@class='caption'] |
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | 8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') |
9 | strip: //p[@class='toplink'] | 9 | strip: //p[@class='toplink'] |
10 | 10 | ||
11 | test_url: http://sport.orf.at/stories/2084851/ \ No newline at end of file | 11 | test_url: http://sport.orf.at/stories/2084851/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sport365.fr.txt b/inc/3rdparty/site_config/standard/sport365.fr.txt new file mode 100755 index 00000000..8688f40b --- /dev/null +++ b/inc/3rdparty/site_config/standard/sport365.fr.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //h2[contains(@class, 'body_head')] | //div[@id='img_article' or contains(@class, 'body_content')] | ||
2 | body: //div[contains(@class, 'cpanel')]//div[contains(@class, 'thumbnails')] | ||
3 | prune: no | ||
4 | strip: //div[starts-with(@class, 'actu_')] | ||
5 | strip: //div[contains(@class, 'data')] | ||
6 | |||
7 | test_url: http://www.sport365.fr/basketball/nba/new-york-accord-avec-toronto-pour-bargnani-1038773.shtml | ||
8 | test_url: http://www.sport365.fr/rss.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sports.espn.go.com.txt b/inc/3rdparty/site_config/standard/sports.espn.go.com.txt index e0f8223c..8c21ef2b 100644..100755 --- a/inc/3rdparty/site_config/standard/sports.espn.go.com.txt +++ b/inc/3rdparty/site_config/standard/sports.espn.go.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 | 1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 |
2 | body: //div[contains(@class, 'article')] | 2 | body: //div[contains(@class, 'article')] |
3 | strip: //div[contains(@class, 'mod-inline')] | 3 | strip: //div[contains(@class, 'mod-inline')] |
4 | strip: //*/span[@class='page-actions']/a | 4 | strip: //*/span[@class='page-actions']/a |
5 | strip: //*/span[@class='page-actions']/a | 5 | strip: //*/span[@class='page-actions']/a |
6 | strip: //div[@class='page-actions']/* | 6 | strip: //div[@class='page-actions']/* |
7 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 | 7 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 |
8 | strip: //div[@class='mod-blog-navigation'] | 8 | strip: //div[@class='mod-blog-navigation'] |
9 | strip: //div[@class='monthday'] | 9 | strip: //div[@class='monthday'] |
10 | strip: //div[@class='time'] | 10 | strip: //div[@class='time'] |
11 | strip: //div[@class='timeofday'] | 11 | strip: //div[@class='timeofday'] |
12 | test_url: http://sports.espn.go.com/espn/page2/story?page=simmonsnfl2010/lebron_james_return_clevelend&sportCat=nba \ No newline at end of file | 12 | test_url: http://sports.espn.go.com/espn/page2/story?page=simmonsnfl2010/lebron_james_return_clevelend&sportCat=nba \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sports.yahoo.com.txt b/inc/3rdparty/site_config/standard/sports.yahoo.com.txt index 96a3bb71..b0f57e2c 100644..100755 --- a/inc/3rdparty/site_config/standard/sports.yahoo.com.txt +++ b/inc/3rdparty/site_config/standard/sports.yahoo.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //div[@id='article']/div[@class='hd']/h1 | 1 | title: //div[@id='article']/div[@class='hd']/h1 |
2 | body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0'] | 2 | body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0'] |
3 | strip: //div[@class='foot'] | 3 | strip: //div[@class='foot'] |
4 | strip: //div[@id='sidebar']//div[@class='ft'] | 4 | strip: //div[@id='sidebar']//div[@class='ft'] |
5 | strip: //p[@id='byline']//em | 5 | strip: //p[@id='byline']//em |
6 | tidy: no | 6 | tidy: no |
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals \ No newline at end of file | 9 | test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sportschau.de.txt b/inc/3rdparty/site_config/standard/sportschau.de.txt index 6500e75c..1e58b520 100644..100755 --- a/inc/3rdparty/site_config/standard/sportschau.de.txt +++ b/inc/3rdparty/site_config/standard/sportschau.de.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | title://div[@id='ardContent']/h1 | 1 | title://div[@id='ardContent']/h1 |
2 | 2 | ||
3 | author://p[@id='ardAutor'] | 3 | author://p[@id='ardAutor'] |
4 | author://span[@id='ardQuelle'] | 4 | author://span[@id='ardQuelle'] |
5 | author:string('sportschau.de') | 5 | author:string('sportschau.de') |
6 | 6 | ||
7 | date:substring-after(//span[@id='ardStand'], 'Stand: ') | 7 | date:substring-after(//span[@id='ardStand'], 'Stand: ') |
8 | 8 | ||
9 | body://div[@id='ardContent'] | 9 | body://div[@id='ardContent'] |
10 | 10 | ||
11 | strip://div[@id='ardContent']/h1 | 11 | strip://div[@id='ardContent']/h1 |
12 | strip://p[@id='ardAutor'] | 12 | strip://p[@id='ardAutor'] |
13 | strip: //div[@class='embeddedPlayer_clipinfo'] | 13 | strip: //div[@class='embeddedPlayer_clipinfo'] |
14 | strip: //div[@class='ardMehrZumThemaRechts'] | 14 | strip: //div[@class='ardMehrZumThemaRechts'] |
15 | strip: //*[contains(@class, 'inv')] | 15 | strip: //*[contains(@class, 'inv')] |
16 | 16 | ||
17 | strip: //p[@id='ardAbbinder'] | 17 | strip: //p[@id='ardAbbinder'] |
18 | strip: //div[@class='socialBookmarks'] | 18 | strip: //div[@class='socialBookmarks'] |
19 | strip: //div[@id='ardContentEnd'] | 19 | strip: //div[@id='ardContentEnd'] |
20 | strip: //div[@id='ardDisclaimer'] | 20 | strip: //div[@id='ardDisclaimer'] |
21 | strip: //div[@id='ardRechteSpalte'] | 21 | strip: //div[@id='ardRechteSpalte'] |
22 | test_url: http://www.sportschau.de/sp/fussball/news201203/17/analyse_leverkusen_gladbach.jsp \ No newline at end of file | 22 | test_url: http://www.sportschau.de/sp/fussball/news201203/17/analyse_leverkusen_gladbach.jsp \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt b/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt index afc5879f..b3da8138 100644..100755 --- a/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt +++ b/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt | |||
@@ -1,26 +1,26 @@ | |||
1 | # main sportsillustrated.com articles | 1 | # main sportsillustrated.com articles |
2 | # | 2 | # |
3 | body: //div[@id="cnnStoryContent"] | 3 | body: //div[@id="cnnStoryContent"] |
4 | title: //div[@id="cnnStoryHeadline"]//h1 | 4 | title: //div[@id="cnnStoryHeadline"]//h1 |
5 | author: //div[@id="cnnSubBanner"]//strong | 5 | author: //div[@id="cnnSubBanner"]//strong |
6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") | 6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") |
7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") | 7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") |
8 | 8 | ||
9 | # kill ugly font buttons | 9 | # kill ugly font buttons |
10 | strip: //div[@id="cnnSCFontButtons"] | 10 | strip: //div[@id="cnnSCFontButtons"] |
11 | 11 | ||
12 | # kill misc filler videos & etc | 12 | # kill misc filler videos & etc |
13 | strip: //div[@class="cnnDivideContent"] | 13 | strip: //div[@class="cnnDivideContent"] |
14 | strip: //*[@class="cnnTMbox"] | 14 | strip: //*[@class="cnnTMbox"] |
15 | 15 | ||
16 | # si vault articles | 16 | # si vault articles |
17 | # ------------- | 17 | # ------------- |
18 | body: //div[@class="siv_artPara"] | 18 | body: //div[@class="siv_artPara"] |
19 | title: //div[@class="siv_artHeader"]//h1 | 19 | title: //div[@class="siv_artHeader"]//h1 |
20 | author: //div[@class="byline"] | 20 | author: //div[@class="byline"] |
21 | date: //div[@class="date"] | 21 | date: //div[@class="date"] |
22 | 22 | ||
23 | next_page_link: //div[@id='cnnStoryContinue']/a | 23 | next_page_link: //div[@id='cnnStoryContinue']/a |
24 | strip_id_or_class: cnnstorypagination | 24 | strip_id_or_class: cnnstorypagination |
25 | 25 | ||
26 | test_url: http://sportsillustrated.cnn.com/2012/writers/peter_king/02/27/combine/index.html \ No newline at end of file | 26 | test_url: http://sportsillustrated.cnn.com/2012/writers/peter_king/02/27/combine/index.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sprengsatz.de.txt b/inc/3rdparty/site_config/standard/sprengsatz.de.txt index 16636bc5..5b683811 100644..100755 --- a/inc/3rdparty/site_config/standard/sprengsatz.de.txt +++ b/inc/3rdparty/site_config/standard/sprengsatz.de.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | author: string('Michael Spreng') | 2 | author: string('Michael Spreng') |
3 | date: //div[@class='date'] | 3 | date: //div[@class='date'] |
4 | body: //div[@class='entry'] | 4 | body: //div[@class='entry'] |
5 | test_url: http://www.sprengsatz.de/?p=3691 \ No newline at end of file | 5 | test_url: http://www.sprengsatz.de/?p=3691 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sqlite.org.txt b/inc/3rdparty/site_config/standard/sqlite.org.txt index 4872519a..15763c32 100644..100755 --- a/inc/3rdparty/site_config/standard/sqlite.org.txt +++ b/inc/3rdparty/site_config/standard/sqlite.org.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@id='ff-body'] | 1 | body: //div[@id='ff-body'] |
2 | 2 | ||
3 | replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center> | 3 | replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center> |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | test_url: http://www.sqlite.org/fileformat2.html \ No newline at end of file | 7 | test_url: http://www.sqlite.org/fileformat2.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt b/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt index 388209a9..8eae13ed 100644..100755 --- a/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt +++ b/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class='content'] | 1 | body: //div[@class='content'] |
2 | date: substring-before( //div[@class='unit dateAndNotes'], 'with') | 2 | date: substring-before( //div[@class='unit dateAndNotes'], 'with') |
3 | title: //h3 | 3 | title: //h3 |
4 | test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending \ No newline at end of file | 4 | test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stackoverflow.com.txt b/inc/3rdparty/site_config/standard/stackoverflow.com.txt index e5317bac..bb95e93a 100644..100755 --- a/inc/3rdparty/site_config/standard/stackoverflow.com.txt +++ b/inc/3rdparty/site_config/standard/stackoverflow.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2 | 1 | body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2 |
2 | 2 | ||
3 | replace_string(<div class="user-details"><br></div>): <!-- nothing --> | 3 | replace_string(<div class="user-details"><br></div>): <!-- nothing --> |
4 | replace_string(<div class="vote">): <div class="vote"><h3>Vote count: | 4 | replace_string(<div class="vote">): <div class="vote"><h3>Vote count: |
5 | 5 | ||
6 | strip_id_or_class: vote-up | 6 | strip_id_or_class: vote-up |
7 | strip_id_or_class: vote-down | 7 | strip_id_or_class: vote-down |
8 | strip_id_or_class: star-off | 8 | strip_id_or_class: star-off |
9 | strip_id_or_class: favoritecount | 9 | strip_id_or_class: favoritecount |
10 | strip_id_or_class: -share | 10 | strip_id_or_class: -share |
11 | strip_id_or_class: badgecount | 11 | strip_id_or_class: badgecount |
12 | 12 | ||
13 | 13 | ||
14 | test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax \ No newline at end of file | 14 | test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt b/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt index bde14217..a0f1587c 100644..100755 --- a/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt +++ b/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //div[@class='articleLeft']/h3 | 1 | title: //div[@class='articleLeft']/h3 |
2 | 2 | ||
3 | author: substring-after(//span[@class='articleAuthor']/a,'By ') | 3 | author: substring-after(//span[@class='articleAuthor']/a,'By ') |
4 | 4 | ||
5 | date: substring-before(//span[@class='articleDateTime'],'in ') | 5 | date: substring-before(//span[@class='articleDateTime'],'in ') |
6 | 6 | ||
7 | body: //div[@class='articleLeft'] | 7 | body: //div[@class='articleLeft'] |
8 | strip: //div[@class='articleMoreNews'] | 8 | strip: //div[@class='articleMoreNews'] |
9 | strip: //div[@class='articleLeft']/h3 | 9 | strip: //div[@class='articleLeft']/h3 |
10 | strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix'] | 10 | strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix'] |
11 | 11 | ||
12 | # Remove duplicate title from text | 12 | # Remove duplicate title from text |
13 | strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3 | 13 | strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3 |
14 | test_url: http://www.stalbansreview.co.uk/news/9581446.New_roundabout_in_King_Harry_Lane/r/?ref=rss \ No newline at end of file | 14 | test_url: http://www.stalbansreview.co.uk/news/9581446.New_roundabout_in_King_Harry_Lane/r/?ref=rss \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/standard.co.uk.txt b/inc/3rdparty/site_config/standard/standard.co.uk.txt index 22a33484..71a2bda1 100644..100755 --- a/inc/3rdparty/site_config/standard/standard.co.uk.txt +++ b/inc/3rdparty/site_config/standard/standard.co.uk.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | autodetect_next_page: no | 1 | autodetect_next_page: no |
2 | footnotes: no | 2 | footnotes: no |
3 | dissolve: //div[@class="column-2"]//div[@class="widget"] | 3 | dissolve: //div[@class="column-2"]//div[@class="widget"] |
4 | dissolve: //div[@class="column-2"]//div | 4 | dissolve: //div[@class="column-2"]//div |
5 | 5 | ||
6 | author: //div[@class="innerbyline"]/a | 6 | author: //div[@class="innerbyline"]/a |
7 | strip: //div[@class="innerbyline"]/a | 7 | strip: //div[@class="innerbyline"]/a |
8 | 8 | ||
9 | strip: //p[@class="dateline"] | 9 | strip: //p[@class="dateline"] |
10 | date: //p[@class="dateline"] | 10 | date: //p[@class="dateline"] |
11 | 11 | ||
12 | title: //h1[@class="title"] | 12 | title: //h1[@class="title"] |
13 | author: //div[@class="innerbyline"]/a | 13 | author: //div[@class="innerbyline"]/a |
14 | date: //p[@class="dateline"] | 14 | date: //p[@class="dateline"] |
15 | body: //div[@class="column-2"] | 15 | body: //div[@class="column-2"] |
16 | test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html \ No newline at end of file | 16 | test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/staradvertiser.com.txt b/inc/3rdparty/site_config/standard/staradvertiser.com.txt index 0579455f..254e2c2b 100644..100755 --- a/inc/3rdparty/site_config/standard/staradvertiser.com.txt +++ b/inc/3rdparty/site_config/standard/staradvertiser.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1[@id='storyTitle'] | 1 | title: //h1[@id='storyTitle'] |
2 | author: substring-after(//span[@class='hsa_postCredit'], 'By ') | 2 | author: substring-after(//span[@class='hsa_postCredit'], 'By ') |
3 | date://span[@class='hsa_dateStamp'] | 3 | date://span[@class='hsa_dateStamp'] |
4 | body: //div[@class='storytext'] | 4 | body: //div[@class='storytext'] |
5 | strip_id_or_class: insideStoryAd | 5 | strip_id_or_class: insideStoryAd |
6 | strip_id_or_class: printDesc | 6 | strip_id_or_class: printDesc |
7 | strip_id_or_class: sb_2010_story_tools | 7 | strip_id_or_class: sb_2010_story_tools |
8 | strip_id_or_class: FBConnectButton_Text | 8 | strip_id_or_class: FBConnectButton_Text |
9 | strip_id_or_class: breadcrumbs | 9 | strip_id_or_class: breadcrumbs |
10 | prune: no | 10 | prune: no |
11 | test_url: http://www.staradvertiser.com/news/20111112_World_leaders_step_onto_isle_stage.html \ No newline at end of file | 11 | test_url: http://www.staradvertiser.com/news/20111112_World_leaders_step_onto_isle_stage.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stephenfry.com.txt b/inc/3rdparty/site_config/standard/stephenfry.com.txt index 1169984f..efd1ec2b 100644..100755 --- a/inc/3rdparty/site_config/standard/stephenfry.com.txt +++ b/inc/3rdparty/site_config/standard/stephenfry.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: /html/head/meta[@name='title']/@content | 1 | title: /html/head/meta[@name='title']/@content |
2 | author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a | 2 | author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a |
3 | date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')] | 3 | date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')] |
4 | 4 | ||
5 | body: //div[@class='entry-content'] | 5 | body: //div[@class='entry-content'] |
6 | 6 | ||
7 | single_page_link: //p[@class='pagination']/a | 7 | single_page_link: //p[@class='pagination']/a |
8 | test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/ \ No newline at end of file | 8 | test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stlbeacon.org.txt b/inc/3rdparty/site_config/standard/stlbeacon.org.txt index d66fee9f..75379a9c 100644..100755 --- a/inc/3rdparty/site_config/standard/stlbeacon.org.txt +++ b/inc/3rdparty/site_config/standard/stlbeacon.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: article/h1 | 1 | title: article/h1 |
2 | author: //p[@class='byline'] | 2 | author: //p[@class='byline'] |
3 | date: //p[@class='date'] | 3 | date: //p[@class='date'] |
4 | body: //div[@class='body'] | 4 | body: //div[@class='body'] |
5 | test_url: https://www.stlbeacon.org/#!/content/23404/mogop_caucus_031712 \ No newline at end of file | 5 | test_url: https://www.stlbeacon.org/#!/content/23404/mogop_caucus_031712 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stockholm.etc.se.txt b/inc/3rdparty/site_config/standard/stockholm.etc.se.txt index 073043d5..2f4f8cb8 100644..100755 --- a/inc/3rdparty/site_config/standard/stockholm.etc.se.txt +++ b/inc/3rdparty/site_config/standard/stockholm.etc.se.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | strip_id_or_class: 'left' | 1 | strip_id_or_class: 'left' |
2 | strip_id_or_class: 'right' | 2 | strip_id_or_class: 'right' |
3 | strip_id_or_class: 'block-belowcontent' | 3 | strip_id_or_class: 'block-belowcontent' |
4 | 4 | ||
5 | test_url: http://stockholm.etc.se/reportage/bakom-stangda-dorrar-pa-fas-3-massa \ No newline at end of file | 5 | test_url: http://stockholm.etc.se/reportage/bakom-stangda-dorrar-pa-fas-3-massa \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stockholmsfria.nu.txt b/inc/3rdparty/site_config/standard/stockholmsfria.nu.txt new file mode 100755 index 00000000..cc8c28b8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/stockholmsfria.nu.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.stockholmsfria.nu/artikel/112068 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/straightdope.com.txt b/inc/3rdparty/site_config/standard/straightdope.com.txt new file mode 100755 index 00000000..f01d7ad1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/straightdope.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='article' or @id='current_illustration'] | ||
2 | title: //div[@id='article']//h1 | ||
3 | date: //div[@id='article']//div[@class='date'] | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.straightdope.com/columns/read/947/whatever-happened-to-adoption-of-the-metric-system-in-the-u-s \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/streetsblog.net.txt b/inc/3rdparty/site_config/standard/streetsblog.net.txt index 0b62a3d6..6cf03ca6 100644..100755 --- a/inc/3rdparty/site_config/standard/streetsblog.net.txt +++ b/inc/3rdparty/site_config/standard/streetsblog.net.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h2[@class="post-title"] | 1 | title: //h2[@class="post-title"] |
2 | date: //span[@class="post-date"] | 2 | date: //span[@class="post-date"] |
3 | body: //div[@class="post-entry"] | 3 | body: //div[@class="post-entry"] |
4 | 4 | ||
5 | #This is also good for *.streetsblog.org, for example: | 5 | #This is also good for *.streetsblog.org, for example: |
6 | #http://dc.streetsblog.org/2011/10/21/friday-job-market/ | 6 | #http://dc.streetsblog.org/2011/10/21/friday-job-market/ |
7 | test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/ \ No newline at end of file | 7 | test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stuff.co.nz.txt b/inc/3rdparty/site_config/standard/stuff.co.nz.txt index 12fd0939..3756092c 100644..100755 --- a/inc/3rdparty/site_config/standard/stuff.co.nz.txt +++ b/inc/3rdparty/site_config/standard/stuff.co.nz.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | title://div[@id='left_col']/h1 | 1 | title://div[@id='left_col']/h1 |
2 | author:substring-after(//span[contains(@class,'storycredit')],'BY ') | 2 | author:substring-after(//span[contains(@class,'storycredit')],'BY ') |
3 | author://span[contains(@class,'storycredit')] | 3 | author://span[contains(@class,'storycredit')] |
4 | date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ') | 4 | date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ') |
5 | date://div[contains(@class,'toolbox_date')] | 5 | date://div[contains(@class,'toolbox_date')] |
6 | body://div[@id='left_col'] | 6 | body://div[@id='left_col'] |
7 | 7 | ||
8 | strip_id_or_class: toolbox | 8 | strip_id_or_class: toolbox |
9 | strip_id_or_class: story_features | 9 | strip_id_or_class: story_features |
10 | strip_id_or_class: sharebox_new | 10 | strip_id_or_class: sharebox_new |
11 | strip_id_or_class: related_box | 11 | strip_id_or_class: related_box |
12 | strip_id_or_class: sponsored_links | 12 | strip_id_or_class: sponsored_links |
13 | strip_id_or_class: hidden_ad | 13 | strip_id_or_class: hidden_ad |
14 | strip_id_or_class: story_content_top | 14 | strip_id_or_class: story_content_top |
15 | strip_id_or_class: total_number | 15 | strip_id_or_class: total_number |
16 | strip_id_or_class: sort_order | 16 | strip_id_or_class: sort_order |
17 | strip_id_or_class: subscribe_order | 17 | strip_id_or_class: subscribe_order |
18 | 18 | ||
19 | strip://div[contains(@class,'ad_story')] | 19 | strip://div[contains(@class,'ad_story')] |
20 | 20 | ||
21 | test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge | 21 | test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge |
22 | test_url: http://www.stuff.co.nz/entertainment/7045944/International-praise-for-Ladyhawke \ No newline at end of file | 22 | test_url: http://www.stuff.co.nz/entertainment/7045944/International-praise-for-Ladyhawke \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/stumbleupon.com.txt b/inc/3rdparty/site_config/standard/stumbleupon.com.txt index 85682166..9adc3c50 100644..100755 --- a/inc/3rdparty/site_config/standard/stumbleupon.com.txt +++ b/inc/3rdparty/site_config/standard/stumbleupon.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link: //iframe[@id='stumbleFrame']/@src | 1 | single_page_link: //iframe[@id='tb-stumble-frame']/@src |
2 | 2 | ||
3 | test_url: www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/ \ No newline at end of file | 3 | test_url: http://www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/subtraction.com.txt b/inc/3rdparty/site_config/standard/subtraction.com.txt index 454e37b1..9ba6eb77 100644..100755 --- a/inc/3rdparty/site_config/standard/subtraction.com.txt +++ b/inc/3rdparty/site_config/standard/subtraction.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | title: //*[@id='posts']/div[1]/h2 | 1 | title: //*[@id='posts']/div[1]/h2 |
2 | author: //*[@id='posts']/div[1]/div[2]/span[2]/a | 2 | author: //*[@id='posts']/div[1]/div[2]/span[2]/a |
3 | date: //*[@class='date'] | 3 | date: //*[@class='date'] |
4 | body: //div[@class='body-lead'] | 4 | body: //div[@class='body-lead'] |
5 | 5 | ||
6 | # take out the bit saying 'body' | 6 | # take out the bit saying 'body' |
7 | strip: //div[@class='body-lead']/div[@class='info-label'] | 7 | strip: //div[@class='body-lead']/div[@class='info-label'] |
8 | 8 | ||
9 | 9 | ||
10 | 10 | ||
11 | 11 | ||
12 | 12 | ||
13 | 13 | ||
14 | 14 | ||
15 | 15 | ||
16 | 16 | ||
17 | test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations \ No newline at end of file | 17 | test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/sueddeutsche.de.txt index 4aa9410c..74b8d451 100644..100755 --- a/inc/3rdparty/site_config/standard/sueddeutsche.de.txt +++ b/inc/3rdparty/site_config/standard/sueddeutsche.de.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@... | 1 | # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@... |
2 | 2 | ||
3 | single_page_link: //a[ contains( @href, "/2.220/" ) ] | 3 | single_page_link: //a[ contains( @href, "/2.220/" ) ] |
4 | 4 | ||
5 | body: //article[@id="sitecontent"]/section[@class="body"] | 5 | body: //article[@id="sitecontent"]/section[@class="body"] |
6 | author: //address[@class="author"] | 6 | author: //address[@class="author"] |
7 | date: //div[@class="header"]//h1//span[@class="updated"] | 7 | date: //div[@class="header"]//h1//span[@class="updated"] |
8 | wrap_in(small): //div[@class="footer"] | 8 | wrap_in(small): //div[@class="footer"] |
9 | wrap_in(i): //figcaption/h3 | 9 | wrap_in(i): //figcaption/h3 |
10 | dissolve: //figcaption//h3 | 10 | dissolve: //figcaption//h3 |
11 | dissolve: //figure/div[@class="body"] | 11 | dissolve: //figure/div[@class="body"] |
12 | dissolve: //figure/a | 12 | dissolve: //figure/a |
13 | 13 | ||
14 | strip: //figure[ not( contains(@class, "zoomimage" ) ) ] | 14 | strip: //figure[ not( contains(@class, "zoomimage" ) ) ] |
15 | strip: //div[@data-onlineonly="true"] | 15 | strip: //div[@data-onlineonly="true"] |
16 | strip: //address[@class="author"] | 16 | strip: //address[@class="author"] |
17 | 17 | ||
18 | test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693 \ No newline at end of file | 18 | test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/summify.com.txt b/inc/3rdparty/site_config/standard/summify.com.txt index 1128e1bb..1128e1bb 100644..100755 --- a/inc/3rdparty/site_config/standard/summify.com.txt +++ b/inc/3rdparty/site_config/standard/summify.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/suntimes.com.txt b/inc/3rdparty/site_config/standard/suntimes.com.txt index 13390e4f..6d4594cf 100644..100755 --- a/inc/3rdparty/site_config/standard/suntimes.com.txt +++ b/inc/3rdparty/site_config/standard/suntimes.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //div[@class='story-details']/h1 | 1 | title: //div[@class='story-details']/h1 |
2 | date: //span[@class='date-time'] | 2 | date: //span[@class='date-time'] |
3 | Author: substring-after(//p[@class='by-line'], 'By ') | 3 | Author: substring-after(//p[@class='by-line'], 'By ') |
4 | 4 | ||
5 | strip: //div[@class='videoThumbnails'] | 5 | strip: //div[@class='videoThumbnails'] |
6 | strip: //div[@class='ad-square2-container'] | 6 | strip: //div[@class='ad-square2-container'] |
7 | strip: //div[@class='homeDeliveryContainer5'] | 7 | strip: //div[@class='homeDeliveryContainer5'] |
8 | 8 | ||
9 | strip: //div[@class='image-description'] | 9 | strip: //div[@class='image-description'] |
10 | strip: //div[@id='internal-side-bar'] | 10 | strip: //div[@id='internal-side-bar'] |
11 | 11 | ||
12 | strip: //span[@class='hide'] | 12 | strip: //span[@class='hide'] |
13 | strip: //div[@class='date'] | 13 | strip: //div[@class='date'] |
14 | test_url: http://www.suntimes.com/technology/ihnatko/8816567-452/review-kindle-fire-is-no-ipad-killer-but-it-is-a-killer-device.html \ No newline at end of file | 14 | test_url: http://www.suntimes.com/technology/ihnatko/8816567-452/review-kindle-fire-is-no-ipad-killer-but-it-is-a-killer-device.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/svd.se.txt b/inc/3rdparty/site_config/standard/svd.se.txt index 02b5b8ca..bc0a1ca0 100644..100755 --- a/inc/3rdparty/site_config/standard/svd.se.txt +++ b/inc/3rdparty/site_config/standard/svd.se.txt | |||
@@ -1,4 +1,14 @@ | |||
1 | # Ads | 1 | body: //div[@id='article-content'] |
2 | strip_id_or_class: articlead | 2 | author: //div[@id='article']//div[@class='byline']/p |
3 | 3 | ||
4 | test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd \ No newline at end of file | 4 | # Ads |
5 | strip_id_or_class: articlead | ||
6 | |||
7 | # Sharing | ||
8 | strip_id_or_class: share | ||
9 | |||
10 | prune: no | ||
11 | |||
12 | test_url: http://www.svd.se/nyheter/inrikes/oppositionen-stoppar-skattesankning_8531228.svd | ||
13 | test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd | ||
14 | test_url: http://www.svd.se/?service=rss&type=senastenytt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/svt.se.txt b/inc/3rdparty/site_config/standard/svt.se.txt new file mode 100755 index 00000000..ba35f7d1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/svt.se.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //article[@role='main']//h1 | ||
2 | body: //article[@role='main'] | ||
3 | strip: //aside | ||
4 | replace_string(<noscript>): <div> | ||
5 | replace_string(</noscript>): </div> | ||
6 | strip_id_or_class: svtHide-No-Js | ||
7 | strip_id_or_class: aside | ||
8 | strip_id_or_class: Aside | ||
9 | strip_id_or_class: hidden | ||
10 | strip_id_or_class: Share | ||
11 | tidy: no | ||
12 | prune: no | ||
13 | |||
14 | test_url: http://www.svt.se/ug/framtidsdrommar-om-jobb-blev-lackande-gifthal | ||
15 | test_url: http://www.svt.se/nyheter/het-debatt-mellan-borg-och-andersson | ||
16 | test_url: http://www.svt.se/nyheter/regionalt/svtsormland/sj-tag-evakuerades-efter-rokdrama \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sydsvenskan.se.txt b/inc/3rdparty/site_config/standard/sydsvenskan.se.txt index da6772aa..24ba1426 100644..100755 --- a/inc/3rdparty/site_config/standard/sydsvenskan.se.txt +++ b/inc/3rdparty/site_config/standard/sydsvenskan.se.txt | |||
@@ -1,11 +1,18 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | author: //a[contains(@href, '/sok/?')]/text() | 3 | author: //a[contains(@href, '/sok/?')]/text() |
4 | 4 | ||
5 | date: substring-after(//span[@class='date'], 'Publicerad ') | 5 | date: //meta[@name='bi3dPubDate']/@content |
6 | 6 | ||
7 | body: //div[@class='two_column_left'] | 7 | body: (//div[contains(@class, 'slider_wrapper')])[1] | //div[@id='article_image' or @class='two_column_left'] |
8 | strip_id_or_class: story | 8 | strip_id_or_class: story |
9 | strip: //div[@class='leadText saplo:lead']/h5 | 9 | strip_id_or_class: article_body_ad |
10 | 10 | strip: //div[@class='leadText saplo:lead']/h5 | |
11 | test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna-- \ No newline at end of file | 11 | |
12 | replace_string(<br />): <br /><br /> | ||
13 | |||
14 | prune: no | ||
15 | |||
16 | test_url: http://www.sydsvenskan.se/malmo/allt-jag-ager-ligger-pa-botten/ | ||
17 | test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna-- | ||
18 | test_url: http://www.sydsvenskan.se/rss.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt b/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt index 3109c0e7..5bcfb9ef 100644..100755 --- a/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt +++ b/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //div[contains(@class, "post")]/h2 | 1 | title: //div[contains(@class, "post")]/h2 |
2 | 2 | ||
3 | author: //div[contains(@class, "post")]/p[position()=last()]/text()[1] | 3 | author: //div[contains(@class, "post")]/p[position()=last()]/text()[1] |
4 | 4 | ||
5 | date: //div[contains(@class, "post")]/p[1] | 5 | date: //div[contains(@class, "post")]/p[1] |
6 | 6 | ||
7 | body: //div[contains(@class, "post")] | 7 | body: //div[contains(@class, "post")] |
8 | 8 | ||
9 | strip: //div[contains(@class, "post")]/h2[1] | 9 | strip: //div[contains(@class, "post")]/h2[1] |
10 | strip: //div[contains(@class, "post")]/p[1] | 10 | strip: //div[contains(@class, "post")]/p[1] |
11 | strip: //div[contains(@class, "post")]/p[position()=last()] | 11 | strip: //div[contains(@class, "post")]/p[position()=last()] |
12 | test_url: http://www.symmetrymagazine.org/breaking/?p=12784 \ No newline at end of file | 12 | test_url: http://www.symmetrymagazine.org/breaking/?p=12784 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt index c3e34977..e058032c 100644..100755 --- a/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt +++ b/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body://div[@class='drucken'] | 2 | body://div[@class='drucken'] |
3 | author: substring-after(//span[@class='autor'], 'Von ') | 3 | author: substring-after(//span[@class='autor'], 'Von ') |
4 | author: //span[@class='autor'] | 4 | author: //span[@class='autor'] |
5 | 5 | ||
6 | single_page_link://a[contains(@href, '/drucken/')] | 6 | single_page_link://a[contains(@href, '/drucken/')] |
7 | convert_double_br_tags:yes | 7 | convert_double_br_tags:yes |
8 | 8 | ||
9 | dissolve://div[@class='vorspann'] | 9 | dissolve://div[@class='vorspann'] |
10 | 10 | ||
11 | strip://h1 | 11 | strip://h1 |
12 | strip_id_or_class: klassifizierung | 12 | strip_id_or_class: klassifizierung |
13 | strip_id_or_class: source | 13 | strip_id_or_class: source |
14 | strip_id_or_class: autor | 14 | strip_id_or_class: autor |
15 | test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567 \ No newline at end of file | 15 | test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/sz.de.txt b/inc/3rdparty/site_config/standard/sz.de.txt new file mode 100755 index 00000000..f67637d2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sz.de.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@... | ||
2 | |||
3 | single_page_link: //a[ contains( @href, "/2.220/" ) ] | ||
4 | |||
5 | body: //article[@id="sitecontent"]/section[@class="body"] | ||
6 | author: //address[@class="author"] | ||
7 | date: //div[@class="header"]//h1//span[@class="updated"] | ||
8 | wrap_in(small): //div[@class="footer"] | ||
9 | wrap_in(i): //figcaption/h3 | ||
10 | dissolve: //figcaption//h3 | ||
11 | dissolve: //figure/div[@class="body"] | ||
12 | dissolve: //figure/a | ||
13 | |||
14 | strip: //figure[ not( contains(@class, "zoomimage" ) ) ] | ||
15 | strip: //div[@data-onlineonly="true"] | ||
16 | strip: //address[@class="author"] | ||
17 | |||
18 | test_url: http://sz.de/1.1556693 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tagesschau.de.txt b/inc/3rdparty/site_config/standard/tagesschau.de.txt index 8ce8a90e..be76cd05 100644..100755 --- a/inc/3rdparty/site_config/standard/tagesschau.de.txt +++ b/inc/3rdparty/site_config/standard/tagesschau.de.txt | |||
@@ -1,23 +1,23 @@ | |||
1 | title://h1[1] | 1 | title://h1[1] |
2 | 2 | ||
3 | author: substring-after(//em, 'Von ') | 3 | author: substring-after(//em, 'Von ') |
4 | author:string('tagesschau.de') | 4 | author:string('tagesschau.de') |
5 | 5 | ||
6 | date:substring-after(//div[@class='standDatum'], 'Stand: ') | 6 | date:substring-after(//div[@class='standDatum'], 'Stand: ') |
7 | 7 | ||
8 | body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')] | 8 | body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')] |
9 | 9 | ||
10 | strip://h1[1] | 10 | strip://h1[1] |
11 | strip: //div[contains(@class, 'directLinks')] | 11 | strip: //div[contains(@class, 'directLinks')] |
12 | strip: //div[contains(@class, 'zitatBox')] | 12 | strip: //div[contains(@class, 'zitatBox')] |
13 | strip: //div[contains(@class, 'teaserBox metaBlock')] | 13 | strip: //div[contains(@class, 'teaserBox metaBlock')] |
14 | strip: //*[contains(@class, 'inv')] | 14 | strip: //*[contains(@class, 'inv')] |
15 | strip: //span[@class='imgSubline'] | 15 | strip: //span[@class='imgSubline'] |
16 | strip: //*[contains(@class, 'topline')][1] | 16 | strip: //*[contains(@class, 'topline')][1] |
17 | strip: //div[@id='rightCol'][1] | 17 | strip: //div[@id='rightCol'][1] |
18 | strip: //div[@id="footer"][1] | 18 | strip: //div[@id="footer"][1] |
19 | strip: //div[@class="fPlayer"] | 19 | strip: //div[@class="fPlayer"] |
20 | strip: //div[@id='seitenanfang'] | 20 | strip: //div[@id='seitenanfang'] |
21 | strip: //div[@class='standDatum'] | 21 | strip: //div[@class='standDatum'] |
22 | strip: //em | 22 | strip: //em |
23 | test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html \ No newline at end of file | 23 | test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tampabay.com.txt b/inc/3rdparty/site_config/standard/tampabay.com.txt index bfe841c6..47a6ffab 100644..100755 --- a/inc/3rdparty/site_config/standard/tampabay.com.txt +++ b/inc/3rdparty/site_config/standard/tampabay.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //span[@class="entry-title"] | 1 | title: //span[@class="entry-title"] |
2 | author: //*[contains(@class, 'item')]/p/a/text() | 2 | author: //*[contains(@class, 'item')]/p/a/text() |
3 | date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:') | 3 | date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:') |
4 | body: //div[@class="entry-content"] | 4 | body: //div[@class="entry-content"] |
5 | test_url: http://www.tampabay.com/news/salvador-dali-leaders-want-st-petersburg-city-council-to-put-brakes-on/1236349 \ No newline at end of file | 5 | test_url: http://www.tampabay.com/news/salvador-dali-leaders-want-st-petersburg-city-council-to-put-brakes-on/1236349 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/taptaptap.com.txt b/inc/3rdparty/site_config/standard/taptaptap.com.txt index 13de70e9..e1e79428 100644..100755 --- a/inc/3rdparty/site_config/standard/taptaptap.com.txt +++ b/inc/3rdparty/site_config/standard/taptaptap.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h3[@class="storytitle"] | 1 | title: //h3[@class="storytitle"] |
2 | body: //div[@class="post"] | 2 | body: //div[@class="post"] |
3 | strip: //div[@class="blurbBox"] | 3 | strip: //div[@class="blurbBox"] |
4 | test_url: http://taptaptap.com/blog/apples-precedents-vs-apples-guidelines/ \ No newline at end of file | 4 | test_url: http://taptaptap.com/blog/apples-precedents-vs-apples-guidelines/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tasteofhome.com.txt b/inc/3rdparty/site_config/standard/tasteofhome.com.txt index 77773363..f3234f34 100644..100755 --- a/inc/3rdparty/site_config/standard/tasteofhome.com.txt +++ b/inc/3rdparty/site_config/standard/tasteofhome.com.txt | |||
@@ -1,15 +1,11 @@ | |||
1 | title: //span[@id='ctl00_ctl00_MainContent_MainContent_RecipeImage1_lblRecipeTitle'] | 1 | title: //div[@id='ctl00_MainContent_ctl00_Div1']//h2 |
2 | body: //div[@id='RDNEW']//*[@class='Recipe-imgCon' or @class='Recipe-Intro' or @class='recipeDetails'] | 2 | body: //div[@id='ctl00_MainContent_ctl00_Div1'] |
3 | strip_id_or_class: rec-ExRightPanel | 3 | |
4 | strip_id_or_class: divCarousel | 4 | single_page_link: //div[contains(@class, 'recipeHeader')]//a[contains(@href, '/print')] |
5 | strip_id_or_class: preptimeOuter | 5 | |
6 | strip_id_or_class: cooktimeOuter | 6 | strip_image_src: tohPrintL.png |
7 | strip_id_or_class: durationOuter | 7 | |
8 | strip_id_or_class: divImageFooter | 8 | prune: no |
9 | strip_id_or_class: microFormatFnIngred | 9 | |
10 | strip: //span[@class='Recipe-Intro']//*[@class='link' or @class='rating'] | 10 | test_url: http://www.tasteofhome.com/recipes/Grinch-Punch |
11 | 11 | test_url: http://www.tasteofhome.com/recipes/lactose-free-chocolate-chip-cookies \ No newline at end of file | |
12 | prune: no | ||
13 | tidy: no | ||
14 | |||
15 | test_url: http://www.tasteofhome.com/recipes/Grinch-Punch \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/taz.de.txt b/inc/3rdparty/site_config/standard/taz.de.txt index 6e84527b..cf853662 100644..100755 --- a/inc/3rdparty/site_config/standard/taz.de.txt +++ b/inc/3rdparty/site_config/standard/taz.de.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | date: //div[@class='secthead'] | 1 | date: //div[@class='secthead'] |
2 | body: //div[@class='sectbody'] | 2 | body: //div[@class='sectbody'] |
3 | title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1) | 3 | title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1) |
4 | author: //span[@class='author'] | 4 | author: //span[@class='author'] |
5 | strip: //p[@class='caption'] | 5 | strip: //p[@class='caption'] |
6 | strip_id_or_class: rack | 6 | strip_id_or_class: rack |
7 | 7 | ||
8 | test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/ \ No newline at end of file | 8 | test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tbray.org.txt b/inc/3rdparty/site_config/standard/tbray.org.txt index fbe94fa4..558dc9c8 100644..100755 --- a/inc/3rdparty/site_config/standard/tbray.org.txt +++ b/inc/3rdparty/site_config/standard/tbray.org.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | body: //div[@id='centercontent'] | 1 | body: //div[@id='centercontent'] |
2 | strip: //div[@id='rightcontent'] | 2 | strip: //div[@id='rightcontent'] |
3 | date: substring-before( //div[@id='cats'], '·') | 3 | date: substring-before( //div[@id='cats'], '·') |
4 | title: //h1 | 4 | title: //h1 |
5 | test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money \ No newline at end of file | 5 | test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tcmanila.tk.txt b/inc/3rdparty/site_config/standard/tcmanila.tk.txt new file mode 100755 index 00000000..f6032ec3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tcmanila.tk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h2 | ||
2 | body: //div[@class="post_content"] | ||
3 | author: //span[@class="fn"] | ||
4 | date: //time[@class="updated"] | ||
5 | strip_comments: //yes | ||
6 | footnotes: //yes | ||
7 | test_url: http://tcmanila.tk/post/29189064358/my-2012-roadmap-is-almost-complete-look-at-the \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tcng.org.txt b/inc/3rdparty/site_config/standard/tcng.org.txt index 765224e4..4873b50d 100644..100755 --- a/inc/3rdparty/site_config/standard/tcng.org.txt +++ b/inc/3rdparty/site_config/standard/tcng.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //div[@id='main-content']/h1 | 1 | title: //div[@id='main-content']/h1 |
2 | body: //div[@id='main-content'] | 2 | body: //div[@id='main-content'] |
3 | strip: //div[@id='main-content']/h1 | 3 | strip: //div[@id='main-content']/h1 |
4 | test_url: http://www.tcng.org/index.php/blog/view/teaching-basic-health-cutting-down-costs \ No newline at end of file | 4 | test_url: http://www.tcng.org/index.php/blog/view/teaching-basic-health-cutting-down-costs \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt b/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt index b6d17da4..da198622 100644..100755 --- a/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt +++ b/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h1[@class='storyheadline'] | 1 | title: //h1[@class='storyheadline'] |
2 | body: //div[@class='storytext'] | 2 | body: //div[@class='storytext'] |
3 | strip: //strong | 3 | strip: //strong |
4 | test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 \ No newline at end of file | 4 | test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tech.gilt.com.txt b/inc/3rdparty/site_config/standard/tech.gilt.com.txt new file mode 100755 index 00000000..ab564606 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tech.gilt.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class="title"]/h1 | ||
2 | title: //div[@class="caption"]/h1 | ||
3 | author: substring-after(//div[@class="metadata"]/div[@class="date"]/a[2], 'by ') | ||
4 | date: //div[@class="metadata"]/div[@class="date"]/a | ||
5 | test_url: http://tech.gilt.com/post/46359463184/26-3-13-todays-noon-outage-and-what-were-doing-to \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt b/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt index f7228ebf..75126f9c 100644..100755 --- a/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt +++ b/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title://h1[contains(@id,'artibodyTitle')] | 1 | title://h1[contains(@id,'artibodyTitle')] |
2 | 2 | ||
3 | date://span[contains(@id,'pub_date')] | 3 | date://span[contains(@id,'pub_date')] |
4 | 4 | ||
5 | body://div[contains(@id,'artibody')] | 5 | body://div[contains(@id,'artibody')] |
6 | 6 | ||
7 | strip://div[contains(@class,'otherContent')] | 7 | strip://div[contains(@class,'otherContent')] |
8 | 8 | ||
9 | next_page_link://p[@class='page']/a[contains(.,'下一页')] | 9 | next_page_link://p[@class='page']/a[contains(.,'下一页')] |
10 | 10 | ||
11 | test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml \ No newline at end of file | 11 | test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/techcrunch.com.txt b/inc/3rdparty/site_config/standard/techcrunch.com.txt index f436acb5..1509c46e 100644..100755 --- a/inc/3rdparty/site_config/standard/techcrunch.com.txt +++ b/inc/3rdparty/site_config/standard/techcrunch.com.txt | |||
@@ -1,18 +1,18 @@ | |||
1 | body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')] | 1 | body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')] |
2 | 2 | ||
3 | author: //a[@class="name"] | 3 | author: //a[@class="name"] |
4 | 4 | ||
5 | date: //div[@class="post-time"] | 5 | date: //div[@class="post-time"] |
6 | 6 | ||
7 | title: //h1[@class="headline"] | 7 | title: //h1[@class="headline"] |
8 | strip_id_or_class: module-crunchbase | 8 | strip_id_or_class: module-crunchbase |
9 | 9 | ||
10 | # The following is for the mobile site | 10 | # The following is for the mobile site |
11 | body: //div[@id="singlentry"] | 11 | body: //div[@id="singlentry"] |
12 | author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ') | 12 | author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ') |
13 | date: substring-before(//div[@class="single-post-meta-top"],' @') | 13 | date: substring-before(//div[@class="single-post-meta-top"],' @') |
14 | title: //a[@class="sh2"] | 14 | title: //a[@class="sh2"] |
15 | 15 | ||
16 | prune: no | 16 | prune: no |
17 | 17 | ||
18 | test_url: http://techcrunch.com/2011/10/18/apples-insanely-great-q1-2012/ \ No newline at end of file | 18 | test_url: http://techcrunch.com/2011/10/18/apples-insanely-great-q1-2012/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/techdirt.com.txt b/inc/3rdparty/site_config/standard/techdirt.com.txt index 727f3701..7db2f95b 100644..100755 --- a/inc/3rdparty/site_config/standard/techdirt.com.txt +++ b/inc/3rdparty/site_config/standard/techdirt.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@class='story'] | 1 | body: //div[@class='story'] |
2 | title: //div[@class='story']/h1 | 2 | title: //div[@class='story']/h1 |
3 | strip: //div[@class='story']/h1 | 3 | strip: //div[@class='story']/h1 |
4 | 4 | ||
5 | author: //div[@class='details']/p[contains(., 'by ')]/a | 5 | author: //div[@class='details']/p[contains(., 'by ')]/a |
6 | date: //p[@class='storydate'] | 6 | date: //p[@class='storydate'] |
7 | 7 | ||
8 | strip: //p[a[contains(., 'Leave a Comment')]] | 8 | strip: //p[a[contains(., 'Leave a Comment')]] |
9 | strip_id_or_class: share | 9 | strip_id_or_class: share |
10 | strip_id_or_class: maincolumn_head | 10 | strip_id_or_class: maincolumn_head |
11 | strip_id_or_class: maincolmod | 11 | strip_id_or_class: maincolmod |
12 | test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml \ No newline at end of file | 12 | test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/techhive.com.txt b/inc/3rdparty/site_config/standard/techhive.com.txt new file mode 100755 index 00000000..29720b0b --- /dev/null +++ b/inc/3rdparty/site_config/standard/techhive.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //div[@class='articleHead']//h1 | ||
2 | author: //div[@class="author-name"]/a[1] | ||
3 | body: //div[@class="main"] | ||
4 | |||
5 | # remove 'From the Lab' and 'Recent posts' text | ||
6 | strip: //div[@class='blogLabel'] | ||
7 | |||
8 | # remove byline and meta info | ||
9 | strip: //div[@class="article-meta"] | ||
10 | strip: //div[@class="author-info"] | ||
11 | |||
12 | #strip tags and categories | ||
13 | strip: //div[@class="department"] | ||
14 | |||
15 | #strip product cap links | ||
16 | strip: //div[@class="cap-main"] | ||
17 | strip: //div[@id="compare-lede"] | ||
18 | test_url: http://www.techhive.com/article/2010549/up-close-with-blackberry-10.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techmeme.com.txt b/inc/3rdparty/site_config/standard/techmeme.com.txt index 8644e00f..0b4bfbd6 100644..100755 --- a/inc/3rdparty/site_config/standard/techmeme.com.txt +++ b/inc/3rdparty/site_config/standard/techmeme.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | single_page_link_in_feed: //b/a | 1 | single_page_link_in_feed: //b/a |
2 | 2 | ||
3 | test_url_feed: http://www.techmeme.com/feed.xml \ No newline at end of file | 3 | test_url_feed: http://www.techmeme.com/feed.xml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt b/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt index cc26ee4c..d871b603 100644..100755 --- a/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt +++ b/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | author: //meta[@name="author"]/@content | 2 | author: //meta[@name="author"]/@content |
3 | date: //h3 | 3 | date: //h3 |
4 | body: //div[@class="postBody"] | 4 | body: //div[@class="postBody"] |
5 | strip: //h1 | 5 | strip: //h1 |
6 | strip: //h2 | 6 | strip: //h2 |
7 | strip: //h3 | 7 | strip: //h3 |
8 | test_url: http://technicallyjordan.tumblr.com/post/22914659822/facebook-to-launch-app-store-knock-off \ No newline at end of file | 8 | test_url: http://technicallyjordan.tumblr.com/post/22914659822/facebook-to-launch-app-store-knock-off \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/technologizer.com.txt b/inc/3rdparty/site_config/standard/technologizer.com.txt new file mode 100755 index 00000000..179bf5a6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/technologizer.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | next_page_link: //a[contains(., 'NEXT PAGE')] | ||
2 | # following::node() selects text nodes too whereas following::* selects only elements. | ||
3 | strip: //span[@class='pageo']/following::node() | ||
4 | strip: //span[@class='pageo'] | ||
5 | test_url: http://technologizer.com/2010/03/08/the-secret-origin-of-windows/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/technologyreview.com.txt b/inc/3rdparty/site_config/standard/technologyreview.com.txt index 41f21d46..d405eb18 100644..100755 --- a/inc/3rdparty/site_config/standard/technologyreview.com.txt +++ b/inc/3rdparty/site_config/standard/technologyreview.com.txt | |||
@@ -1,16 +1,16 @@ | |||
1 | title: //header[@class='article-meta']/h1 | 1 | title: //header[@class='article-meta']/h1 |
2 | title: substring-before(//title, '|') | 2 | title: substring-before(//title, '|') |
3 | 3 | ||
4 | body: //section[contains(@class, 'body')] | 4 | body: //section[contains(@class, 'body')] |
5 | 5 | ||
6 | # Author & Date for News and Featured Stories | 6 | # Author & Date for News and Featured Stories |
7 | author: //ul[@class='byline']/li/a | 7 | author: //ul[@class='byline']/li/a |
8 | author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on') | 8 | author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on') |
9 | date: substring-after(//ul[@class='byline']/li, 'on ') | 9 | date: substring-after(//ul[@class='byline']/li, 'on ') |
10 | 10 | ||
11 | # Author & Date for "Views" | 11 | # Author & Date for "Views" |
12 | author: //div[@class='view-byline']/div[@class='meta']/h2[1] | 12 | author: //div[@class='view-byline']/div[@class='meta']/h2[1] |
13 | date: //div[@class='view-byline']/div[@class='meta']/h2[2] | 13 | date: //div[@class='view-byline']/div[@class='meta']/h2[2] |
14 | 14 | ||
15 | next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')] | 15 | next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')] |
16 | test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/ \ No newline at end of file | 16 | test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/techpinions.com.txt b/inc/3rdparty/site_config/standard/techpinions.com.txt index 89ed8349..8e1aa96c 100644..100755 --- a/inc/3rdparty/site_config/standard/techpinions.com.txt +++ b/inc/3rdparty/site_config/standard/techpinions.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: //div[@class="post"] | 1 | body: //div[@class="post"] |
2 | 2 | ||
3 | strip: //div[@class="post-meta"] | 3 | strip: //div[@class="post-meta"] |
4 | strip: //div[@id="socialicons"] | 4 | strip: //div[@id="socialicons"] |
5 | strip: //div[@id="authorbox"] | 5 | strip: //div[@id="authorbox"] |
6 | 6 | ||
7 | test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572 \ No newline at end of file | 7 | test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/techradar.com.txt b/inc/3rdparty/site_config/standard/techradar.com.txt index ed92a974..0a0ca619 100644..100755 --- a/inc/3rdparty/site_config/standard/techradar.com.txt +++ b/inc/3rdparty/site_config/standard/techradar.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | # Title without news/reviews etc. appended | 1 | # Title without news/reviews etc. appended |
2 | title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1 | 2 | title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1 |
3 | 3 | ||
4 | # Remove home link | 4 | # Remove home link |
5 | strip: //div[@id='page_logo']/a | 5 | strip: //div[@id='page_logo']/a |
6 | 6 | ||
7 | # Remove utilities | 7 | # Remove utilities |
8 | strip: //*[(@id = "utilities")] | 8 | strip: //*[(@id = "utilities")] |
9 | 9 | ||
10 | # Remove comments link | 10 | # Remove comments link |
11 | strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny'] | 11 | strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny'] |
12 | test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105 \ No newline at end of file | 12 | test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/telegraaf.nl.txt b/inc/3rdparty/site_config/standard/telegraaf.nl.txt index ff3cd06e..91b5baf9 100644..100755 --- a/inc/3rdparty/site_config/standard/telegraaf.nl.txt +++ b/inc/3rdparty/site_config/standard/telegraaf.nl.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@id='artikelKolom'] | 1 | body: //div[@id='artikelKolom'] |
2 | strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper'] | 2 | strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper'] |
3 | strip: //div[@id='artikeltoolbar'] | 3 | strip: //div[@id='artikeltoolbar'] |
4 | strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer'] | 4 | strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer'] |
5 | strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget'] | 5 | strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget'] |
6 | tidy: no | 6 | tidy: no |
7 | prune: no | 7 | prune: no |
8 | 8 | ||
9 | test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss \ No newline at end of file | 9 | test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/telegraph.co.uk.txt b/inc/3rdparty/site_config/standard/telegraph.co.uk.txt index e1faf23b..8dcdb42b 100644..100755 --- a/inc/3rdparty/site_config/standard/telegraph.co.uk.txt +++ b/inc/3rdparty/site_config/standard/telegraph.co.uk.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea'] | 1 | body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea'] |
2 | strip: //p[@class='comments'] | 2 | strip: //p[@class='comments'] |
3 | strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")] | 3 | strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")] |
4 | strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links'] | 4 | strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links'] |
5 | strip: //p[@class='bbpTweet']/span[@class='timestamp'] | 5 | strip: //p[@class='bbpTweet']/span[@class='timestamp'] |
6 | strip: //p[@class='bbpTweet']/span[@class='metadata']//img | 6 | strip: //p[@class='bbpTweet']/span[@class='metadata']//img |
7 | tidy: no | 7 | tidy: no |
8 | prune: no | 8 | prune: no |
9 | 9 | ||
10 | test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html \ No newline at end of file | 10 | test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thanhnien.com.vn.txt b/inc/3rdparty/site_config/standard/thanhnien.com.vn.txt new file mode 100755 index 00000000..596ecc90 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thanhnien.com.vn.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body://div[@id="print-news"] | ||
2 | strip://a | ||
3 | strip://span[@class="date-line"] | ||
4 | test_url: http://www.thanhnien.com.vn/pages/20121006/hon-90-trieu-usd-nang-cap-do-thi-can-tho.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/the-magazine.org.txt b/inc/3rdparty/site_config/standard/the-magazine.org.txt new file mode 100755 index 00000000..08864657 --- /dev/null +++ b/inc/3rdparty/site_config/standard/the-magazine.org.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | tidy: no | ||
2 | |||
3 | test_url: http://the-magazine.org/1/alone-together-again \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theage.com.au.txt b/inc/3rdparty/site_config/standard/theage.com.au.txt new file mode 100755 index 00000000..ea27c314 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theage.com.au.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: //h3[@class='authorName'] | ||
2 | date: //time | ||
3 | body: //div[@class='articleBody'] | ||
4 | strip_id_or_class: adspot | ||
5 | test_url: http://www.theage.com.au/victoria/top-cops-warns-outlaw-bikies-we-have-a-gang-too-20130331-2h1l8.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theamericanscholar.org.txt b/inc/3rdparty/site_config/standard/theamericanscholar.org.txt new file mode 100755 index 00000000..38b96672 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theamericanscholar.org.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | # Article Metadata | ||
2 | title: //meta[@property="og:title"]/@content | ||
3 | author: substring-after(//h3, 'By ') | ||
4 | date: //h4/a[2] | ||
5 | |||
6 | # Content Pruning | ||
7 | strip: //h4 | ||
8 | strip: //a[@id="print_button"] | ||
9 | strip: //p[@class="excerpt"] | ||
10 | strip: //h3 | ||
11 | strip: //div[@class="caption"] | ||
12 | strip: //center/a/img | ||
13 | test_url: http://theamericanscholar.org/too-big-to-fail-and-too-risky-to-exist/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theappleblog.com.txt b/inc/3rdparty/site_config/standard/theappleblog.com.txt index 3bd555f1..caa5ae0c 100644..100755 --- a/inc/3rdparty/site_config/standard/theappleblog.com.txt +++ b/inc/3rdparty/site_config/standard/theappleblog.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | # Remove home link | 1 | # Remove home link |
2 | strip: //div[@id='blog-title']/a | 2 | strip: //div[@id='blog-title']/a |
3 | test_url: http://theappleblog.com/2010/10/21/the-new-macbook-air-is-underwhelming/ \ No newline at end of file | 3 | test_url: http://theappleblog.com/2010/10/21/the-new-macbook-air-is-underwhelming/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theatlantic.com.txt b/inc/3rdparty/site_config/standard/theatlantic.com.txt index 267fd39c..aa41b153 100644..100755 --- a/inc/3rdparty/site_config/standard/theatlantic.com.txt +++ b/inc/3rdparty/site_config/standard/theatlantic.com.txt | |||
@@ -1,18 +1,20 @@ | |||
1 | title: //div[@id='article']/h1 | 1 | title: //div[contains(@class, 'articleHead')]//h1 |
2 | title: //h1 | 2 | |
3 | 3 | body: //div[@class='articleText'] | |
4 | body: //div[@class='articleText'] | 4 | body: //div[@class='articleContent'] |
5 | body: //div[@class='articleContent'] | 5 | body: //div[@id='article'] |
6 | body: //div[@id='article'] | 6 | date: //*[contains(@class, 'date')] |
7 | date: //*[contains(@class, 'date')] | 7 | author: //div[@id='profile']//*[@class='authors']//a[1] |
8 | author: //div[@id='profile']//*[@class='authors']//a[1] | 8 | author: //*[@class='author']/span |
9 | author: //*[@class='author']/span | 9 | prune: no |
10 | prune: no | 10 | |
11 | 11 | strip: //div[@class='moreOnBoxWithImages'] | |
12 | strip: //div[@class='moreOnBoxWithImages'] | 12 | strip: //p[contains(., 'This article available online at:')] |
13 | 13 | strip: //p[contains(., 'This article available online at:')]/following::* | |
14 | single_page_link: //a[@class='print'] | 14 | strip: //div[@class='earthbox'] |
15 | 15 | ||
16 | test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/ | 16 | single_page_link: //article//a[contains(@class, 'print')] |
17 | test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/ | 17 | |
18 | test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/ | ||
19 | test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/ | ||
18 | test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ \ No newline at end of file | 20 | test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theatlanticcities.com.txt b/inc/3rdparty/site_config/standard/theatlanticcities.com.txt new file mode 100755 index 00000000..880f207d --- /dev/null +++ b/inc/3rdparty/site_config/standard/theatlanticcities.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | # To administrator: | ||
2 | # Please replace the hostname with "*.theatlanticcities.com" | ||
3 | |||
4 | # This filter is tested on: | ||
5 | # http://m.theatlanticcities.com/arts-and-lifestyle/2012/04/invisible-borders-define-american-culture/1839/ | ||
6 | # http://www.theatlanticcities.com/housing/2012/11/chinas-holdouts/3981/ | ||
7 | # http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/ | ||
8 | |||
9 | title://h1 | ||
10 | author: //ul[@class='meta']/li/a | ||
11 | date: //ul[@class='meta']/li/following-sibling::li | ||
12 | body://article[@class='post'] | ||
13 | |||
14 | strip://h1 | ||
15 | strip://ul[@class='meta'] | ||
16 | strip://div[@class='newsletter-slug'] | ||
17 | test_url: http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thebostonchannel.com.txt b/inc/3rdparty/site_config/standard/thebostonchannel.com.txt index 64df90c1..b74442de 100644..100755 --- a/inc/3rdparty/site_config/standard/thebostonchannel.com.txt +++ b/inc/3rdparty/site_config/standard/thebostonchannel.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //meta[@name='og:title']/@content | 1 | title: //meta[@name='og:title']/@content |
2 | date: //meta[@name='created']/@content | 2 | date: //meta[@name='created']/@content |
3 | body: //div[@class="StoryBody" or @class="storyTeaser"] | 3 | body: //div[@class="StoryBody" or @class="storyTeaser"] |
4 | 4 | ||
5 | replace_string(<p></p>): <br /><br /> | 5 | replace_string(<p></p>): <br /><br /> |
6 | 6 | ||
7 | test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html \ No newline at end of file | 7 | test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thebrowser.com.txt b/inc/3rdparty/site_config/standard/thebrowser.com.txt index c3c20504..807e7dad 100644..100755 --- a/inc/3rdparty/site_config/standard/thebrowser.com.txt +++ b/inc/3rdparty/site_config/standard/thebrowser.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //h2[contains(@class, 'page-title')] | 1 | title: //h2[contains(@class, 'page-title')] |
2 | body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content'] | 2 | body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content'] |
3 | 3 | ||
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | strip: //div[contains(@class, 'node-book')]//a[@class='button'] | 6 | strip: //div[contains(@class, 'node-book')]//a[@class='button'] |
7 | 7 | ||
8 | single_page_link: //a[@class='tool-print'] | 8 | single_page_link: //a[@class='tool-print'] |
9 | 9 | ||
10 | test_url: http://thebrowser.com/interviews/yotam-ottolenghi-on-his-favourite-cookery-books \ No newline at end of file | 10 | test_url: http://thebrowser.com/interviews/yotam-ottolenghi-on-his-favourite-cookery-books \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thecarton.net.txt b/inc/3rdparty/site_config/standard/thecarton.net.txt index 9ef4ed8b..13fa35a0 100644..100755 --- a/inc/3rdparty/site_config/standard/thecarton.net.txt +++ b/inc/3rdparty/site_config/standard/thecarton.net.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: substring-before(//title, ' – ') | 1 | title: substring-before(//title, ' – ') |
2 | author:string('Shawn') | 2 | author:string('Shawn') |
3 | date: //*/time/@pubdate | 3 | date: //*/time/@pubdate |
4 | 4 | ||
5 | 5 | ||
6 | strip: //header | 6 | strip: //header |
7 | strip: //div[@id='prev_next'] | 7 | strip: //div[@id='prev_next'] |
8 | strip: //div[@id='masthead'] | 8 | strip: //div[@id='masthead'] |
9 | 9 | ||
10 | test_url: http://thecarton.net/2012/12/20/imdb \ No newline at end of file | 10 | test_url: http://thecarton.net/2012/12/20/imdb \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thedaily.com.txt b/inc/3rdparty/site_config/standard/thedaily.com.txt index 24ebbbac..e255e6a8 100644..100755 --- a/inc/3rdparty/site_config/standard/thedaily.com.txt +++ b/inc/3rdparty/site_config/standard/thedaily.com.txt | |||
@@ -1,24 +1,24 @@ | |||
1 | #keep all body text | 1 | #keep all body text |
2 | prune: no | 2 | prune: no |
3 | 3 | ||
4 | #title, body, metadata | 4 | #title, body, metadata |
5 | title: //div[@class='story_header']/h1 | 5 | title: //div[@class='story_header']/h1 |
6 | body: //div[@id='content'] | 6 | body: //div[@id='content'] |
7 | author: substring-after(//span[@class='byline'], "by ") | 7 | author: substring-after(//span[@class='byline'], "by ") |
8 | author: substring-after(//span[@class='byline'], "By ") | 8 | author: substring-after(//span[@class='byline'], "By ") |
9 | author: //span[@class='byline'] | 9 | author: //span[@class='byline'] |
10 | date: //span[@class='date'] | 10 | date: //span[@class='date'] |
11 | 11 | ||
12 | #formatting | 12 | #formatting |
13 | convert_double_br_tags: yes | 13 | convert_double_br_tags: yes |
14 | dissolve: //div[@class='slides_full']/ul/li | 14 | dissolve: //div[@class='slides_full']/ul/li |
15 | 15 | ||
16 | # cleanup | 16 | # cleanup |
17 | strip: //a[@id='story_note'] | 17 | strip: //a[@id='story_note'] |
18 | strip: //br | 18 | strip: //br |
19 | strip: //div[@class='intro'] | 19 | strip: //div[@class='intro'] |
20 | strip: //div[@class='share-block'] | 20 | strip: //div[@class='share-block'] |
21 | strip: //div[@class='sidebar-social'] | 21 | strip: //div[@class='sidebar-social'] |
22 | strip: //div[@class='top-stories'] | 22 | strip: //div[@class='top-stories'] |
23 | strip: //div[@class='prevnext'] | 23 | strip: //div[@class='prevnext'] |
24 | test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/ \ No newline at end of file | 24 | test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thedailybeast.com.txt b/inc/3rdparty/site_config/standard/thedailybeast.com.txt index 4781c65a..f5e938ae 100644..100755 --- a/inc/3rdparty/site_config/standard/thedailybeast.com.txt +++ b/inc/3rdparty/site_config/standard/thedailybeast.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //article/div[contains(@class, 'article-body')] | 2 | body: //article/div[contains(@class, 'article-body')] |
3 | #strip: //header/hgroup/h1 | 3 | #strip: //header/hgroup/h1 |
4 | strip: //footer[@class='storyFooter'] | 4 | strip: //footer[@class='storyFooter'] |
5 | single_page_link: //li[@class='print']/a | 5 | single_page_link: //li[@class='print']/a |
6 | prune: no | 6 | prune: no |
7 | test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html \ No newline at end of file | 7 | test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt b/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt index 0f15558d..a83a6cf6 100644..100755 --- a/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt +++ b/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | # Remove duplicated title | 1 | # Remove duplicated title |
2 | strip: //div[@id='content']/div[1][@class='full_intro']/h2 | 2 | strip: //div[@id='content']/div[1][@class='full_intro']/h2 |
3 | 3 | ||
4 | # Remove links, ads etc. | 4 | # Remove links, ads etc. |
5 | strip: //*[(@class= "aside")] | 5 | strip: //*[(@class= "aside")] |
6 | 6 | ||
7 | # Remove the date and add it to the date published field in Instapaper | 7 | # Remove the date and add it to the date published field in Instapaper |
8 | strip: //div[@class="date"] | 8 | strip: //div[@class="date"] |
9 | date: //div[@class="date"] | 9 | date: //div[@class="date"] |
10 | 10 | ||
11 | # There is no byline on The Daily Mash. | 11 | # There is no byline on The Daily Mash. |
12 | 12 | ||
13 | convert_double_br_tags: yes | 13 | convert_double_br_tags: yes |
14 | test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29 \ No newline at end of file | 14 | test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thedisneyblog.com.txt b/inc/3rdparty/site_config/standard/thedisneyblog.com.txt new file mode 100755 index 00000000..57b3254a --- /dev/null +++ b/inc/3rdparty/site_config/standard/thedisneyblog.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[contains(@class, 'entry-title')] | ||
2 | author: //span[contains(@class, 'author vcard')] | ||
3 | date: //span[@class = 'entry-date'] | ||
4 | body: //div[@class='entry-content'] | ||
5 | strip_id_or_class: bottomcontainerBox | ||
6 | strip_id_or_class: lightsocial_container | ||
7 | test_url: http://thedisneyblog.com/2012/11/17/videopolis-one-woman-disney-musical-beauty-and-the-beast/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theeuropean-magazine.com.txt b/inc/3rdparty/site_config/standard/theeuropean-magazine.com.txt new file mode 100755 index 00000000..a19bae15 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theeuropean-magazine.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | # Tested on: | ||
2 | # http://theeuropean-magazine.com/352-dyson-george/353-evolution-and-innovation | ||
3 | # http://theeuropean-magazine.com/522-casertano-stefano/919-morsi-and-the-future-of-egypt | ||
4 | |||
5 | title://h2[@class='article-title'] | ||
6 | author:substring-before(substring-after(//p[@class='article-meta'], 'by'), '—') | ||
7 | date:substring-after(//p[@class='article-meta'], '—') | ||
8 | body://div[@class='article'] | ||
9 | |||
10 | wrap_in(strong)://p[@class='article-teaser'] | ||
11 | move_into(//div[@class='article-head'])://li/img | ||
12 | |||
13 | strip://h2[@class='article-title'] | ||
14 | strip://p[@class='article-meta'] | ||
15 | strip://div[@class='copyright'] | ||
16 | strip://div[@class='opinions-of-readers'] | ||
17 | test_url: http://theeuropean-magazine.com/522-casertano-stefano/919-morsi-and-the-future-of-egypt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thefilmexperience.net.txt b/inc/3rdparty/site_config/standard/thefilmexperience.net.txt index e6b5115a..e6b5115a 100644..100755 --- a/inc/3rdparty/site_config/standard/thefilmexperience.net.txt +++ b/inc/3rdparty/site_config/standard/thefilmexperience.net.txt | |||
diff --git a/inc/3rdparty/site_config/standard/thegamedesignforum.com.txt b/inc/3rdparty/site_config/standard/thegamedesignforum.com.txt new file mode 100755 index 00000000..849ede77 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thegamedesignforum.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | ## ERROR: Removes all images. Please fix, have no idea why (bad HTML?) | ||
2 | |||
3 | title: //h1[@class='featuretitle'] | ||
4 | body: //div[@id='nobordercontentarea'] | ||
5 | |||
6 | # remove Twitter badge | ||
7 | strip: //img[@alt='Follow tgdfweb on Twitter'] | ||
8 | |||
9 | # fix for headers not showing for some reason | ||
10 | wrap_in(h2): //h2[@class='sectionheader'] | ||
11 | dissolve: //h2[@class='sectionheader'] | ||
12 | |||
13 | tidy: yes | ||
14 | test_url: http://thegamedesignforum.com/features/acceleration_flow_1.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theglobalmail.org.txt b/inc/3rdparty/site_config/standard/theglobalmail.org.txt index fae0fb29..da1c84f9 100644..100755 --- a/inc/3rdparty/site_config/standard/theglobalmail.org.txt +++ b/inc/3rdparty/site_config/standard/theglobalmail.org.txt | |||
@@ -1,41 +1,41 @@ | |||
1 | title: //h1[@id="headline"] | 1 | title: //h1[@id="headline"] |
2 | author: //div[contains(@class, "editorial-byline-author")]/a | 2 | author: //div[contains(@class, "editorial-byline-author")]/a |
3 | date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") | 3 | date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") |
4 | 4 | ||
5 | # The article body contains a mix or article and non-article elements, so lot of manual tweaks are needed | 5 | # The article body contains a mix or article and non-article elements, so lot of manual tweaks are needed |
6 | body: //div[@id="template"] | 6 | body: //div[@id="template"] |
7 | strip_id_or_class: editorial-byline-pic | 7 | strip_id_or_class: editorial-byline-pic |
8 | strip_id_or_class: editorial-byline | 8 | strip_id_or_class: editorial-byline |
9 | strip_id_or_class: headline | 9 | strip_id_or_class: headline |
10 | 10 | ||
11 | # Include the leadin paragraph in the body text, but remove quotes because they're out of context | 11 | # Include the leadin paragraph in the body text, but remove quotes because they're out of context |
12 | dissolve: //div[contains(@id, "leadin")] | 12 | dissolve: //div[contains(@id, "leadin")] |
13 | strip_id_or_class: pullquote | 13 | strip_id_or_class: pullquote |
14 | 14 | ||
15 | # Image captions removed because they're confusing in body text | 15 | # Image captions removed because they're confusing in body text |
16 | strip_id_or_class: image-caption-content | 16 | strip_id_or_class: image-caption-content |
17 | 17 | ||
18 | # Remove header and footer | 18 | # Remove header and footer |
19 | strip_id_or_class: header | 19 | strip_id_or_class: header |
20 | strip_id_or_class: footer | 20 | strip_id_or_class: footer |
21 | 21 | ||
22 | # Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image | 22 | # Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image |
23 | strip: /html/body/span[contains(@style, "display: none")] | 23 | strip: /html/body/span[contains(@style, "display: none")] |
24 | 24 | ||
25 | # Remove search box | 25 | # Remove search box |
26 | strip_id_or_class: searchContainer | 26 | strip_id_or_class: searchContainer |
27 | strip: //div[contains(@class, "searchInstruction")] | 27 | strip: //div[contains(@class, "searchInstruction")] |
28 | strip: //div[contains(@class, "searchResults")]/h4 | 28 | strip: //div[contains(@class, "searchResults")]/h4 |
29 | 29 | ||
30 | # Remove the 'Letters to the Editor' section | 30 | # Remove the 'Letters to the Editor' section |
31 | strip_id_or_class: letter-text | 31 | strip_id_or_class: letter-text |
32 | strip_id_or_class: letter-from | 32 | strip_id_or_class: letter-from |
33 | strip_id_or_class: letter-date | 33 | strip_id_or_class: letter-date |
34 | 34 | ||
35 | # Remove Like/Tweet links | 35 | # Remove Like/Tweet links |
36 | strip_id_or_class: social-tab | 36 | strip_id_or_class: social-tab |
37 | 37 | ||
38 | # Remove 'divider' which causes an inexplicable slash to appear in the article body | 38 | # Remove 'divider' which causes an inexplicable slash to appear in the article body |
39 | strip_id_or_class: divider | 39 | strip_id_or_class: divider |
40 | 40 | ||
41 | test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ \ No newline at end of file | 41 | test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theglobeandmail.com.txt b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt index 90634a08..750f8473 100644..100755 --- a/inc/3rdparty/site_config/standard/theglobeandmail.com.txt +++ b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')] | 1 | single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')] |
2 | tidy: no | 2 | tidy: no |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/ \ No newline at end of file | 5 | test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thegreatdiscontent.com.txt b/inc/3rdparty/site_config/standard/thegreatdiscontent.com.txt new file mode 100755 index 00000000..12442b40 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thegreatdiscontent.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@id='headline'] | ||
2 | author: substring-after(//section[@class="credits"]/ul/li[1],"Interview by ") | ||
3 | date: //time[@pubdate] | ||
4 | body: //article[@class='interview'] | ||
5 | strip: //article[@class='interview']/footer | ||
6 | test_url: http://thegreatdiscontent.com/jeffrey-zeldman \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theguardian.com.txt b/inc/3rdparty/site_config/standard/theguardian.com.txt new file mode 100755 index 00000000..c803e4e4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theguardian.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //div[@id='main-article-info']//h1 | ||
2 | body: //div[@id='article-wrapper'] | ||
3 | date: //li[@class='publication']//time[@pubdate] | //li[@class='publication']//data[@pubdate] | ||
4 | strip: //div[contains(@class, 'email-subscription')] | ||
5 | strip: //div[contains(@class, 'kindleWidget')] | ||
6 | #strip: //a[not(text())] | ||
7 | strip_id_or_class: pocket-btn | ||
8 | author: //li[@class='byline'] | ||
9 | prune: no | ||
10 | tidy: no | ||
11 | test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption | ||
12 | test_url: http://www.theguardian.com/world/2013/oct/03/edward-snowden-files-john-lanchester | ||
13 | test_url: http://www.theguardian.com/commentisfree/2014/jun/15/britishness-search-identity-my-part-in-camerons-odyssey \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theindychannel.com.txt b/inc/3rdparty/site_config/standard/theindychannel.com.txt index 3544f247..2cd865bb 100644..100755 --- a/inc/3rdparty/site_config/standard/theindychannel.com.txt +++ b/inc/3rdparty/site_config/standard/theindychannel.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //h1[@class="Headline"] | 1 | title: //h1[@class="Headline"] |
2 | date: substring-after(//div[@class="posted"], 'EDT ') | 2 | date: substring-after(//div[@class="posted"], 'EDT ') |
3 | body: //div[@class="storyBody"] | 3 | body: //div[@class="storyBody"] |
4 | 4 | ||
5 | strip: //td[@class="AssocContentTD"] | 5 | strip: //td[@class="AssocContentTD"] |
6 | strip: //div[@id="pageTitle"] | 6 | strip: //div[@id="pageTitle"] |
7 | strip: //div[@class="posted"] | 7 | strip: //div[@class="posted"] |
8 | strip: //div[@class="updated"] | 8 | strip: //div[@class="updated"] |
9 | strip: //div[@class="js-kit-disclaimer"] | 9 | strip: //div[@class="js-kit-disclaimer"] |
10 | strip: //table[@class="row3table"] | 10 | strip: //table[@class="row3table"] |
11 | strip: //div[@class="container2"] | 11 | strip: //div[@class="container2"] |
12 | strip: //div[@id="delta"] | 12 | strip: //div[@id="delta"] |
13 | test_url: http://www.theindychannel.com/news/31050840/detail.html \ No newline at end of file | 13 | test_url: http://www.theindychannel.com/news/31050840/detail.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/themarker.com.txt b/inc/3rdparty/site_config/standard/themarker.com.txt new file mode 100755 index 00000000..141b1a3b --- /dev/null +++ b/inc/3rdparty/site_config/standard/themarker.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[contains(@class, 'mainTitle')] | ||
2 | author: //ul[@class='author']//a[@rel='author'] | ||
3 | body: //div[@id='article-box'] | ||
4 | prune: no | ||
5 | tidy: no | ||
6 | strip_id_or_class: head | ||
7 | strip_id_or_class: social-nav | ||
8 | strip_id_or_class: rate | ||
9 | strip_id_or_class: video | ||
10 | |||
11 | test_url: http://www.themarker.com/markerweek/1.2093167 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/themillions.com.txt b/inc/3rdparty/site_config/standard/themillions.com.txt index e3e57fea..4d46daee 100644..100755 --- a/inc/3rdparty/site_config/standard/themillions.com.txt +++ b/inc/3rdparty/site_config/standard/themillions.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: /html/body/div/div[2]/div/div/div/h3 | 1 | title: /html/body/div/div[2]/div/div/div/h3 |
2 | 2 | ||
3 | body: /html/body/div/div[2]/div/div/div/div[2] | 3 | body: /html/body/div/div[2]/div/div/div/div[2] |
4 | 4 | ||
5 | strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div | 5 | strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div |
6 | 6 | ||
7 | tidy: no | 7 | tidy: no |
8 | 8 | ||
9 | # any way to get rid of this word character garbage? | 9 | # any way to get rid of this word character garbage? |
10 | test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html \ No newline at end of file | 10 | test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt b/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt index 518bff93..80aba441 100644..100755 --- a/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt +++ b/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | body: single-review | 1 | body: single-review |
2 | strip_id_or_class: featured-review | 2 | strip_id_or_class: featured-review |
3 | strip_id_or_class: resources | 3 | strip_id_or_class: resources |
4 | strip_id_or_class: rate-the-book | 4 | strip_id_or_class: rate-the-book |
5 | strip_id_or_class: write-review | 5 | strip_id_or_class: write-review |
6 | 6 | ||
7 | test_url: http://themuseumofinnocence.com/review.php?id=1179 \ No newline at end of file | 7 | test_url: http://themuseumofinnocence.com/review.php?id=1179 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thenation.com.txt b/inc/3rdparty/site_config/standard/thenation.com.txt index d88bcdd6..dab17f0b 100644..100755 --- a/inc/3rdparty/site_config/standard/thenation.com.txt +++ b/inc/3rdparty/site_config/standard/thenation.com.txt | |||
@@ -1,11 +1,13 @@ | |||
1 | title: //h1[@class='print-title'] | 1 | title: //h2[@property='dc:title'] |
2 | body: //div[@class='print-content'] | 2 | #body: //div[@class='print-content'] |
3 | author: //a[contains(@href, '/authors')] | 3 | body: //div[@id='wysiwyg'] |
4 | author: substring-before(//div[@class='print-created'], '|') | 4 | author: //a[contains(@href, '/authors')] |
5 | date: //span[@class='article-date'] | 5 | author: substring-before(//div[@class='print-created'], '|') |
6 | date: substring-after(//div[@class='print-created'], '|') | 6 | date: //span[@class='article-date'] |
7 | prune: no | 7 | date: substring-after(//div[@class='print-created'], '|') |
8 | 8 | prune: no | |
9 | single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')] | 9 | |
10 | 10 | #single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')] | |
11 | single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '?page=full')] | ||
12 | |||
11 | test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher \ No newline at end of file | 13 | test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt b/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt index 846b8a8a..b7f5f0f0 100644..100755 --- a/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt +++ b/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@id="beta-inner"] | 1 | body: //div[@id="beta-inner"] |
2 | title: //h3[@class="entry-header"] | 2 | title: //h3[@class="entry-header"] |
3 | 3 | ||
4 | test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html \ No newline at end of file | 4 | test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thenextgeneration.org.txt b/inc/3rdparty/site_config/standard/thenextgeneration.org.txt new file mode 100755 index 00000000..dedd989f --- /dev/null +++ b/inc/3rdparty/site_config/standard/thenextgeneration.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class='interior-page-title'] | ||
2 | author: //span[@class='author']/a | ||
3 | date: //div[@class='byline']/time | ||
4 | body: //div[@class='rich-text-body'] | ||
5 | |||
6 | strip: //div[@class='byline'] | ||
7 | strip: //div[@class='offscreen-menu'] | ||
8 | test_url: http://thenextgeneration.org/blog/post/rebrand-announce/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thenextweb.com.txt b/inc/3rdparty/site_config/standard/thenextweb.com.txt index fdc70005..684fe82d 100644..100755 --- a/inc/3rdparty/site_config/standard/thenextweb.com.txt +++ b/inc/3rdparty/site_config/standard/thenextweb.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body: //div[@class= 'article-body'] | 1 | body: //div[@class= 'article-body'] |
2 | author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')] | 2 | author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')] |
3 | 3 | ||
4 | strip: //div[@class = 'bargo'] | 4 | strip: //div[@class = 'bargo'] |
5 | strip: //div[@class = 'tf'] | 5 | strip: //div[@class = 'tf'] |
6 | strip: //div[@class = 'article']/div[@class = 'blue-box'] | 6 | strip: //div[@class = 'article']/div[@class = 'blue-box'] |
7 | strip_id_or_class: respond | 7 | strip_id_or_class: respond |
8 | 8 | ||
9 | tidy: no | 9 | tidy: no |
10 | next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href | 10 | next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href |
11 | 11 | ||
12 | test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/ \ No newline at end of file | 12 | test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theoaklandpress.com.txt b/inc/3rdparty/site_config/standard/theoaklandpress.com.txt index c7132321..c9abda71 100644..100755 --- a/inc/3rdparty/site_config/standard/theoaklandpress.com.txt +++ b/inc/3rdparty/site_config/standard/theoaklandpress.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='fullstory'] | 1 | body: //div[@id='fullstory'] |
2 | strip: //div[@id='page_leftbar'] | 2 | strip: //div[@id='page_leftbar'] |
3 | test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt \ No newline at end of file | 3 | test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theonion.com.txt b/inc/3rdparty/site_config/standard/theonion.com.txt index 12918b88..90e8d658 100644..100755 --- a/inc/3rdparty/site_config/standard/theonion.com.txt +++ b/inc/3rdparty/site_config/standard/theonion.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h2[@class='title'] | 1 | title: //h2[@class='title'] |
2 | date: substring-before(//p[@class='meta'], '|') | 2 | date: substring-before(//p[@class='meta'], '|') |
3 | body: //div[@class='story'] | 3 | body: //div[@class='story'] |
4 | #body: //div[@class='article_body'] | 4 | #body: //div[@class='article_body'] |
5 | 5 | ||
6 | strip: //h2[@class='title'] | 6 | strip: //h2[@class='title'] |
7 | strip: //p[@class='meta'] | 7 | strip: //p[@class='meta'] |
8 | strip: //div[@class='ga_section'] | 8 | strip: //div[@class='ga_section'] |
9 | strip: //div[@id='recent_slider'] | 9 | strip: //div[@id='recent_slider'] |
10 | 10 | ||
11 | test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/ \ No newline at end of file | 11 | test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt b/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt index f89f3a87..75583cd3 100644..100755 --- a/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt +++ b/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | title: //h1[@class='post-title'] | 1 | title: //h1[@class='post-title'] |
2 | body: //div[@class='post'] | 2 | body: //div[@class='post'] |
3 | author: //p[@class='posted-by'] | 3 | author: //p[@class='posted-by'] |
4 | date: //div[@class='sprite post-date'] | 4 | date: //div[@class='sprite post-date'] |
5 | 5 | ||
6 | # The body of the post doesn't have it's own div so we have to strip out the metadata | 6 | # The body of the post doesn't have it's own div so we have to strip out the metadata |
7 | strip: //div[@class='author_avatar'] | 7 | strip: //div[@class='author_avatar'] |
8 | strip: //div[@class='sprite post-date'] | 8 | strip: //div[@class='sprite post-date'] |
9 | strip: //h1[@class='post-title'] | 9 | strip: //h1[@class='post-title'] |
10 | strip: //p[@class='posted-by'] | 10 | strip: //p[@class='posted-by'] |
11 | test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/ \ No newline at end of file | 11 | test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/theregister.co.uk.txt b/inc/3rdparty/site_config/standard/theregister.co.uk.txt index ebcc55d5..5d30230d 100644..100755 --- a/inc/3rdparty/site_config/standard/theregister.co.uk.txt +++ b/inc/3rdparty/site_config/standard/theregister.co.uk.txt | |||
@@ -1,5 +1,8 @@ | |||
1 | title: //div[@id="article"]/h2 | 1 | # Updated 25-Jan-2014 |
2 | author: //div[@id="article"]/p[@class="byline"]/a[1] | 2 | single_page_link: //a[contains(@href, '/Print/')] |
3 | date: //div[@id="article"]/p[@class="dateline"]/a[2] | 3 | |
4 | body: //div[@id="article"]/div[@id="body"] | 4 | title: //div[@id="article"]/h2 |
5 | test_url: http://www.theregister.co.uk/2011/10/06/gas_bill_shocker/ \ No newline at end of file | 5 | author: //p[@class="byline"]/a |
6 | date: //p[@class="dateline"]/a[last()] | ||
7 | |||
8 | test_url: http://www.theregister.co.uk/2014/01/24/thirty_years_of_the_apple_macintosh_part_2/ | ||
diff --git a/inc/3rdparty/site_config/standard/theroot.com.txt b/inc/3rdparty/site_config/standard/theroot.com.txt index ebff662d..1f56316d 100644..100755 --- a/inc/3rdparty/site_config/standard/theroot.com.txt +++ b/inc/3rdparty/site_config/standard/theroot.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='node-content'] | 1 | body: //div[@id='node-content'] |
2 | strip_id_or_class: pager | 2 | strip_id_or_class: pager |
3 | test_url: http://www.theroot.com/views/why-i-am-male-feminist \ No newline at end of file | 3 | test_url: http://www.theroot.com/views/why-i-am-male-feminist \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/therumpus.net.txt b/inc/3rdparty/site_config/standard/therumpus.net.txt index d01a89bb..84d0e783 100644..100755 --- a/inc/3rdparty/site_config/standard/therumpus.net.txt +++ b/inc/3rdparty/site_config/standard/therumpus.net.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: /html/body/div/div[2]/div/div/h1 | 1 | title: /html/body/div/div[2]/div/div/h1 |
2 | 2 | ||
3 | body: /html/body/div/div[2]/div/div/div[2] | 3 | body: /html/body/div/div[2]/div/div/div[2] |
4 | test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes \ No newline at end of file | 4 | test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thesiasat.com.txt b/inc/3rdparty/site_config/standard/thesiasat.com.txt index ab9a99e8..68a8bc8e 100644..100755 --- a/inc/3rdparty/site_config/standard/thesiasat.com.txt +++ b/inc/3rdparty/site_config/standard/thesiasat.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | 1 | #body: (//div[@class='ftr-yt-vid'])[1] |
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | 2 | body: (//blockquote[contains(@class, 'postcontent')])[1] |
3 | body: (//div[starts-with(@id, 'post_message')])[1] | 3 | body: (//div[starts-with(@id, 'post_message')])[1] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | tidy: no | 6 | tidy: no |
7 | 7 | ||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | 8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" |
9 | #replace_string(</iframe>): </iframe> </div> | 9 | #replace_string(</iframe>): </iframe> </div> |
10 | 10 | ||
11 | test_url: http://www.thesiasat.com/showthread.php?19220-Dunya-News-HASB-E-HAAL-16-06-2012-Part-1-5 \ No newline at end of file | 11 | test_url: http://www.thesiasat.com/showthread.php?19220-Dunya-News-HASB-E-HAAL-16-06-2012-Part-1-5 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thesimpledollar.com.txt b/inc/3rdparty/site_config/standard/thesimpledollar.com.txt index d5c6c9e0..dcdf2572 100644..100755 --- a/inc/3rdparty/site_config/standard/thesimpledollar.com.txt +++ b/inc/3rdparty/site_config/standard/thesimpledollar.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h3[@class='post-title']/a[@class='post-title-link'] | 1 | title: //h3[@class='post-title']/a[@class='post-title-link'] |
2 | body: //div[@class='post-content'] | 2 | body: //div[@class='post-content'] |
3 | author: //div[@class='post-meta-under-title']/a | 3 | author: //div[@class='post-meta-under-title']/a |
4 | test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/ \ No newline at end of file | 4 | test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt b/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt index e2ed1e63..ca983281 100644..100755 --- a/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt +++ b/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | strip: //*[(@id = "content")]/h2 | 1 | strip: //*[(@id = "content")]/h2 |
2 | strip: //*[(@class = "wp-notable-line")] | 2 | strip: //*[(@class = "wp-notable-line")] |
3 | test_url: http://www.thespoiler.co.uk/index.php/2010/10/21/wayne-rooney-tells-man-utd-its-not-me-its-you \ No newline at end of file | 3 | test_url: http://www.thespoiler.co.uk/index.php/2010/10/21/wayne-rooney-tells-man-utd-its-not-me-its-you \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thespoof.com.txt b/inc/3rdparty/site_config/standard/thespoof.com.txt index 409dc0c9..f71cfb6b 100644..100755 --- a/inc/3rdparty/site_config/standard/thespoof.com.txt +++ b/inc/3rdparty/site_config/standard/thespoof.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h1[contains(@class, 'cTitle')] | 1 | title: //h1[contains(@class, 'cTitle')] |
2 | body: //div[contains(@class, 'KonaBody') or @id='articleimageright'] | 2 | body: //div[contains(@class, 'KonaBody') or @id='articleimageright'] |
3 | author: //meta[@name='Author']/@content | 3 | author: //meta[@name='Author']/@content |
4 | date: //meta[@name='OriginalPublicationDate']/@content | 4 | date: //meta[@name='OriginalPublicationDate']/@content |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | tidy: no | 7 | tidy: no |
8 | 8 | ||
9 | test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389 \ No newline at end of file | 9 | test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thestranger.com.txt b/inc/3rdparty/site_config/standard/thestranger.com.txt index 0f9855c8..6fcf4fdf 100644..100755 --- a/inc/3rdparty/site_config/standard/thestranger.com.txt +++ b/inc/3rdparty/site_config/standard/thestranger.com.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | # savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029 | 1 | # savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029 |
2 | 2 | ||
3 | #other filtering are plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885 | 3 | #other filtering are plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885 |
4 | 4 | ||
5 | title: //div[@id='savageColumn_head']/h1 | 5 | title: //div[@id='savageColumn_head']/h1 |
6 | title: //h1[@class="headlineLarge"] | 6 | title: //h1[@class="headlineLarge"] |
7 | 7 | ||
8 | strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner'] | 8 | strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner'] |
9 | 9 | ||
10 | body: //div[@id='savageColumn'] | 10 | body: //div[@id='savageColumn'] |
11 | body: //div[@id='story_text'] | 11 | body: //div[@id='story_text'] |
12 | test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029 \ No newline at end of file | 12 | test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thestreet.com.txt b/inc/3rdparty/site_config/standard/thestreet.com.txt index 5de75637..58eabf00 100644..100755 --- a/inc/3rdparty/site_config/standard/thestreet.com.txt +++ b/inc/3rdparty/site_config/standard/thestreet.com.txt | |||
@@ -1,25 +1,25 @@ | |||
1 | title: //div[@id='storyHdr']/h1 | 1 | title: //div[@id='storyHdr']/h1 |
2 | title: //div[@id='print']//h2 | 2 | title: //div[@id='print']//h2 |
3 | body: //div[@class="virtualpage"] | 3 | body: //div[@class="virtualpage"] |
4 | body: //div[@id='print']//div[@id='bd'] | 4 | body: //div[@id='print']//div[@id='bd'] |
5 | author: //meta[@name="AUTHOR"]/@content | 5 | author: //meta[@name="AUTHOR"]/@content |
6 | author: (//div[@id='print']//div[@id='bd']/h4)[1] | 6 | author: (//div[@id='print']//div[@id='bd']/h4)[1] |
7 | date: //meta[@name="DATE"]/@content | 7 | date: //meta[@name="DATE"]/@content |
8 | date: //div[@id='print']//div[@id='dte'] | 8 | date: //div[@id='print']//div[@id='dte'] |
9 | 9 | ||
10 | strip_id_or_class: articleFooter | 10 | strip_id_or_class: articleFooter |
11 | strip_id_or_class: sidebar | 11 | strip_id_or_class: sidebar |
12 | strip_id_or_class: ie6PrintSubhead | 12 | strip_id_or_class: ie6PrintSubhead |
13 | strip_id_or_class: subHdr | 13 | strip_id_or_class: subHdr |
14 | 14 | ||
15 | 15 | ||
16 | replace_string(<P/>): </p><p> | 16 | replace_string(<P/>): </p><p> |
17 | 17 | ||
18 | prune: no | 18 | prune: no |
19 | 19 | ||
20 | #TODO: redirects back - perhaps needs referer to work | 20 | #TODO: redirects back - perhaps needs referer to work |
21 | single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')] | 21 | single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')] |
22 | 22 | ||
23 | test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html | 23 | test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html |
24 | # multi page | 24 | # multi page |
25 | test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html \ No newline at end of file | 25 | test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt b/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt index 6b3277eb..6b3277eb 100644..100755 --- a/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt +++ b/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt | |||
diff --git a/inc/3rdparty/site_config/standard/theverge.com.txt b/inc/3rdparty/site_config/standard/theverge.com.txt index 11c5c153..1e1ce58f 100644..100755 --- a/inc/3rdparty/site_config/standard/theverge.com.txt +++ b/inc/3rdparty/site_config/standard/theverge.com.txt | |||
@@ -1,31 +1,48 @@ | |||
1 | title: //h1[contains(@class, "headline")] | 1 | author: //p[contains(@class, "byline")]/a[contains(@class, "author")] |
2 | 2 | ||
3 | author: //p[contains(@class, "byline")]/a[contains(@class, "author")] | 3 | date: //span[contains(@class, "publish-date")]/time[@pubdate]/@datetime |
4 | 4 | ||
5 | date: substring-after(normalize-space(//p[contains(@class, "byline")]/span[contains(@class, "publish-date")]), "on ") | 5 | body: //div[contains(@class, 'entry-content')] |
6 | 6 | # for vergecasts, e.g. http://www.theverge.com/2013/8/22/4648566/the-vergecast-090-august-22th-2013-video | |
7 | body: //article[contains(@class, 'feature-entry')] | 7 | body: //article |
8 | body: //article | 8 | prune: no |
9 | prune: no | 9 | #tidy: no |
10 | tidy: no | 10 | |
11 | 11 | strip: //article/header | |
12 | strip: //article/header | 12 | strip: //*[@id='sticky-menu'] |
13 | strip: //*[@id='sticky-menu'] | 13 | strip: //aside |
14 | strip: //aside | 14 | strip: //nav |
15 | strip: //nav | 15 | strip: //img[contains(@class, 'vox-lazy-load')] |
16 | 16 | # deal with bad parsing | |
17 | strip_id_or_class: gallery | 17 | strip: //div[contains(@class, 'story-image')]//div[contains(., 'function(')] |
18 | strip_id_or_class: article-meta | 18 | |
19 | strip_id_or_class: story-navigation | 19 | strip_id_or_class: gallery |
20 | strip_id_or_class: slegend | 20 | strip_id_or_class: article-meta |
21 | strip_id_or_class: related-product-meta | 21 | strip_id_or_class: story-navigation |
22 | strip_id_or_class: comments | 22 | strip_id_or_class: slegend |
23 | strip_id_or_class: ui-jump-list | 23 | strip_id_or_class: related-product-meta |
24 | strip_id_or_class: pullquote | 24 | strip_id_or_class: comments |
25 | 25 | strip_id_or_class: ui-jump-list | |
26 | strip: //q | 26 | strip_id_or_class: pullquote |
27 | 27 | strip_id_or_class: m-ad | |
28 | strip: //a[contains(@class, 'entry-section-title')] | 28 | strip_id_or_class: social-sharing |
29 | 29 | strip_id_or_class: m-video-entry__excerpt | |
30 | test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review | 30 | strip_id_or_class: hidden |
31 | test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review \ No newline at end of file | 31 | |
32 | replace_string(<noscript>): <div> | ||
33 | replace_string(</noscript>): </div> | ||
34 | |||
35 | find_string: <script | ||
36 | replace_string: <div style="display:none" | ||
37 | find_string: </script> | ||
38 | replace_string: </div> | ||
39 | |||
40 | strip: //q | ||
41 | |||
42 | strip: //a[contains(@class, 'entry-section-title')] | ||
43 | |||
44 | test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review | ||
45 | test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review | ||
46 | test_url: http://www.theverge.com/2013/2/24/4026114/barnes-noble-shifting-focus-away-from-nook-hardware | ||
47 | test_url: http://www.theverge.com/2014/6/19/5824072/top-shelf-living-the-dream | ||
48 | test_url: http://www.theverge.com/rss/frontpage \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theweek.com.txt b/inc/3rdparty/site_config/standard/theweek.com.txt index 27281ceb..f98749e2 100644..100755 --- a/inc/3rdparty/site_config/standard/theweek.com.txt +++ b/inc/3rdparty/site_config/standard/theweek.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | body: //div[@class="briefingEntry"] | 1 | body: //div[@class="briefingEntry"] |
2 | prune: no | 2 | prune: no |
3 | 3 | ||
4 | test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill \ No newline at end of file | 4 | test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thinkprogress.org.txt b/inc/3rdparty/site_config/standard/thinkprogress.org.txt index 8934b68e..1eec4e3c 100644..100755 --- a/inc/3rdparty/site_config/standard/thinkprogress.org.txt +++ b/inc/3rdparty/site_config/standard/thinkprogress.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | author: //p[@class="byline"]/a | 1 | author: //p[@class="byline"]/a |
2 | body: //div[@class="post"] | 2 | body: //div[@class="post"] |
3 | 3 | ||
4 | test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/ \ No newline at end of file | 4 | test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thisdaylive.com.txt b/inc/3rdparty/site_config/standard/thisdaylive.com.txt index 958d4b27..73b3c9ed 100644..100755 --- a/inc/3rdparty/site_config/standard/thisdaylive.com.txt +++ b/inc/3rdparty/site_config/standard/thisdaylive.com.txt | |||
@@ -1,2 +1,2 @@ | |||
1 | body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body'] | 1 | body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body'] |
2 | test_url: http://www.thisdaylive.com/articles/australia-pm-talks-human-rights-with-chinas-wen/90394/ \ No newline at end of file | 2 | test_url: http://www.thisdaylive.com/articles/australia-pm-talks-human-rights-with-chinas-wen/90394/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/thisismynext.com.txt b/inc/3rdparty/site_config/standard/thisismynext.com.txt index 6850b4be..70b53995 100644..100755 --- a/inc/3rdparty/site_config/standard/thisismynext.com.txt +++ b/inc/3rdparty/site_config/standard/thisismynext.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | author: //div[@class='meta clearfix']/a | 1 | author: //div[@class='meta clearfix']/a |
2 | body: //div[@class='post'] | 2 | body: //div[@class='post'] |
3 | 3 | ||
4 | strip: //div[@class='metaCat'] | 4 | strip: //div[@class='metaCat'] |
5 | strip: //div[@class='post']/h1 | 5 | strip: //div[@class='post']/h1 |
6 | strip: //div[@class='post']/div[@class='meta clearfix'] | 6 | strip: //div[@class='post']/div[@class='meta clearfix'] |
7 | strip: //div[@class='post']/div[@class='social-bar clearfix'] | 7 | strip: //div[@class='post']/div[@class='social-bar clearfix'] |
8 | test_url: http://thisismynext.com/2011/10/18/galaxy-nexus-android-ice-cream-sandwich-pictures-video-hands-on/ \ No newline at end of file | 8 | test_url: http://thisismynext.com/2011/10/18/galaxy-nexus-android-ice-cream-sandwich-pictures-video-hands-on/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tidbits.com.txt b/inc/3rdparty/site_config/standard/tidbits.com.txt index 8bcf2ec1..1950e58e 100644..100755 --- a/inc/3rdparty/site_config/standard/tidbits.com.txt +++ b/inc/3rdparty/site_config/standard/tidbits.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | author: //span[@class='fn'] | 1 | author: //span[@class='fn'] |
2 | date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|') | 2 | date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|') |
3 | test_url: http://tidbits.com/article/12651 \ No newline at end of file | 3 | test_url: http://tidbits.com/article/12651 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/time.com.txt b/inc/3rdparty/site_config/standard/time.com.txt index fd3fe08c..f3f886bc 100644..100755 --- a/inc/3rdparty/site_config/standard/time.com.txt +++ b/inc/3rdparty/site_config/standard/time.com.txt | |||
@@ -1,14 +1,12 @@ | |||
1 | # 2011-10-25 - carlo@... - Initial setup. | 1 | title: //h1[contains(@class, 'article-title')] |
2 | 2 | author: //article//span[contains(@class, 'byline')] | |
3 | single_page_link: //li[@class='print']/a/@href | 3 | date: //time[@pubdate]/@datetime |
4 | 4 | body: //section[contains(@class, 'article-body')] | |
5 | title: //h1 | 5 | prune: no |
6 | author: //meta[@name="byline"]/@content | 6 | tidy: no |
7 | date: //meta[@name="date"]/@content | 7 | |
8 | 8 | strip: //figcaption | |
9 | strip: //span[@class="see"] | 9 | strip: //p[contains(., 'MORE:') and ./a] |
10 | strip: //div[@class="byline"] | 10 | strip: //aside |
11 | strip: //div[@id="date2"] | 11 | |
12 | strip: //h1 | 12 | test_url: http://time.com/14478/emotions-may-not-be-so-universal-after-all/ \ No newline at end of file |
13 | |||
14 | test_url: http://www.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt b/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt index 17297732..af1c23ce 100644..100755 --- a/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt +++ b/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@class="storytext"] | 2 | body: //div[@class="storytext"] |
3 | strip: //div[@id="thelogin"] | 3 | strip: //div[@id="thelogin"] |
4 | strip: //*[@class="hide"] | 4 | strip: //*[@class="hide"] |
5 | strip: //div[@id="anchored"] | 5 | strip: //div[@id="anchored"] |
6 | test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1 \ No newline at end of file | 6 | test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tipb.com.txt b/inc/3rdparty/site_config/standard/tipb.com.txt index 9533eb0f..b8474d97 100644..100755 --- a/inc/3rdparty/site_config/standard/tipb.com.txt +++ b/inc/3rdparty/site_config/standard/tipb.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | body: //div[@id='content'] | 1 | body: //div[@id='content'] |
2 | 2 | ||
3 | strip_id_or_class: featured-box | 3 | strip_id_or_class: featured-box |
4 | strip_id_or_class: postmeta | 4 | strip_id_or_class: postmeta |
5 | strip_id_or_class: respond | 5 | strip_id_or_class: respond |
6 | 6 | ||
7 | author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')] | 7 | author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')] |
8 | date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ') | 8 | date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ') |
9 | test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/ \ No newline at end of file | 9 | test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tnr.com.txt b/inc/3rdparty/site_config/standard/tnr.com.txt index 65a1899f..199f5d13 100644..100755 --- a/inc/3rdparty/site_config/standard/tnr.com.txt +++ b/inc/3rdparty/site_config/standard/tnr.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1 | 1 | title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1 |
2 | title: //div[contains(@class, 'article_detail')]//h1 | 2 | title: //div[contains(@class, 'article_detail')]//h1 |
3 | title: //h1 | 3 | title: //h1 |
4 | 4 | ||
5 | body: //div[contains(@class, 'article_detail')] | 5 | body: //div[contains(@class, 'article_detail')] |
6 | 6 | ||
7 | author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3 | 7 | author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3 |
8 | author: div[@class='author']//h3 | 8 | author: div[@class='author']//h3 |
9 | strip: //div[contains(@class, 'field-field-book-cover')] | 9 | strip: //div[contains(@class, 'field-field-book-cover')] |
10 | 10 | ||
11 | date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '') | 11 | date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '') |
12 | 12 | ||
13 | prune: no | 13 | prune: no |
14 | 14 | ||
15 | single_page_link: //a[@class='print-page'] | 15 | single_page_link: //a[@class='print-page'] |
16 | 16 | ||
17 | test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled \ No newline at end of file | 17 | test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tomdispatch.com.txt b/inc/3rdparty/site_config/standard/tomdispatch.com.txt index d8548c78..701a2122 100644..100755 --- a/inc/3rdparty/site_config/standard/tomdispatch.com.txt +++ b/inc/3rdparty/site_config/standard/tomdispatch.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //div[@id='maincontent']//div[@class='title'] | 1 | title: //div[@id='maincontent']//div[@class='title'] |
2 | body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat'] | 2 | body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat'] |
3 | 3 | ||
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/ \ No newline at end of file | 6 | test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tomshardware.com.txt b/inc/3rdparty/site_config/standard/tomshardware.com.txt index 2bba6de8..2b437574 100644..100755 --- a/inc/3rdparty/site_config/standard/tomshardware.com.txt +++ b/inc/3rdparty/site_config/standard/tomshardware.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | title: //title | 2 | title: //title |
3 | author: //a[@itemprop = 'author'] | 3 | author: //a[@itemprop = 'author'] |
4 | date: //time[@itemprop = 'datePublished'] | 4 | date: //time[@itemprop = 'datePublished'] |
5 | body: //div[@id = 'intelliTXT'] | 5 | body: //div[@id = 'intelliTXT'] |
6 | 6 | ||
7 | next_page_link: //li[@class="pagin next"]/a | 7 | next_page_link: //li[@class="pagin next"]/a |
8 | test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html \ No newline at end of file | 8 | test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tomshardware.de.txt b/inc/3rdparty/site_config/standard/tomshardware.de.txt index e910003c..eee57ccf 100644..100755 --- a/inc/3rdparty/site_config/standard/tomshardware.de.txt +++ b/inc/3rdparty/site_config/standard/tomshardware.de.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | body://div[@id="news-content"]/div[@id="intelliTXT"][1] | 1 | body://div[@id="news-content"]/div[@id="intelliTXT"][1] |
2 | 2 | ||
3 | author://div[@id="header-news-infos"]/a[1] | 3 | author://div[@id="header-news-infos"]/a[1] |
4 | 4 | ||
5 | date: //div[@id="header-news-infos"]/span[1] | 5 | date: //div[@id="header-news-infos"]/span[1] |
6 | 6 | ||
7 | title://h1[@id="header-news-title" and @class="hardwareTitle"][1] | 7 | title://h1[@id="header-news-title" and @class="hardwareTitle"][1] |
8 | 8 | ||
9 | strip://div[@id="news-content"]/div[@id="intelliTXT"]/table | 9 | strip://div[@id="news-content"]/div[@id="intelliTXT"]/table |
10 | 10 | ||
11 | footnotes: no | 11 | footnotes: no |
12 | test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html \ No newline at end of file | 12 | test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/toolsandtoys.net.txt b/inc/3rdparty/site_config/standard/toolsandtoys.net.txt index dbe60b15..bb45d890 100644..100755 --- a/inc/3rdparty/site_config/standard/toolsandtoys.net.txt +++ b/inc/3rdparty/site_config/standard/toolsandtoys.net.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@class='post'] | 1 | body: //div[@class='post'] |
2 | 2 | ||
3 | strip: //div[@class='social'] | 3 | strip: //div[@class='social'] |
4 | strip: //span[@class='next'] | 4 | strip: //span[@class='next'] |
5 | strip: //span[@class='previous'] | 5 | strip: //span[@class='previous'] |
6 | test_url: http://toolsandtoys.net/noble-tonic-02/ \ No newline at end of file | 6 | test_url: http://toolsandtoys.net/noble-tonic-02/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tracks.ranea.org.txt b/inc/3rdparty/site_config/standard/tracks.ranea.org.txt new file mode 100755 index 00000000..5a386470 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tracks.ranea.org.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | # Metadata | ||
2 | title: substring-after(//title, 'Coyote Tracks - ') | ||
3 | author: //meta[@name="author"]/@content | ||
4 | date: //div[@class="post_header"]/a | ||
5 | |||
6 | # Content Pruning | ||
7 | strip: //div[@class="column left"] | ||
8 | strip: //div[@class="pages"] | ||
9 | strip: //a[@class="text_title"] | ||
10 | strip: //ol[@class="notes"] | ||
11 | |||
12 | dissolve: //div[@class='column right']/ul | ||
13 | dissolve: //li[@class='post'] | ||
14 | test_url: http://tracks.ranea.org/post/31431060205/the-next-big-uh-slightly-taller-thing \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/trailer.web-view.net.txt b/inc/3rdparty/site_config/standard/trailer.web-view.net.txt index e7a9c82d..e7a9c82d 100644..100755 --- a/inc/3rdparty/site_config/standard/trailer.web-view.net.txt +++ b/inc/3rdparty/site_config/standard/trailer.web-view.net.txt | |||
diff --git a/inc/3rdparty/site_config/standard/trailerzone.de.txt b/inc/3rdparty/site_config/standard/trailerzone.de.txt new file mode 100755 index 00000000..02151a63 --- /dev/null +++ b/inc/3rdparty/site_config/standard/trailerzone.de.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='video' or @id='main'] | ||
2 | |||
3 | strip_id_or_class: socialshareprivacy2 | ||
4 | strip_id_or_class: wp_rp_first | ||
5 | |||
6 | find_string: Genre</strong> | ||
7 | replace_string: </strong></p><p><strong>Genre</strong> | ||
8 | |||
9 | test_url: http://www.trailerzone.de/g-i-joe-2-die-abrechnung/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/traningslara.se.txt b/inc/3rdparty/site_config/standard/traningslara.se.txt index 96e491fa..d6cfb6db 100644..100755 --- a/inc/3rdparty/site_config/standard/traningslara.se.txt +++ b/inc/3rdparty/site_config/standard/traningslara.se.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //div[@class="Post-body"]//span[@class="PostHeader"] | 1 | title: //div[@class="Post-body"]//span[@class="PostHeader"] |
2 | author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"] | 2 | author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"] |
3 | date: substring-before(//div[@class="PostHeaderIcons metadata"], '|') | 3 | date: substring-before(//div[@class="PostHeaderIcons metadata"], '|') |
4 | body: //div[@class="Post-body"] | 4 | body: //div[@class="Post-body"] |
5 | strip_id_or_class: print1 | 5 | strip_id_or_class: print1 |
6 | strip_id_or_class: metadata | 6 | strip_id_or_class: metadata |
7 | strip_id_or_class: authorbox | 7 | strip_id_or_class: authorbox |
8 | test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/ \ No newline at end of file | 8 | test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/triblive.com.txt b/inc/3rdparty/site_config/standard/triblive.com.txt index 82797db9..663cafe1 100644..100755 --- a/inc/3rdparty/site_config/standard/triblive.com.txt +++ b/inc/3rdparty/site_config/standard/triblive.com.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //title | 1 | title: //title |
2 | author: //span/a | 2 | author: //span/a |
3 | date: substring-after(//small,'Published:') | 3 | date: substring-after(//small,'Published:') |
4 | 4 | ||
5 | strip: //h1[@class='vert_class'] | 5 | strip: //h1[@class='vert_class'] |
6 | strip: //h1[@class='headline'] | 6 | strip: //h1[@class='headline'] |
7 | strip: //img[contains(@src,'logo_triblive.gif')] | 7 | strip: //img[contains(@src,'logo_triblive.gif')] |
8 | 8 | ||
9 | #strip: //h6 | 9 | #strip: //h6 |
10 | #strip_img_src: logo_triblive.gif | 10 | #strip_img_src: logo_triblive.gif |
11 | 11 | ||
12 | single_page_link: //a[@class='stprint'] | 12 | single_page_link: //a[@class='stprint'] |
13 | test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case \ No newline at end of file | 13 | test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/truthdig.com.txt b/inc/3rdparty/site_config/standard/truthdig.com.txt index e7c1a4bc..9e0663b0 100644..100755 --- a/inc/3rdparty/site_config/standard/truthdig.com.txt +++ b/inc/3rdparty/site_config/standard/truthdig.com.txt | |||
@@ -1,10 +1,12 @@ | |||
1 | title: //div[@class='printbody']/h1 | 1 | title: //div[@class='printbody']/h1 |
2 | body: //div[@class='printbody'] | 2 | body: //div[@class='printbody'] |
3 | prune: no | 3 | prune: no |
4 | 4 | ||
5 | strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/'] | 5 | strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/'] |
6 | strip: //table[@class='footer'] | 6 | strip: //table[@class='footer'] |
7 | 7 | strip: //h6[contains(., 'http://')] | |
8 | single_page_link: //div[@class='article_tools']//a[contains(@href, '/print/')] | 8 | |
9 | 9 | single_page_link: //a[contains(@href, '/print/')] | |
10 | test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/ \ No newline at end of file | 10 | |
11 | test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/ | ||
12 | test_url: http://www.truthdig.com/dig/item/the_death_of_truth_20130505/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tthfanfic.org.txt b/inc/3rdparty/site_config/standard/tthfanfic.org.txt index 0dab5b0f..63537c10 100644..100755 --- a/inc/3rdparty/site_config/standard/tthfanfic.org.txt +++ b/inc/3rdparty/site_config/standard/tthfanfic.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | author: //a[starts-with(@href, '/AuthorStories')] | 2 | author: //a[starts-with(@href, '/AuthorStories')] |
3 | body: //div[@id='storyinnerbody'] | 3 | body: //div[@id='storyinnerbody'] |
4 | test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm \ No newline at end of file | 4 | test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tthor.com.txt b/inc/3rdparty/site_config/standard/tthor.com.txt index 902fcd13..902fcd13 100644..100755 --- a/inc/3rdparty/site_config/standard/tthor.com.txt +++ b/inc/3rdparty/site_config/standard/tthor.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/tuaw.com.txt b/inc/3rdparty/site_config/standard/tuaw.com.txt index b86f8ccb..2af00c27 100644..100755 --- a/inc/3rdparty/site_config/standard/tuaw.com.txt +++ b/inc/3rdparty/site_config/standard/tuaw.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class='posttitle'] | 1 | title: //h1[@class='posttitle'] |
2 | author: //span[@class='author']/a | 2 | author: //span[@class='author']/a |
3 | date: //span[@class='timestamp'] | 3 | date: //span[@class='timestamp'] |
4 | body: //div[@class='body'] | 4 | body: //div[@class='body'] |
5 | 5 | ||
6 | test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/ \ No newline at end of file | 6 | test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tuckreview.com.txt b/inc/3rdparty/site_config/standard/tuckreview.com.txt index a3946cbc..6e18e3da 100644..100755 --- a/inc/3rdparty/site_config/standard/tuckreview.com.txt +++ b/inc/3rdparty/site_config/standard/tuckreview.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class='post-title'] | 1 | title: //h1[@class='post-title'] |
2 | author: //div[@class='display-name'] | 2 | author: //div[@class='display-name'] |
3 | date: //div[@class='date'] | 3 | date: //div[@class='date'] |
4 | body: //div[@class='body'] | 4 | body: //div[@class='body'] |
5 | footnotes: no | 5 | footnotes: no |
6 | test_url: http://tuckreview.com/2012/8/14/migrating-to-v6 \ No newline at end of file | 6 | test_url: http://tuckreview.com/2012/8/14/migrating-to-v6 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/tvtropes.org.txt b/inc/3rdparty/site_config/standard/tvtropes.org.txt index 08dbba59..3cc3a9cf 100644..100755 --- a/inc/3rdparty/site_config/standard/tvtropes.org.txt +++ b/inc/3rdparty/site_config/standard/tvtropes.org.txt | |||
@@ -1,20 +1,20 @@ | |||
1 | # Google Custom Search | 1 | # Google Custom Search |
2 | strip_id_or_class: google_branding_style | 2 | strip_id_or_class: google_branding_style |
3 | 3 | ||
4 | # Avoid double title | 4 | # Avoid double title |
5 | strip_id_or_class: pagetitle | 5 | strip_id_or_class: pagetitle |
6 | 6 | ||
7 | # external links are labelled | 7 | # external links are labelled |
8 | strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif | 8 | strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif |
9 | 9 | ||
10 | title: //div[@class="pagetitle"] | 10 | title: //div[@class="pagetitle"] |
11 | body: //div[@id="wikitext"] | 11 | body: //div[@id="wikitext"] |
12 | 12 | ||
13 | # don't get clever. | 13 | # don't get clever. |
14 | strip_comments: no | 14 | strip_comments: no |
15 | prune: no | 15 | prune: no |
16 | 16 | ||
17 | # navigation in footer lives inside the wikitext div, annoyingly. | 17 | # navigation in footer lives inside the wikitext div, annoyingly. |
18 | strip_id_or_class: pathholder | 18 | strip_id_or_class: pathholder |
19 | 19 | ||
20 | test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters \ No newline at end of file | 20 | test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/twitter.com.txt b/inc/3rdparty/site_config/standard/twitter.com.txt index 12ab1546..520ebd85 100644..100755 --- a/inc/3rdparty/site_config/standard/twitter.com.txt +++ b/inc/3rdparty/site_config/standard/twitter.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //title | 1 | title: //title |
2 | body: (//p[contains(@class, 'js-tweet-text')])[1] | 2 | body: (//p[contains(@class, 'js-tweet-text')])[1] |
3 | author: (//strong[contains(@class, 'fullname')])[1] | 3 | author: (//strong[contains(@class, 'fullname')])[1] |
4 | date: //span[contains(@class, 'js-short-timestamp')]/@data-time | 4 | date: //span[contains(@class, 'js-short-timestamp')]/@data-time |
5 | 5 | ||
6 | prune: no | 6 | prune: no |
7 | tidy: no | 7 | tidy: no |
8 | 8 | ||
9 | test_url: https://twitter.com/medialens/status/216883678582804480 \ No newline at end of file | 9 | test_url: https://twitter.com/medialens/status/216883678582804480 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/uefa.com.txt b/inc/3rdparty/site_config/standard/uefa.com.txt index 088d6586..3469be03 100644..100755 --- a/inc/3rdparty/site_config/standard/uefa.com.txt +++ b/inc/3rdparty/site_config/standard/uefa.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText'] | 1 | body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText'] |
2 | strip: //div[contains(@class, 'mpindex')] | 2 | strip: //div[contains(@class, 'mpindex')] |
3 | prune: no | 3 | prune: no |
4 | tidy: no | 4 | tidy: no |
5 | 5 | ||
6 | test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html \ No newline at end of file | 6 | test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt b/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt index 29e19565..cd9c1361 100644..100755 --- a/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt +++ b/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt | |||
@@ -1,23 +1,23 @@ | |||
1 | # applies to uk.ds.ign.com, uk.wii.ign.com etc. | 1 | # applies to uk.ds.ign.com, uk.wii.ign.com etc. |
2 | # possibly to non-UK versions, but I can’t test that | 2 | # possibly to non-UK versions, but I can’t test that |
3 | 3 | ||
4 | title: //h1[@class="headline"] | 4 | title: //h1[@class="headline"] |
5 | author: //div[@class="hdr-sub byline"]/a | 5 | author: //div[@class="hdr-sub byline"]/a |
6 | date: //h2[@class="publish-date"]/span | 6 | date: //h2[@class="publish-date"]/span |
7 | body: //div[@id="main-article-content"] | 7 | body: //div[@id="main-article-content"] |
8 | 8 | ||
9 | strip: //ul[@class="lnks-readmore"] | 9 | strip: //ul[@class="lnks-readmore"] |
10 | 10 | ||
11 | strip: //div[@class="inlineImageCaption"] | 11 | strip: //div[@class="inlineImageCaption"] |
12 | # can’t make the images appear, so remove the captions | 12 | # can’t make the images appear, so remove the captions |
13 | 13 | ||
14 | strip: //div[@style="width:468px"] | 14 | strip: //div[@style="width:468px"] |
15 | # video caption links | 15 | # video caption links |
16 | 16 | ||
17 | convert_double_br_tags: yes | 17 | convert_double_br_tags: yes |
18 | 18 | ||
19 | strip_comments: no | 19 | strip_comments: no |
20 | # otherwise the ‘Closing Comments’ are removed | 20 | # otherwise the ‘Closing Comments’ are removed |
21 | 21 | ||
22 | # Ratings box could do with some rearranging, but it’s tricky | 22 | # Ratings box could do with some rearranging, but it’s tricky |
23 | test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html \ No newline at end of file | 23 | test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/uni-watch.com.txt b/inc/3rdparty/site_config/standard/uni-watch.com.txt index cbe87d19..4a5ae344 100644..100755 --- a/inc/3rdparty/site_config/standard/uni-watch.com.txt +++ b/inc/3rdparty/site_config/standard/uni-watch.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on') | 1 | author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on') |
2 | date: substring-after(//div[@class='post-byline'], ', on') | 2 | date: substring-after(//div[@class='post-byline'], ', on') |
3 | 3 | ||
4 | # for some reason, the following is producing a "no text [48]" error | 4 | # for some reason, the following is producing a "no text [48]" error |
5 | #title: //div[@class='post-headline'] | 5 | #title: //div[@class='post-headline'] |
6 | 6 | ||
7 | # for some reason, the following doesn't appear to isolate just the body copy | 7 | # for some reason, the following doesn't appear to isolate just the body copy |
8 | body: //div[@class='post-bodycopy'] | 8 | body: //div[@class='post-bodycopy'] |
9 | 9 | ||
10 | # we solve the above issue by stripping out everything else we don't want | 10 | # we solve the above issue by stripping out everything else we don't want |
11 | # these can probably all be removed if the body: command above worked | 11 | # these can probably all be removed if the body: command above worked |
12 | strip_id_or_class: reply | 12 | strip_id_or_class: reply |
13 | strip_id_or_class: left | 13 | strip_id_or_class: left |
14 | strip_id_or_class: post-headline | 14 | strip_id_or_class: post-headline |
15 | strip_id_or_class: post-byline | 15 | strip_id_or_class: post-byline |
16 | strip_id_or_class: footer | 16 | strip_id_or_class: footer |
17 | test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/ \ No newline at end of file | 17 | test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/unwinnable.com.txt b/inc/3rdparty/site_config/standard/unwinnable.com.txt new file mode 100755 index 00000000..05ad86a5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/unwinnable.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h1[@class='postTitle'] | ||
2 | author: //a[@rel='author'] | ||
3 | date: substring-before(//h4[@class='postAuthor'], '|') | ||
4 | body: //div[@class='postContent'] | ||
5 | |||
6 | strip: //div[@class='simplePullQuote'] | ||
7 | |||
8 | wrap_in(figure): //img | ||
9 | test_url: http://www.unwinnable.com/2013/04/23/gratifying-play/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/uppsalafria.se.txt b/inc/3rdparty/site_config/standard/uppsalafria.se.txt new file mode 100755 index 00000000..79c59ece --- /dev/null +++ b/inc/3rdparty/site_config/standard/uppsalafria.se.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')] | ||
2 | author: //article//div[contains(@class, 'field-byline')] | ||
3 | strip_id_or_class: rekommenderade | ||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: annonser | ||
6 | |||
7 | test_url: http://www.uppsalafria.se/artikel/97167 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/urbandictionary.com.txt b/inc/3rdparty/site_config/standard/urbandictionary.com.txt index 86061f77..385c95ca 100644..100755 --- a/inc/3rdparty/site_config/standard/urbandictionary.com.txt +++ b/inc/3rdparty/site_config/standard/urbandictionary.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //title | 1 | title: //title |
2 | body: //td[@id='content'] | 2 | body: //table[@id='entries'] |
3 | test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass \ No newline at end of file | 3 | test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass |
diff --git a/inc/3rdparty/site_config/standard/usatoday.com.txt b/inc/3rdparty/site_config/standard/usatoday.com.txt new file mode 100755 index 00000000..710a7b37 --- /dev/null +++ b/inc/3rdparty/site_config/standard/usatoday.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | date: //meta[@itemprop="datePublished"]/@content | ||
2 | author: //div[@itemprop="author"] | ||
3 | body: //div[@itemprop='articleBody'] | ||
4 | |||
5 | strip_id_or_class: share-tools | ||
6 | |||
7 | test_url: http://www.usatoday.com/story/news/world/2014/03/18/malaysia-plane-search/6552429/ | ||
8 | test_url: http://rssfeeds.usatoday.com/usatoday-NewsTopStories \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/usccb.org.txt b/inc/3rdparty/site_config/standard/usccb.org.txt index eb10a48f..30c28823 100644..100755 --- a/inc/3rdparty/site_config/standard/usccb.org.txt +++ b/inc/3rdparty/site_config/standard/usccb.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@id='CS_Element_maincontent'] | 1 | body: //div[@id='CS_Element_maincontent'] |
2 | 2 | ||
3 | tidy: no | 3 | tidy: no |
4 | prune: no | 4 | prune: no |
5 | 5 | ||
6 | test_url: http://www.usccb.org/bible/readings/072412.cfm \ No newline at end of file | 6 | test_url: http://www.usccb.org/bible/readings/072412.cfm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/useit.com.txt b/inc/3rdparty/site_config/standard/useit.com.txt index f6be84c4..b8511c7c 100644..100755 --- a/inc/3rdparty/site_config/standard/useit.com.txt +++ b/inc/3rdparty/site_config/standard/useit.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | 2 | ||
3 | date: substring-after(//p[@class='overline']/strong, ',') | 3 | date: substring-after(//p[@class='overline']/strong, ',') |
4 | body: //div[@class="maintext"] | 4 | body: //div[@class="maintext"] |
5 | strip: //p[@class='overline'] | 5 | strip: //p[@class='overline'] |
6 | strip: //h1 | 6 | strip: //h1 |
7 | tidy: no | 7 | tidy: no |
8 | test_url: http://www.useit.com/alertbox/mobile-startup-screen.html \ No newline at end of file | 8 | test_url: http://www.useit.com/alertbox/mobile-startup-screen.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/usfirst.org.txt b/inc/3rdparty/site_config/standard/usfirst.org.txt new file mode 100755 index 00000000..f02b2d3e --- /dev/null +++ b/inc/3rdparty/site_config/standard/usfirst.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //meta[@property='dc:title']/@content | ||
2 | date: //div[@class='content']//span[@property='dc:date']/@content | ||
3 | body: //div[@property='content:encoded'] | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.usfirst.org/roboticsprograms/frc/Photo-From-Kickoff-Filming \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/utdailybeacon.com.txt b/inc/3rdparty/site_config/standard/utdailybeacon.com.txt new file mode 100755 index 00000000..d37911bc --- /dev/null +++ b/inc/3rdparty/site_config/standard/utdailybeacon.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1 | ||
2 | author: //*[@class='byline'] | ||
3 | date: substring-after(//*[@class='pubdatetime'], 'Published: ') | ||
4 | body: //*[@class='body-block'] | ||
5 | test_url: http://utdailybeacon.com/news/2012/oct/8/energy-forum-continues/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ux.artu.tv.txt b/inc/3rdparty/site_config/standard/ux.artu.tv.txt index a893bda0..c69f2df9 100644..100755 --- a/inc/3rdparty/site_config/standard/ux.artu.tv.txt +++ b/inc/3rdparty/site_config/standard/ux.artu.tv.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | author: ("Arturo Toledo") | 1 | author: ("Arturo Toledo") |
2 | title: //div[@class="post"]/h2 | 2 | title: //div[@class="post"]/h2 |
3 | body: //div[@class="entry"] | 3 | body: //div[@class="entry"] |
4 | 4 | ||
5 | # Remove Twitter button | 5 | # Remove Twitter button |
6 | strip: //div[@class="entry"]/p[2]/a/img | 6 | strip: //div[@class="entry"]/p[2]/a/img |
7 | test_url: http://ux.artu.tv/?p=192 \ No newline at end of file | 7 | test_url: http://ux.artu.tv/?p=192 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt b/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt index 3661b06a..3661b06a 100644..100755 --- a/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt +++ b/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt | |||
diff --git a/inc/3rdparty/site_config/standard/vanityfair.com.txt b/inc/3rdparty/site_config/standard/vanityfair.com.txt index bfc47d1f..efa38224 100644..100755 --- a/inc/3rdparty/site_config/standard/vanityfair.com.txt +++ b/inc/3rdparty/site_config/standard/vanityfair.com.txt | |||
@@ -1,30 +1,30 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@property="og:title"]/@content |
2 | author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')] | 2 | author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')] |
3 | date: //div[contains(@class, 'cn_date_time')] | 3 | date: //div[contains(@class, 'cn_date_time')] |
4 | body: //div[contains(@class, 'pageContainers')] | 4 | body: //div[contains(@class, 'pageContainers')] |
5 | body: //article[@id='items-container'] | 5 | body: //article[@id='items-container'] |
6 | #body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container'] | 6 | #body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container'] |
7 | 7 | ||
8 | strip_id_or_class: bc | 8 | strip_id_or_class: bc |
9 | strip_id_or_class: utilities | 9 | strip_id_or_class: utilities |
10 | strip_id_or_class: list-supporting | 10 | strip_id_or_class: list-supporting |
11 | strip_id_or_class: yrail | 11 | strip_id_or_class: yrail |
12 | strip_id_or_class: urail | 12 | strip_id_or_class: urail |
13 | 13 | ||
14 | prune: no | 14 | prune: no |
15 | #tidy: no | 15 | #tidy: no |
16 | 16 | ||
17 | strip_id_or_class: super-rubric-section | 17 | strip_id_or_class: super-rubric-section |
18 | strip_id_or_class: cn_date_time | 18 | strip_id_or_class: cn_date_time |
19 | strip_id_or_class: cn_contributors | 19 | strip_id_or_class: cn_contributors |
20 | strip_id_or_class: cn_pagination_controls | 20 | strip_id_or_class: cn_pagination_controls |
21 | strip_id_or_class: cn_features_container | 21 | strip_id_or_class: cn_features_container |
22 | strip_id_or_class: global-footer | 22 | strip_id_or_class: global-footer |
23 | strip_id_or_class: cn_ecom_placement | 23 | strip_id_or_class: cn_ecom_placement |
24 | strip: //li[@class='blogNavPrev'] | 24 | strip: //li[@class='blogNavPrev'] |
25 | 25 | ||
26 | single_page_link: //a[@title='Print this page'] | 26 | single_page_link: //a[@title='Print this page'] |
27 | 27 | ||
28 | test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105 | 28 | test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105 |
29 | test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808 | 29 | test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808 |
30 | test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201 \ No newline at end of file | 30 | test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/varingen.no.txt b/inc/3rdparty/site_config/standard/varingen.no.txt index 6b5e0ae0..c0133c95 100644..100755 --- a/inc/3rdparty/site_config/standard/varingen.no.txt +++ b/inc/3rdparty/site_config/standard/varingen.no.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class='ArticleHeadlineDetailedView'] | 1 | title: //div[@class='ArticleHeadlineDetailedView'] |
2 | date: //span[@class='ArticlePublicationDateTimeDetailedView'] | 2 | date: //span[@class='ArticlePublicationDateTimeDetailedView'] |
3 | author://span[@class='ArticleBylineDetailedView'] | 3 | author://span[@class='ArticleBylineDetailedView'] |
4 | body: //div[@class='ArticleTextDetailedView'] | 4 | body: //div[@class='ArticleTextDetailedView'] |
5 | test_url: http://www.varingen.no/Nyheter/tabid/392/Default.aspx?ModuleId=56651&articleView=true \ No newline at end of file | 5 | test_url: http://www.varingen.no/Nyheter/tabid/392/Default.aspx?ModuleId=56651&articleView=true \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/varsity.co.uk.txt b/inc/3rdparty/site_config/standard/varsity.co.uk.txt index b1db4c35..dfbf69cf 100644..100755 --- a/inc/3rdparty/site_config/standard/varsity.co.uk.txt +++ b/inc/3rdparty/site_config/standard/varsity.co.uk.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | # FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser | 1 | # FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser |
2 | 2 | ||
3 | strip: //h2 | 3 | strip: //h2 |
4 | test_url: http://www.varsity.co.uk/reviews/2662 \ No newline at end of file | 4 | test_url: http://www.varsity.co.uk/reviews/2662 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vea.gov.vn.txt b/inc/3rdparty/site_config/standard/vea.gov.vn.txt new file mode 100755 index 00000000..9c8420ce --- /dev/null +++ b/inc/3rdparty/site_config/standard/vea.gov.vn.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title://div[@class="detail-new-title"] | ||
2 | body://div[@class="innerpad"] | ||
3 | strip://div[@class="ArticleUtility"] | ||
4 | strip://div[@class="commentPost"] | ||
5 | strip://div[@class="comment-box"] | ||
6 | strip://div[@id="TinLienQuan"] | ||
7 | test_url: http://vea.gov.vn/vn/tintuc/tintuchangngay/Pages/T%C4%83ng-c%C6%B0%E1%BB%9Dng-b%E1%BA%A3o-t%E1%BB%93n-%C4%91%E1%BB%99ng-v%E1%BA%ADt-hoang-d%C3%A3-%E1%BB%9F-Vi%E1%BB%87t-Nam.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vedomosti.ru.txt b/inc/3rdparty/site_config/standard/vedomosti.ru.txt index ba999171..265f9fc7 100644..100755 --- a/inc/3rdparty/site_config/standard/vedomosti.ru.txt +++ b/inc/3rdparty/site_config/standard/vedomosti.ru.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //td[@class='second_content']/h1 | 1 | title: //td[@class='second_content']/h1 |
2 | body: //td[@class='second_content']/div[@class='article_text'] | 2 | body: //td[@class='second_content']/div[@class='article_text'] |
3 | test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru \ No newline at end of file | 3 | test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/veggbilder.no.txt b/inc/3rdparty/site_config/standard/veggbilder.no.txt index 14144c0f..2a44c317 100644..100755 --- a/inc/3rdparty/site_config/standard/veggbilder.no.txt +++ b/inc/3rdparty/site_config/standard/veggbilder.no.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | author: //div[@class="blogginnleggForfatter"] | 1 | author: //div[@class="blogginnleggForfatter"] |
2 | date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd']) | 2 | date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd']) |
3 | strip: //div[contains(@id,"bloggDelingslenker")] | 3 | strip: //div[contains(@id,"bloggDelingslenker")] |
4 | strip: //div[contains(@id,"bloggDelingslenker")] | 4 | strip: //div[contains(@id,"bloggDelingslenker")] |
5 | test_url: http://veggbilder.no/blogginnlegg/fristelser \ No newline at end of file | 5 | test_url: http://veggbilder.no/blogginnlegg/fristelser \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vemedio.com.txt b/inc/3rdparty/site_config/standard/vemedio.com.txt index 294ace9c..d22fc5cf 100644..100755 --- a/inc/3rdparty/site_config/standard/vemedio.com.txt +++ b/inc/3rdparty/site_config/standard/vemedio.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2 | 1 | title: //h2 |
2 | date: substring-before(//small," • Permalink") | 2 | date: substring-before(//small," • Permalink") |
3 | author:string('Martin Hering') | 3 | author:string('Martin Hering') |
4 | 4 | ||
5 | Strip: //p/small | 5 | Strip: //p/small |
6 | test_url: http://vemedio.com/blog/posts/state-of-support-and-icloud \ No newline at end of file | 6 | test_url: http://vemedio.com/blog/posts/state-of-support-and-icloud \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/venturebeat.com.txt b/inc/3rdparty/site_config/standard/venturebeat.com.txt index 41bfa8c5..d6321d79 100644..100755 --- a/inc/3rdparty/site_config/standard/venturebeat.com.txt +++ b/inc/3rdparty/site_config/standard/venturebeat.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class="entry-title"] | 1 | title: //h1[@class="entry-title"] |
2 | author: //div[@class="author-name"] | 2 | author: //div[@class="author-name"] |
3 | date: //span[@class="the-time"] | 3 | date: //span[@class="the-time"] |
4 | body: //div[@class="entry-content"] | 4 | body: //div[@class="entry-content"] |
5 | strip: //div[@class="vb-gallery"] | 5 | strip: //div[@class="vb-gallery"] |
6 | test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1 \ No newline at end of file | 6 | test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/version2.dk.txt b/inc/3rdparty/site_config/standard/version2.dk.txt index 74203cad..418b83a1 100644..100755 --- a/inc/3rdparty/site_config/standard/version2.dk.txt +++ b/inc/3rdparty/site_config/standard/version2.dk.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //article/header/h1 | 1 | title: //article/header/h1 |
2 | 2 | ||
3 | author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a | 3 | author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a |
4 | date: //article/header/section[@class='byline']/span[@class='published']/span | 4 | date: //article/header/section[@class='byline']/span[@class='published']/span |
5 | 5 | ||
6 | body: //article/section[@class='body'] | 6 | body: //article/section[@class='body'] |
7 | 7 | ||
8 | convert_double_br_tags: yes | 8 | convert_double_br_tags: yes |
9 | 9 | ||
10 | # This is required, because Tidy chokes on the HTML5 tags... | 10 | # This is required, because Tidy chokes on the HTML5 tags... |
11 | tidy: no | 11 | tidy: no |
12 | test_url: http://www.version2.dk/artikel/17069-amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere \ No newline at end of file | 12 | test_url: http://www.version2.dk/artikel/17069-amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/verybestbaking.com.txt b/inc/3rdparty/site_config/standard/verybestbaking.com.txt index 4cdd0c0f..ad0fec66 100644..100755 --- a/inc/3rdparty/site_config/standard/verybestbaking.com.txt +++ b/inc/3rdparty/site_config/standard/verybestbaking.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | title: //title | 1 | title: //title |
2 | body: //div[contains(@class, 'printRecipe')] | 2 | body: //div[contains(@class, 'printRecipe')] |
3 | strip: //div[@class='recipeHeader'] | 3 | strip: //div[@class='recipeHeader'] |
4 | prune: no | 4 | prune: no |
5 | tidy: no | 5 | tidy: no |
6 | single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')] | 6 | single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')] |
7 | test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx \ No newline at end of file | 7 | test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vg.no.txt b/inc/3rdparty/site_config/standard/vg.no.txt index fceeea09..bfadb4a7 100644..100755 --- a/inc/3rdparty/site_config/standard/vg.no.txt +++ b/inc/3rdparty/site_config/standard/vg.no.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@id='artikkelspalte'] | 1 | body: //div[@id='artikkelspalte'] |
2 | strip_id_or_class: 'breadcrumb' | 2 | strip_id_or_class: 'breadcrumb' |
3 | test_url: http://www.vg.no/spill/artikkel.php?artid=10003628 \ No newline at end of file | 3 | test_url: http://www.vg.no/spill/artikkel.php?artid=10003628 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/video.forbes.com.txt b/inc/3rdparty/site_config/standard/video.forbes.com.txt index 1dca55a3..5db77463 100644..100755 --- a/inc/3rdparty/site_config/standard/video.forbes.com.txt +++ b/inc/3rdparty/site_config/standard/video.forbes.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: concat("Video: ", //div[@id='currentVideoTitleDivId']) | 1 | title: concat("Video: ", //div[@id='currentVideoTitleDivId']) |
2 | body: //div[@id='currentVideoDescriptionId'] | 2 | body: //div[@id='currentVideoDescriptionId'] |
3 | author: //meta[@name='author']/@content | 3 | author: //meta[@name='author']/@content |
4 | 4 | ||
5 | replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease | 5 | replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease |
6 | 6 | ||
7 | replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease | 7 | replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease |
8 | 8 | ||
9 | test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works \ No newline at end of file | 9 | test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/videogum.com.txt b/inc/3rdparty/site_config/standard/videogum.com.txt index a1663813..d93780ca 100644..100755 --- a/inc/3rdparty/site_config/standard/videogum.com.txt +++ b/inc/3rdparty/site_config/standard/videogum.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2[@class='posttitle'] | 1 | title: //h2[@class='posttitle'] |
2 | date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by') | 2 | date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by') |
3 | date: //span[@class='postdate'] | 3 | date: //span[@class='postdate'] |
4 | author: //span[@class='postdate']/a | 4 | author: //span[@class='postdate']/a |
5 | body: //div[@class='entry line_top'] | 5 | body: //div[@class='entry line_top'] |
6 | test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/ \ No newline at end of file | 6 | test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/villagevoice.com.txt b/inc/3rdparty/site_config/standard/villagevoice.com.txt index df374602..36e4a2f5 100644..100755 --- a/inc/3rdparty/site_config/standard/villagevoice.com.txt +++ b/inc/3rdparty/site_config/standard/villagevoice.com.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h2[@class='headline'] | 1 | title: //h2[@class='headline'] |
2 | 2 | ||
3 | body: //div[@class='ContentPrint'] | 3 | body: //div[@class='ContentPrint'] |
4 | 4 | ||
5 | prune: no | 5 | prune: no |
6 | 6 | ||
7 | single_page_link: //a[contains(@href, '/printVersion/')] | 7 | single_page_link: //a[contains(@href, '/printVersion/')] |
8 | 8 | ||
9 | test_url: http://www.villagevoice.com/2010-03-16/news/new-york-s-ten-worst-landlords/ \ No newline at end of file | 9 | test_url: http://www.villagevoice.com/2010-03-16/news/new-york-s-ten-worst-landlords/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vimeo.com.txt b/inc/3rdparty/site_config/standard/vimeo.com.txt index d6c6701a..f36c9c57 100644..100755 --- a/inc/3rdparty/site_config/standard/vimeo.com.txt +++ b/inc/3rdparty/site_config/standard/vimeo.com.txt | |||
@@ -1,17 +1,17 @@ | |||
1 | title: //title | 1 | title: //title |
2 | body: //iframe | 2 | body: //iframe |
3 | 3 | ||
4 | find_string: <html><iframe | 4 | find_string: <html><iframe |
5 | replace_string: <iframe id="video" | 5 | replace_string: <iframe id="video" |
6 | 6 | ||
7 | find_string: ></iframe></html> | 7 | find_string: ></iframe></html> |
8 | replace_string: ></iframe> | 8 | replace_string: ></iframe> |
9 | 9 | ||
10 | replace_string("): " | 10 | replace_string("): " |
11 | 11 | ||
12 | single_page_link: //link[@type='text/xml+oembed'] | 12 | single_page_link: //link[@type='text/xml+oembed'] |
13 | 13 | ||
14 | prune: no | 14 | prune: no |
15 | tidy: no | 15 | tidy: no |
16 | 16 | ||
17 | test_url: http://vimeo.com/35941909 \ No newline at end of file | 17 | test_url: http://vimeo.com/35941909 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/viply.de.txt b/inc/3rdparty/site_config/standard/viply.de.txt new file mode 100755 index 00000000..e3599c9d --- /dev/null +++ b/inc/3rdparty/site_config/standard/viply.de.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //div[@id='singletext']//h1 | ||
2 | body: //div[contains(@class, 'mypictureborder')] | //div[@id='singletext'] | ||
3 | prune: no | ||
4 | |||
5 | strip_id_or_class: singletostart | ||
6 | strip_id_or_class: navigation | ||
7 | strip_id_or_class: social | ||
8 | strip_id_or_class: single_topwrapper | ||
9 | strip: //a[contains(., 'Nächster Artikel')] | ||
10 | |||
11 | test_url: http://www.viply.de/?p=87973 | ||
12 | test_url: http://www.viply.de/?feed=rss2 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/visir.is.txt b/inc/3rdparty/site_config/standard/visir.is.txt index 0f03198e..04e09102 100644..100755 --- a/inc/3rdparty/site_config/standard/visir.is.txt +++ b/inc/3rdparty/site_config/standard/visir.is.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | # Author's name, when present, has 'skrifar:' ('writes:') appended to it. | 1 | # Author's name, when present, has 'skrifar:' ('writes:') appended to it. |
2 | # In case of multiple authors, this would be 'skrifa:', hence only 7 characters | 2 | # In case of multiple authors, this would be 'skrifa:', hence only 7 characters |
3 | # are stripped off. | 3 | # are stripped off. |
4 | author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7) | 4 | author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7) |
5 | 5 | ||
6 | date: //span[@class='date'] | 6 | date: //span[@class='date'] |
7 | title: //h1 | 7 | title: //h1 |
8 | body: //div[@class='paragraph'] | 8 | body: //div[@class='paragraph'] |
9 | 9 | ||
10 | # Strip out author string when present | 10 | # Strip out author string when present |
11 | strip: //div[@class='paragraph']/div[@class='meta'] | 11 | strip: //div[@class='paragraph']/div[@class='meta'] |
12 | 12 | ||
13 | convert_double_br_tags: yes | 13 | convert_double_br_tags: yes |
14 | test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997 \ No newline at end of file | 14 | test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vitispr.com.txt b/inc/3rdparty/site_config/standard/vitispr.com.txt index 8b2a300e..f2d11c7c 100644..100755 --- a/inc/3rdparty/site_config/standard/vitispr.com.txt +++ b/inc/3rdparty/site_config/standard/vitispr.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | strip: //*[(@id = "ja-search")] | 1 | strip: //*[(@id = "ja-search")] |
2 | body: //*[(@id = "ja-mainbody")] | 2 | body: //*[(@id = "ja-mainbody")] |
3 | body: //*[(@id = "content-mass-bottom")] | 3 | body: //*[(@id = "content-mass-bottom")] |
4 | strip://h3[contains(span,'Related Posts')] | 4 | strip://h3[contains(span,'Related Posts')] |
5 | strip://img | 5 | strip://img |
6 | test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot \ No newline at end of file | 6 | test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vivirmexico.com.txt b/inc/3rdparty/site_config/standard/vivirmexico.com.txt index e6a72700..e6a72700 100644..100755 --- a/inc/3rdparty/site_config/standard/vivirmexico.com.txt +++ b/inc/3rdparty/site_config/standard/vivirmexico.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/vnexpress.net.txt b/inc/3rdparty/site_config/standard/vnexpress.net.txt index 23c928bf..e5ebc435 100644..100755 --- a/inc/3rdparty/site_config/standard/vnexpress.net.txt +++ b/inc/3rdparty/site_config/standard/vnexpress.net.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table | 1 | body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table |
2 | strip://div[@class="box-item"] | 2 | strip://div[@class="box-item"] |
3 | strip://div[@id="ARTICLE_BANNER"] | 3 | strip://div[@id="ARTICLE_BANNER"] |
4 | strip://a | 4 | strip://a |
5 | strip://div[@class="tag-parent"] | 5 | strip://div[@class="tag-parent"] |
6 | strip://div[@class="email-print txtr"] | 6 | strip://div[@class="email-print txtr"] |
7 | 7 | ||
8 | test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/ \ No newline at end of file | 8 | test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt b/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt index 6bd0e855..b754aeb8 100644..100755 --- a/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt +++ b/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@class='entrytext'] | 2 | body: //div[@class='entrytext'] |
3 | test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html \ No newline at end of file | 3 | test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/vworker.com.txt b/inc/3rdparty/site_config/standard/vworker.com.txt index a39c9f4e..cfb9ea1c 100644..100755 --- a/inc/3rdparty/site_config/standard/vworker.com.txt +++ b/inc/3rdparty/site_config/standard/vworker.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[contains(@class, 'KonaBody')] | 1 | body: //div[contains(@class, 'KonaBody')] |
2 | 2 | ||
3 | test_url: http://www.vworker.com/RentACoder/misc/BidRequests/ShowBidRequest.asp?lngBidRequestId=1634186 \ No newline at end of file | 3 | test_url: http://www.vworker.com/RentACoder/misc/BidRequests/ShowBidRequest.asp?lngBidRequestId=1634186 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/waffle.wootest.net.txt b/inc/3rdparty/site_config/standard/waffle.wootest.net.txt index afcba0f3..e92757d7 100644..100755 --- a/inc/3rdparty/site_config/standard/waffle.wootest.net.txt +++ b/inc/3rdparty/site_config/standard/waffle.wootest.net.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h2[@class="title"] | 1 | title: //h2[@class="title"] |
2 | body: //div[@class="post"] | 2 | body: //div[@class="post"] |
3 | 3 | ||
4 | test_url: http://waffle.wootest.net/2011/06/22/on-reading-news/ \ No newline at end of file | 4 | test_url: http://waffle.wootest.net/2011/06/22/on-reading-news/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/walrusmagazine.com.txt b/inc/3rdparty/site_config/standard/walrusmagazine.com.txt index 3ab22172..c53eb0dd 100644..100755 --- a/inc/3rdparty/site_config/standard/walrusmagazine.com.txt +++ b/inc/3rdparty/site_config/standard/walrusmagazine.com.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | title: //div[@id='pr']/h3 | 1 | title: //div[@id='pr']/h3 |
2 | author: //div[@class='dateline']//a[contains(@href, '/author/')] | 2 | author: //div[@class='dateline']//a[contains(@href, '/author/')] |
3 | 3 | ||
4 | # print page | 4 | # print page |
5 | body: //div[@id='prbody'] | 5 | body: //div[@id='prbody'] |
6 | # standard page | 6 | # standard page |
7 | body: //div[@id='pgbody'] | 7 | body: //div[@id='pgbody'] |
8 | 8 | ||
9 | # for multi-page articles | 9 | # for multi-page articles |
10 | single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')] | 10 | single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')] |
11 | 11 | ||
12 | prune: no | 12 | prune: no |
13 | 13 | ||
14 | test_url: http://www.walrusmagazine.com/articles/2011.12-memoir-kidnapped \ No newline at end of file | 14 | test_url: http://www.walrusmagazine.com/articles/2011.12-memoir-kidnapped \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/warnerbros.fr.txt b/inc/3rdparty/site_config/standard/warnerbros.fr.txt index a41a3511..21f56352 100644..100755 --- a/inc/3rdparty/site_config/standard/warnerbros.fr.txt +++ b/inc/3rdparty/site_config/standard/warnerbros.fr.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h3 | 1 | title: //h3 |
2 | body: //div[@class="content_wysiwyg"] | 2 | body: //div[@class="content_wysiwyg"] |
3 | test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html \ No newline at end of file | 3 | test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/washingtoninstitute.org.txt b/inc/3rdparty/site_config/standard/washingtoninstitute.org.txt new file mode 100755 index 00000000..17f45677 --- /dev/null +++ b/inc/3rdparty/site_config/standard/washingtoninstitute.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class='main']//article | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.washingtoninstitute.org/policy-analysis/view/striking-syria-lessons-from-the-israeli-experience?goback=.gde_3822158_member_273623672 | ||
6 | test_url: http://www.washingtoninstitute.org/rss/11/10 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt b/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt index edf16422..8f8902a5 100644..100755 --- a/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt +++ b/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title://a[@class = 'headline-article'] | 1 | title://a[@class = 'headline-article'] |
2 | 2 | ||
3 | author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ') | 3 | author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ') |
4 | date://div[@class = 'article']/span[@class = 'date'] | 4 | date://div[@class = 'article']/span[@class = 'date'] |
5 | body://div[@class = 'article'] | 5 | body://div[@class = 'article'] |
6 | single_page_link://a[@class = 'print'] | 6 | single_page_link://a[@class = 'print'] |
7 | strip://p[@class = 'author'] | 7 | strip://p[@class = 'author'] |
8 | strip://a[@class = 'headline-article'] | 8 | strip://a[@class = 'headline-article'] |
9 | strip://span[@class = 'date'] | 9 | strip://span[@class = 'date'] |
10 | test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php \ No newline at end of file | 10 | test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/washingtonpost.com.txt b/inc/3rdparty/site_config/standard/washingtonpost.com.txt index 2931ca5f..0aa9f1d8 100644..100755 --- a/inc/3rdparty/site_config/standard/washingtonpost.com.txt +++ b/inc/3rdparty/site_config/standard/washingtonpost.com.txt | |||
@@ -1,21 +1,32 @@ | |||
1 | body: //div[@class="article_body"] | 1 | # Seems to be redirecting to articles.washingtonpost.com for many users |
2 | author://meta[@name='DC.creator']/@content | 2 | |
3 | title://meta[@name='title']/@content | 3 | body: //div[contains(@class, "article_body")] |
4 | date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title | 4 | # print view |
5 | date://meta[@name="DC.date.issued"]/@content | 5 | body: //div[@id='print_facet']//div[@id='body'] |
6 | strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] | 6 | |
7 | strip://div[@id="wp-column six end"] | 7 | author://meta[@name='DC.creator']/@content |
8 | strip://div[contains(@class,'hidden')] | 8 | title://meta[@name='title']/@content |
9 | strip://div[@id='article-side-rail'] | 9 | date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title |
10 | strip://div[@class="module component todays-paper-module curved"] | 10 | date://meta[@name="DC.date.issued"]/@content |
11 | strip://div[@class="module component live-qa curved img-border"] | 11 | strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] |
12 | strip://div[@class="module component newsletter-signup curved"] | 12 | strip://div[@id="wp-column six end"] |
13 | strip://div[@class="module featured-stories component curved img-border"] | 13 | strip://div[contains(@class,'hidden')] |
14 | 14 | strip://div[@id='article-side-rail'] | |
15 | strip_id_or_class: carousel | 15 | strip://div[@class="module component todays-paper-module curved"] |
16 | strip_id_or_class: toolbar | 16 | strip://div[@class="module component live-qa curved img-border"] |
17 | strip_id_or_class: module | 17 | strip://div[@class="module component newsletter-signup curved"] |
18 | 18 | strip://div[@class="module featured-stories component curved img-border"] | |
19 | test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 | 19 | |
20 | test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html | 20 | strip_id_or_class: carousel |
21 | strip_id_or_class: toolbar | ||
22 | strip_id_or_class: module | ||
23 | |||
24 | # Change gJQAwdJG4U_story.html to gJQAwdJG4U_print.html | ||
25 | single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html") | ||
26 | |||
27 | # [OLD] Change gJQAwdJG4U_story.html to gJQAwdJG4U_story_print.html | ||
28 | #single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_story_print.html") | ||
29 | |||
30 | test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 | ||
31 | test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html | ||
21 | test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html \ No newline at end of file | 32 | test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/web-libre.org.txt b/inc/3rdparty/site_config/standard/web-libre.org.txt index dfcd0081..9ed43a25 100644..100755 --- a/inc/3rdparty/site_config/standard/web-libre.org.txt +++ b/inc/3rdparty/site_config/standard/web-libre.org.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | body: //div[@id='template_article'] | 1 | body: //div[@id='template_article'] |
2 | 2 | ||
3 | strip_id_or_class: article_more | 3 | strip_id_or_class: article_more |
4 | strip: //hr | 4 | strip: //hr |
5 | 5 | ||
6 | test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html \ No newline at end of file | 6 | test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt b/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt index 9e75a8a8..578ba523 100644..100755 --- a/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt +++ b/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title://div[@class="post"]/h2 | 1 | title://div[@class="post"]/h2 |
2 | author://p[@class="postinfo"]/a | 2 | author://p[@class="postinfo"]/a |
3 | date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ') | 3 | date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ') |
4 | body://div[@class="contenttext"] | 4 | body://div[@class="contenttext"] |
5 | test_url: http://weblog.bignerdranch.com/?p=304 \ No newline at end of file | 5 | test_url: http://weblog.bignerdranch.com/?p=304 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/weblogs.asp.net.txt b/inc/3rdparty/site_config/standard/weblogs.asp.net.txt index 3fabda0b..7cfa49d2 100644..100755 --- a/inc/3rdparty/site_config/standard/weblogs.asp.net.txt +++ b/inc/3rdparty/site_config/standard/weblogs.asp.net.txt | |||
@@ -1,9 +1,9 @@ | |||
1 | title: //h2[@class="pageTitle"] | 1 | title: //h2[@class="pageTitle"] |
2 | strip: //div[@class="postfoot"] | 2 | strip: //div[@class="postfoot"] |
3 | strip: //h2[@class="pageTitle"] | 3 | strip: //h2[@class="pageTitle"] |
4 | strip: //h3[@class="pageTitle"] | 4 | strip: //h3[@class="pageTitle"] |
5 | body: //div[@class="post"] | 5 | body: //div[@class="post"] |
6 | author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed') | 6 | author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed') |
7 | date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by') | 7 | date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by') |
8 | 8 | ||
9 | test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx \ No newline at end of file | 9 | test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt b/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt index 8922b02f..cea10147 100644..100755 --- a/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt +++ b/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | tidy: no | 1 | tidy: no |
2 | dissolve: //div[@id="content"]/div/article/header | 2 | dissolve: //div[@id="content"]/div/article/header |
3 | body: //div[@id="content"]/div/article | 3 | body: //div[@id="content"]/div/article |
4 | title: //div[@id="content"]/div/article/h1 | 4 | title: //div[@id="content"]/div/article/h1 |
5 | date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"] | 5 | date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"] |
6 | strip: //div[@id="content"]/div/article/h1 | 6 | strip: //div[@id="content"]/div/article/h1 |
7 | 7 | ||
8 | test_url: http://webpaper.nzz.ch/2012/06/23/front/JJKMS/aphrodite-und-die-kommunisten?guest_pass=24a3ca5b6d%3AJJKMS%3Ad30e1be8628c099669671d4da56cdce4187790ba \ No newline at end of file | 8 | test_url: http://webpaper.nzz.ch/2012/06/23/front/JJKMS/aphrodite-und-die-kommunisten?guest_pass=24a3ca5b6d%3AJJKMS%3Ad30e1be8628c099669671d4da56cdce4187790ba \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/webwereld.nl.txt b/inc/3rdparty/site_config/standard/webwereld.nl.txt new file mode 100755 index 00000000..40a5aa36 --- /dev/null +++ b/inc/3rdparty/site_config/standard/webwereld.nl.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | strip: //*[@class="paginator"] | ||
2 | body: //*[@id="articleText"] | ||
3 | next_page_link: //a[@class="next"] | ||
4 | |||
5 | # No author detection | ||
6 | # No publishing date detection | ||
7 | # No author and intro deduplication over multiple pages | ||
8 | test_url: http://webwereld.nl/analyse/111452/de-code-van-dorifel-nader-bekeken.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/welt.de.txt b/inc/3rdparty/site_config/standard/welt.de.txt index 6e4f828f..42e65e97 100644..100755 --- a/inc/3rdparty/site_config/standard/welt.de.txt +++ b/inc/3rdparty/site_config/standard/welt.de.txt | |||
@@ -1,22 +1,22 @@ | |||
1 | # set body | 1 | # set body |
2 | tidy: no | 2 | tidy: no |
3 | body: //div[contains(@class, 'articleContent')] | 3 | body: //div[contains(@class, 'articleContent')] |
4 | 4 | ||
5 | # remove clutter | 5 | # remove clutter |
6 | strip: //div[@class='advertising'] | 6 | strip: //div[@class='advertising'] |
7 | strip: //div[@class='themenalarm'] | 7 | strip: //div[@class='themenalarm'] |
8 | strip: //div[contains(@class, 'inTextTeaser')] | 8 | strip: //div[contains(@class, 'inTextTeaser')] |
9 | 9 | ||
10 | # remove captions | 10 | # remove captions |
11 | strip: //span[@class='copyRight'] | 11 | strip: //span[@class='copyRight'] |
12 | 12 | ||
13 | # remove photo galleries and extras | 13 | # remove photo galleries and extras |
14 | strip: //div[contains(@class, 'textGallery')] | 14 | strip: //div[contains(@class, 'textGallery')] |
15 | strip: //div[contains(@class, 'videoGallery')] | 15 | strip: //div[contains(@class, 'videoGallery')] |
16 | strip: //div[contains(@class, 'imageGallery')] | 16 | strip: //div[contains(@class, 'imageGallery')] |
17 | strip: //div[contains(@class, 'openContent')] | 17 | strip: //div[contains(@class, 'openContent')] |
18 | 18 | ||
19 | # remove comments | 19 | # remove comments |
20 | strip: //div[@id = 'writeComment'] | 20 | strip: //div[@id = 'writeComment'] |
21 | 21 | ||
22 | test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html \ No newline at end of file | 22 | test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/westhamtillidie.com.txt b/inc/3rdparty/site_config/standard/westhamtillidie.com.txt index b9343029..3132e98a 100644..100755 --- a/inc/3rdparty/site_config/standard/westhamtillidie.com.txt +++ b/inc/3rdparty/site_config/standard/westhamtillidie.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: substring-before(//title, '«') | 1 | title: substring-before(//title, '«') |
2 | 2 | ||
3 | body: //div[@class='entry'] | 3 | body: //div[@class='entry'] |
4 | strip: //div[@class='sharing_label'] | 4 | strip: //div[@class='sharing_label'] |
5 | strip: //div[@class='snap_nopreview sharing robots-nocontent'] | 5 | strip: //div[@class='snap_nopreview sharing robots-nocontent'] |
6 | test_url: http://www.westhamtillidie.com/2012/03/11/twelve-things-we-learned-from-the-doncaster-game/ \ No newline at end of file | 6 | test_url: http://www.westhamtillidie.com/2012/03/11/twelve-things-we-learned-from-the-doncaster-game/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt b/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt index a88a02c9..a88a02c9 100644..100755 --- a/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt +++ b/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt b/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt index 52c5cf1b..100a8c88 100644..100755 --- a/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt +++ b/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | strip: //div[@class="navigation"] | 1 | strip: //div[@class="navigation"] |
2 | strip: //div[@id="sidebar"] | 2 | strip: //div[@id="sidebar"] |
3 | strip: //div[@id="post-extra-content"] | 3 | strip: //div[@id="post-extra-content"] |
4 | strip: //div[@id="footer"] | 4 | strip: //div[@id="footer"] |
5 | strip: //div[contains(@class, "sharing")] | 5 | strip: //div[contains(@class, "sharing")] |
6 | 6 | ||
7 | test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/ \ No newline at end of file | 7 | test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wheelyric.com.txt b/inc/3rdparty/site_config/standard/wheelyric.com.txt index aa9783cf..b9eeaa0c 100644..100755 --- a/inc/3rdparty/site_config/standard/wheelyric.com.txt +++ b/inc/3rdparty/site_config/standard/wheelyric.com.txt | |||
@@ -1,11 +1,11 @@ | |||
1 | body://div[contains(@class,'oAndtLyrics')] | 1 | body://div[contains(@class,'oAndtLyrics')] |
2 | strip://div[contains(@class,'info')] | 2 | strip://div[contains(@class,'info')] |
3 | strip://div[contains(@id,'romanization')] | 3 | strip://div[contains(@id,'romanization')] |
4 | strip://div[contains(@id,'youtube')] | 4 | strip://div[contains(@id,'youtube')] |
5 | strip://div[contains(@id,'romanizationSelector')] | 5 | strip://div[contains(@id,'romanizationSelector')] |
6 | strip://div[contains(@id,'langSelectWrap')] | 6 | strip://div[contains(@id,'langSelectWrap')] |
7 | strip://div[contains(@id,'requestTranslationWrap')] | 7 | strip://div[contains(@id,'requestTranslationWrap')] |
8 | strip://div[contains(@id,'viewMore')] | 8 | strip://div[contains(@id,'viewMore')] |
9 | strip://div[contains(@class,'lyricsListInMainContent')] | 9 | strip://div[contains(@class,'lyricsListInMainContent')] |
10 | strip://div[contains(@class,'descIpNoti')] | 10 | strip://div[contains(@class,'descIpNoti')] |
11 | test_url: http://wheelyric.com/lyrics/121#2 \ No newline at end of file | 11 | test_url: http://wheelyric.com/lyrics/121#2 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt b/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt index 1f262a0a..b80fe5d1 100644..100755 --- a/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt +++ b/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='content'] | 2 | body: //div[@id='content'] |
3 | strip_id_or_class: editsection | 3 | strip_id_or_class: editsection |
4 | strip_id_or_class: toc | 4 | strip_id_or_class: toc |
5 | strip: //div[@id='siteNotice'] | 5 | strip: //div[@id='siteNotice'] |
6 | strip: //div[@id='content']//table[last()] | 6 | strip: //div[@id='content']//table[last()] |
7 | prune: no | 7 | prune: no |
8 | test_url: http://wiki.guildwars.com/wiki/Monk \ No newline at end of file | 8 | test_url: http://wiki.guildwars.com/wiki/Monk \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt b/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt index e176907e..e9233998 100644..100755 --- a/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt +++ b/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt | |||
@@ -1,8 +1,8 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id='content'] | 2 | body: //div[@id='content'] |
3 | strip_id_or_class: editsection | 3 | strip_id_or_class: editsection |
4 | strip_id_or_class: toc | 4 | strip_id_or_class: toc |
5 | strip: //div[@id='siteNotice'] | 5 | strip: //div[@id='siteNotice'] |
6 | strip: //div[@id='content']//table[last()] | 6 | strip: //div[@id='content']//table[last()] |
7 | prune: no | 7 | prune: no |
8 | test_url: http://wiki.guildwars2.com/wiki/Guardian \ No newline at end of file | 8 | test_url: http://wiki.guildwars2.com/wiki/Guardian \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wikihow.com.txt b/inc/3rdparty/site_config/standard/wikihow.com.txt new file mode 100755 index 00000000..fe95d3f9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wikihow.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | # ...&printable=yes | ||
2 | body: //div[@id='bodycontents'] | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | strip_id_or_class: gatEditSection | ||
6 | strip_id_or_class: relatedwikihows | ||
7 | #strip: //div[contains(@class, 'step_num')] | ||
8 | |||
9 | replace_string(<script ): <div style="display: none" | ||
10 | replace_string(</script>): </div> | ||
11 | |||
12 | single_page_link: //a[@id='gatPrintView'] | ||
13 | single_page_link: concat(//link[@rel='canonical']/@href, '?printable=yes') | ||
14 | |||
15 | test_url: http://www.wikihow.com/Start-Your-Own-Country \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wikitravel.org.txt b/inc/3rdparty/site_config/standard/wikitravel.org.txt index da5bd0b5..1f32a372 100644..100755 --- a/inc/3rdparty/site_config/standard/wikitravel.org.txt +++ b/inc/3rdparty/site_config/standard/wikitravel.org.txt | |||
@@ -1,14 +1,14 @@ | |||
1 | # copied from .wikipedia.org.txt | 1 | # copied from .wikipedia.org.txt |
2 | title: //h1[@id='firstHeading' or @class='firstHeading'] | 2 | title: //h1[@id='firstHeading' or @class='firstHeading'] |
3 | body: //div[@id = 'bodyContent'] | 3 | body: //div[@id = 'bodyContent'] |
4 | strip_id_or_class: editsection | 4 | strip_id_or_class: editsection |
5 | #strip_id_or_class: toc | 5 | #strip_id_or_class: toc |
6 | strip_id_or_class: vertical-navbox | 6 | strip_id_or_class: vertical-navbox |
7 | strip: //table[@id='toc'] | //div[@id='p-toc'] | 7 | strip: //table[@id='toc'] | //div[@id='p-toc'] |
8 | strip: //div[@id='catlinks' or @id='contentSub'] | 8 | strip: //div[@id='catlinks' or @id='contentSub'] |
9 | strip: //div[@id='jump-to-nav'] | 9 | strip: //div[@id='jump-to-nav'] |
10 | strip: //div[@class='thumbcaption']//div[@class='magnify'] | 10 | strip: //div[@class='thumbcaption']//div[@class='magnify'] |
11 | strip: //table[@class='navbox'] | 11 | strip: //table[@class='navbox'] |
12 | prune: no | 12 | prune: no |
13 | tidy: no | 13 | tidy: no |
14 | test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes \ No newline at end of file | 14 | test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/will-self.com.txt b/inc/3rdparty/site_config/standard/will-self.com.txt index 24467c22..394f9ca4 100644..100755 --- a/inc/3rdparty/site_config/standard/will-self.com.txt +++ b/inc/3rdparty/site_config/standard/will-self.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | strip: //div[@class="widget-area"] | 1 | strip: //div[@class="widget-area"] |
2 | title: //*[@class="entry-title"] | 2 | title: //*[@class="entry-title"] |
3 | date: //time[@class="entry-date"] | 3 | date: //time[@class="entry-date"] |
4 | test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/ \ No newline at end of file | 4 | test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/williampfaff.com.txt b/inc/3rdparty/site_config/standard/williampfaff.com.txt index fb5f92ed..cefabec0 100644..100755 --- a/inc/3rdparty/site_config/standard/williampfaff.com.txt +++ b/inc/3rdparty/site_config/standard/williampfaff.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: substring-after(//span[@class='itemTitle'], ':') | 1 | title: substring-after(//span[@class='itemTitle'], ':') |
2 | body: //div[@id='content'] | 2 | body: //div[@id='content'] |
3 | test_url: http://www.williampfaff.com/modules/news/article.php?storyid=491 \ No newline at end of file | 3 | test_url: http://www.williampfaff.com/modules/news/article.php?storyid=491 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/winfuture.de.txt b/inc/3rdparty/site_config/standard/winfuture.de.txt index bc936370..dddc6f9e 100644..100755 --- a/inc/3rdparty/site_config/standard/winfuture.de.txt +++ b/inc/3rdparty/site_config/standard/winfuture.de.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | title: //h1/span | 1 | title: //h1/span |
2 | 2 | ||
3 | body: //div[@id="news_content"] | 3 | body: //div[@id="news_content"] |
4 | 4 | ||
5 | author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text() | 5 | author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text() |
6 | 6 | ||
7 | date: //span[@class='date'] | 7 | date: //span[@class='date'] |
8 | 8 | ||
9 | # Rubrikenbild entfernen | 9 | # Rubrikenbild entfernen |
10 | strip: //div[@id="news_content"]/a[1] | 10 | strip: //div[@id="news_content"]/a[1] |
11 | 11 | ||
12 | test_url: http://winfuture.de/news,69672.html \ No newline at end of file | 12 | test_url: http://winfuture.de/news,69672.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/winrumors.com.txt b/inc/3rdparty/site_config/standard/winrumors.com.txt index cedb4390..f25f9c9e 100644..100755 --- a/inc/3rdparty/site_config/standard/winrumors.com.txt +++ b/inc/3rdparty/site_config/standard/winrumors.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h1[@class='page-heading'] | 1 | title: //h1[@class='page-heading'] |
2 | author: //small/strong/a | 2 | author: //small/strong/a |
3 | #their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago' instead of providing a useful date/time | 3 | #their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago' instead of providing a useful date/time |
4 | date: substring-before(substring-after(//small,'on'),'with') | 4 | date: substring-before(substring-after(//small,'on'),'with') |
5 | body: //div[@class='entry'] | 5 | body: //div[@class='entry'] |
6 | test_url: http://www.winrumors.com/chinese-windows-phone-launch-still-on-track-for-early-2012/ \ No newline at end of file | 6 | test_url: http://www.winrumors.com/chinese-windows-phone-launch-still-on-track-for-early-2012/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/winsupersite.com.txt b/inc/3rdparty/site_config/standard/winsupersite.com.txt index db6a6fc9..f725b67a 100644..100755 --- a/inc/3rdparty/site_config/standard/winsupersite.com.txt +++ b/inc/3rdparty/site_config/standard/winsupersite.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | date: //*[@class='kicker'] | 1 | date: //*[@class='kicker'] |
2 | body: //*[@class='KonaBody'] | 2 | body: //*[@class='KonaBody'] |
3 | test_url: http://www.winsupersite.com/article/paul-thurrotts-wininfo/android-malware-surges-separate-studies-141364 \ No newline at end of file | 3 | test_url: http://www.winsupersite.com/article/paul-thurrotts-wininfo/android-malware-surges-separate-studies-141364 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wired.com.txt b/inc/3rdparty/site_config/standard/wired.com.txt index 69bbf5b7..f5a72d14 100644..100755 --- a/inc/3rdparty/site_config/standard/wired.com.txt +++ b/inc/3rdparty/site_config/standard/wired.com.txt | |||
@@ -1,22 +1,25 @@ | |||
1 | title: //meta[@property="og:title"]/@content | 1 | title: //meta[@name='Title']/@content |
2 | title: //h1 | 2 | author: //meta[@name='Author']/@content |
3 | title: //*[@class='posttitle'] | 3 | date: //meta[@name='DisplayDate']/@content |
4 | author: //*[@class='entryAuthor']/a[1] | 4 | body: //div[@class='entry'] |
5 | author://*[@class='member-title'] | 5 | strip: //p[contains(., 'Pages:') and contains(., 'View All')] |
6 | author://li[@class='author']/a[contains(@href, '/author/')] | 6 | strip: //p[@class='caption'] |
7 | date: substring-after(//div[@class='entryAuthor'], '·') | 7 | strip: //div[@class='desc' or @class='slide' or @id='slide-info'] |
8 | date: substring-before(//*[@class='entryDate'], '|') | 8 | |
9 | body: //div[@class='entry'] | 9 | strip_id_or_class: pullquote |
10 | strip: //span[contains(@class, 'nextprev')] | 10 | strip_id_or_class: left_rail |
11 | #strip_id_or_class: ngg-galleryoverview | 11 | strip_id_or_class: related-container |
12 | # ngg-galleryoverview is the whole content sometimes, e.g. http://www.wired.com/underwire/2011/12/best-mixtapes-of-2011/?pid=5736&viewall=true | 12 | strip_id_or_class: radvert-caption-wrap |
13 | 13 | ||
14 | strip: //p[span[contains(@class, 'contentjump')]] | 14 | # Remove gallery? |
15 | strip: //text()[contains(., 'nextpage')] | 15 | strip_id_or_class: wpgallery |
16 | 16 | ||
17 | prune: no | 17 | #strip: //text()[contains(., 'nextpage')] |
18 | 18 | ||
19 | single_page_link: //a[contains(@href, '/all/1') and contains(@class, 'contentjumpall')] | 19 | prune: no |
20 | 20 | ||
21 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ | 21 | single_page_link: //a[.='View All' and contains(@href, '/all/')] |
22 | test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/1 \ No newline at end of file | 22 | |
23 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ | ||
24 | test_url: http://www.wired.com/wiredenterprise/2013/09/docker/ | ||
25 | test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/ | ||
diff --git a/inc/3rdparty/site_config/standard/wmnf.org.txt b/inc/3rdparty/site_config/standard/wmnf.org.txt index ffb6b2d1..1d403a91 100644..100755 --- a/inc/3rdparty/site_config/standard/wmnf.org.txt +++ b/inc/3rdparty/site_config/standard/wmnf.org.txt | |||
@@ -1,13 +1,13 @@ | |||
1 | title: //div[@class="bodyText"]/h1/text() | 1 | title: //div[@class="bodyText"]/h1/text() |
2 | body: //div[@class="bodyText"] | 2 | body: //div[@class="bodyText"] |
3 | 3 | ||
4 | # author and date are separated by only a newline | 4 | # author and date are separated by only a newline |
5 | # can't figure out how to tokenize that yet | 5 | # can't figure out how to tokenize that yet |
6 | author: //div[@class="bodyText"]/span[@class="info"]/text() | 6 | author: //div[@class="bodyText"]/span[@class="info"]/text() |
7 | date: //div[@class="bodyText"]/span[@class="info"]/text() | 7 | date: //div[@class="bodyText"]/span[@class="info"]/text() |
8 | 8 | ||
9 | # strip metadata from body text | 9 | # strip metadata from body text |
10 | strip: //div[@class="bodyText"]/h1/text() | 10 | strip: //div[@class="bodyText"]/h1/text() |
11 | strip: //div[@class="bodyText"]/span[@class="info"] | 11 | strip: //div[@class="bodyText"]/span[@class="info"] |
12 | strip: //div[@class="bodyText"]/span[@class="info"] | 12 | strip: //div[@class="bodyText"]/span[@class="info"] |
13 | test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas \ No newline at end of file | 13 | test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wmpoweruser.com.txt b/inc/3rdparty/site_config/standard/wmpoweruser.com.txt index d9011d24..70168fbe 100644..100755 --- a/inc/3rdparty/site_config/standard/wmpoweruser.com.txt +++ b/inc/3rdparty/site_config/standard/wmpoweruser.com.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | date://*[@class="entry-date"] | 1 | date://*[@class="entry-date"] |
2 | author://*[@class="author vcard"] | 2 | author://*[@class="author vcard"] |
3 | strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"] | 3 | strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"] |
4 | test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/ \ No newline at end of file | 4 | test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/worldpoultry.net.txt b/inc/3rdparty/site_config/standard/worldpoultry.net.txt index 0e42ca5e..b88f9279 100644..100755 --- a/inc/3rdparty/site_config/standard/worldpoultry.net.txt +++ b/inc/3rdparty/site_config/standard/worldpoultry.net.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title: //div[@class="content article"]/h1 | 1 | title: //div[@class="content article"]/h1 |
2 | date: substring-after(//*[@class='date'], '//') | 2 | date: substring-after(//*[@class='date'], '//') |
3 | body: //*[@class='article-content'] | 3 | body: //*[@class='article-content'] |
4 | strip: //*[@id='nomodal'] | 4 | strip: //*[@id='nomodal'] |
5 | test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html \ No newline at end of file | 5 | test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/worldwidewords.org.txt b/inc/3rdparty/site_config/standard/worldwidewords.org.txt index 733d607f..4682e0d3 100644..100755 --- a/inc/3rdparty/site_config/standard/worldwidewords.org.txt +++ b/inc/3rdparty/site_config/standard/worldwidewords.org.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //p[@id='content'] | 1 | title: //p[@id='content'] |
2 | 2 | ||
3 | body: //div[@class='contentblock'] | 3 | body: //div[@class='contentblock'] |
4 | test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm \ No newline at end of file | 4 | test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wow.joystiq.com.txt b/inc/3rdparty/site_config/standard/wow.joystiq.com.txt index 759fb81f..44add9c9 100644..100755 --- a/inc/3rdparty/site_config/standard/wow.joystiq.com.txt +++ b/inc/3rdparty/site_config/standard/wow.joystiq.com.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | title: //h2[@class="posttitle"] | 1 | title: //h2[@class="posttitle"] |
2 | body: //div[@class="post"] | 2 | body: //div[@class="post"] |
3 | strip: //h2[@class="posttitle"] | 3 | strip: //h2[@class="posttitle"] |
4 | strip: //p[@class="filed-under"] | 4 | strip: //p[@class="filed-under"] |
5 | convert_double_br_tags: yes | 5 | convert_double_br_tags: yes |
6 | test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/ \ No newline at end of file | 6 | test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wpmayor.com.txt b/inc/3rdparty/site_config/standard/wpmayor.com.txt new file mode 100755 index 00000000..bb4fffc7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wpmayor.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[@id='nrelate_flyout_placeholder'] | ||
2 | |||
3 | strip_id_or_class: share | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.wpmayor.com/themes/wordpress-portfolio-resume-themes/ | ||
8 | test_url: http://www.wpmayor.com/feed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wtatennis.com.txt b/inc/3rdparty/site_config/standard/wtatennis.com.txt new file mode 100755 index 00000000..1000ab26 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wtatennis.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[contains(@class, 'header-2')] | ||
2 | body: //article//*[contains(@class, 'teaserText') or contains(@class, 'lastUpdated') or contains(@class, 'image') or contains(@class, 'body')] | ||
3 | strip_id_or_class: articleIndex | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.wtatennis.com/news/article/3190914 | ||
7 | test_url: http://www.wtatennis.com/news/article/3190244 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt b/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt index 0846be2c..97a5c19d 100644..100755 --- a/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt +++ b/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | body://div[@id='articleNew'] | 1 | body://div[@id='articleNew'] |
2 | strip://div[@id='articleBy'] | 2 | strip://div[@id='articleBy'] |
3 | strip://div[@id='articleDate'] | 3 | strip://div[@id='articleDate'] |
4 | strip://td[@class='articleGraphicCredit'] | 4 | strip://td[@class='articleGraphicCredit'] |
5 | strip://h1 | 5 | strip://h1 |
6 | strip://div[@id='articleEnd'] | 6 | strip://div[@id='articleEnd'] |
7 | strip://p[@class='tagline'] | 7 | strip://p[@class='tagline'] |
8 | strip://div[@class='openBox adslibraryArticle'] | 8 | strip://div[@class='openBox adslibraryArticle'] |
9 | strip_id_or_class:ad-180x150-1 | 9 | strip_id_or_class:ad-180x150-1 |
10 | 10 | ||
11 | 11 | ||
12 | title: //div[@id="articleNew"]/h1 | 12 | title: //div[@id="articleNew"]/h1 |
13 | author: //div[@id="articleBy"]/p/b | 13 | author: //div[@id="articleBy"]/p/b |
14 | date: substring-before(//div[@id="articleDate"], "-") | 14 | date: substring-before(//div[@id="articleDate"], "-") |
15 | test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml \ No newline at end of file | 15 | test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt b/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt index 71306af2..71306af2 100644..100755 --- a/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt +++ b/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt | |||
diff --git a/inc/3rdparty/site_config/standard/wyborcza.pl.txt b/inc/3rdparty/site_config/standard/wyborcza.pl.txt index f99467c2..638583dc 100644..100755 --- a/inc/3rdparty/site_config/standard/wyborcza.pl.txt +++ b/inc/3rdparty/site_config/standard/wyborcza.pl.txt | |||
@@ -1,11 +1,9 @@ | |||
1 | title:h1 | 1 | body: //div[@id='article'] |
2 | author: //*[@class = 'author'] | 2 | strip: //div[@class='head'] |
3 | date: //*[@class = 'date'] | 3 | |
4 | body: //*[@id = 'art'] | 4 | strip_id_or_class: txt_upl |
5 | next_page_link: //*[@id='Str']/a[contains(text(), 'nastepne')] | 5 | |
6 | strip: //*[@class = 'rel_zdjTOP'] | 6 | single_page_link: //div[@id='gazeta_article_tools']//a[contains(@class, 'print')] |
7 | strip: //*[@id = 'rel'] | 7 | |
8 | strip: //*[@class = 'txt_upl'] | 8 | test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x |
9 | strip: //*[@id='Str'] | 9 | test_url: http://wyborcza.pl/1,75478,14880255,Biskup_Dydycz_o_pedofilii_i_tajemnicy_spowiedzi__Zamiast.html \ No newline at end of file |
10 | strip: //*[@id='source'] | ||
11 | test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wyctim.com.txt b/inc/3rdparty/site_config/standard/wyctim.com.txt index d8c8713b..bd7ecf2a 100644..100755 --- a/inc/3rdparty/site_config/standard/wyctim.com.txt +++ b/inc/3rdparty/site_config/standard/wyctim.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | body: //div[@class='article-body'] | 1 | body: //div[@class='article-body'] |
2 | title: //h1 | 2 | title: //h1 |
3 | test_url: http://wyctim.com/icloud-sync-regebbi-rendszereken/ \ No newline at end of file | 3 | test_url: http://wyctim.com/icloud-sync-regebbi-rendszereken/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/wz-newsline.de.txt b/inc/3rdparty/site_config/standard/wz-newsline.de.txt index fbc1d3d2..5b2be744 100644..100755 --- a/inc/3rdparty/site_config/standard/wz-newsline.de.txt +++ b/inc/3rdparty/site_config/standard/wz-newsline.de.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title://h1 | 1 | title://h1 |
2 | 2 | ||
3 | date://p[@class='articleDate'] | 3 | date://p[@class='articleDate'] |
4 | body://div[@class='articleBody wzStandardArticle'] | 4 | body://div[@class='articleBody wzStandardArticle'] |
5 | test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050 \ No newline at end of file | 5 | test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050 \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/xfgjls.com.txt b/inc/3rdparty/site_config/standard/xfgjls.com.txt new file mode 100755 index 00000000..2dc247a0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/xfgjls.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.xfgjls.com/magazine/html/?131.html | ||
3 | # http://www.xfgjls.com/magazine/html/?170.html | ||
4 | |||
5 | body://h3/following-sibling::div | ||
6 | title: //h3 | ||
7 | date: substring-before(//h3/following-sibling::div/p, ' ') | ||
8 | author: substring-before(substring-after(//h3/following-sibling::div/p, '作者:'), '来源') | ||
9 | wrap_in(strong)://span[contains(@style, "FONT-WEIGHT: bold")] | ||
10 | dissolve://span[@style="FONT-FAMILY: '宋体'; FONT-SIZE: 10.5pt; FONT-WEIGHT: bold; mso-spacerun: 'yes'"] | ||
11 | test_url: http://www.xfgjls.com/magazine/html/?170.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/xoeb.us.txt b/inc/3rdparty/site_config/standard/xoeb.us.txt index e02960e0..c09fa4df 100644..100755 --- a/inc/3rdparty/site_config/standard/xoeb.us.txt +++ b/inc/3rdparty/site_config/standard/xoeb.us.txt | |||
@@ -1,4 +1,4 @@ | |||
1 | title: //h1[@class="entry-title"] | 1 | title: //h1[@class="entry-title"] |
2 | author: //span[@class="fn"] | 2 | author: //span[@class="fn"] |
3 | date: //p[@class="meta"] | 3 | date: //p[@class="meta"] |
4 | test_url: http://xoeb.us/blog/2012/03/16/my-mistakes-with-our-first-release/ \ No newline at end of file | 4 | test_url: http://xoeb.us/blog/2012/03/16/my-mistakes-with-our-first-release/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/yated.com.txt b/inc/3rdparty/site_config/standard/yated.com.txt index 13a3ea64..13a3ea64 100644..100755 --- a/inc/3rdparty/site_config/standard/yated.com.txt +++ b/inc/3rdparty/site_config/standard/yated.com.txt | |||
diff --git a/inc/3rdparty/site_config/standard/ynet.co.il.txt b/inc/3rdparty/site_config/standard/ynet.co.il.txt new file mode 100755 index 00000000..aa86566a --- /dev/null +++ b/inc/3rdparty/site_config/standard/ynet.co.il.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | body: //span[@id='article_content' or @class='text16g'] | ||
2 | |||
3 | # ads | ||
4 | strip: //div[.//div[contains(@id, 'ads.')]] | ||
5 | # related content heading | ||
6 | strip: //p[contains(., 'עוד בערוץ החדשות של ynet:')] | ||
7 | strip: //p[contains(., 'כותרות אחרונות מהעולם בחדשות ynet:')] | ||
8 | strip: //div[contains(., 'אינציקלופדיית ynet:')] | ||
9 | # related content links | ||
10 | strip: //a[@class='bluelink'] | ||
11 | # strip image bullets | ||
12 | strip_image_src: ynet_manual_bullet.png | ||
13 | |||
14 | prune: no | ||
15 | tidy: no | ||
16 | |||
17 | # prevent JS issues | ||
18 | find_string: <script type='text/javascript'> | ||
19 | replace_string: <div style="display:none;"> | ||
20 | find_string: </script> | ||
21 | replace_string: </div> | ||
22 | |||
23 | test_url: http://www.ynet.co.il/articles/0,7340,L-4354266,00.html | ||
24 | test_url: http://www.ynet.co.il/articles/0,7340,L-4354268,00.html | ||
25 | #feed | ||
26 | test_url: http://www.ynet.co.il/Integration/StoryRss2.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/yostivanich.com.txt b/inc/3rdparty/site_config/standard/yostivanich.com.txt index 9e24db3c..2aeb7e05 100644..100755 --- a/inc/3rdparty/site_config/standard/yostivanich.com.txt +++ b/inc/3rdparty/site_config/standard/yostivanich.com.txt | |||
@@ -1,5 +1,5 @@ | |||
1 | title://div[@class='entry-title'] | 1 | title://div[@class='entry-title'] |
2 | body://div[@class='entry-content'] | 2 | body://div[@class='entry-content'] |
3 | strip_comments:yes | 3 | strip_comments:yes |
4 | convert_double_br_tags:yes | 4 | convert_double_br_tags:yes |
5 | test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/ \ No newline at end of file | 5 | test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/yourerie.com.txt b/inc/3rdparty/site_config/standard/yourerie.com.txt new file mode 100755 index 00000000..b46b09e8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/yourerie.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class="nxFullTextData"] | ||
2 | test_url: http://yourerie.com/fulltext?nxd_id=306552 | ||
diff --git a/inc/3rdparty/site_config/standard/youtube.com.txt b/inc/3rdparty/site_config/standard/youtube.com.txt index d52b7356..b0d95f1f 100644..100755 --- a/inc/3rdparty/site_config/standard/youtube.com.txt +++ b/inc/3rdparty/site_config/standard/youtube.com.txt | |||
@@ -1,15 +1,15 @@ | |||
1 | title: //title | 1 | title: //title |
2 | body: //iframe | 2 | body: //iframe |
3 | 3 | ||
4 | find_string: <html><iframe | 4 | find_string: <html><iframe |
5 | replace_string: <iframe id="video" | 5 | replace_string: <iframe id="video" |
6 | 6 | ||
7 | find_string: ></iframe></html> | 7 | find_string: ></iframe></html> |
8 | replace_string: ></iframe> | 8 | replace_string: ></iframe> |
9 | 9 | ||
10 | single_page_link: //link[@type='text/xml+oembed'] | 10 | single_page_link: //link[@type='text/xml+oembed'] |
11 | 11 | ||
12 | prune: no | 12 | prune: no |
13 | tidy: no | 13 | tidy: no |
14 | 14 | ||
15 | test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU \ No newline at end of file | 15 | test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/zcommunications.org.txt b/inc/3rdparty/site_config/standard/zcommunications.org.txt new file mode 100755 index 00000000..4deb49bf --- /dev/null +++ b/inc/3rdparty/site_config/standard/zcommunications.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@id='view_title'] | ||
2 | author: //div[contains(@class, 'content_authors')]//a | ||
3 | body: //div[@id='view_body'] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.zcommunications.org/orwellian-language-update-by-edward-s-herman.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zdnet.com.txt b/inc/3rdparty/site_config/standard/zdnet.com.txt index b244b229..939fb0e3 100644..100755 --- a/inc/3rdparty/site_config/standard/zdnet.com.txt +++ b/inc/3rdparty/site_config/standard/zdnet.com.txt | |||
@@ -1,10 +1,10 @@ | |||
1 | title: //h1[@class="h s-1"] | 1 | title: //h1[@class="h s-1"] |
2 | author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|') | 2 | author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|') |
3 | author: substring-after(//div[@class="bio"]//h3, 'About ') | 3 | author: substring-after(//div[@class="bio"]//h3, 'About ') |
4 | date: substring-after(//p[@class="meta s-10"], '|') | 4 | date: substring-after(//p[@class="meta s-10"], '|') |
5 | date: substring-after(//p[@class="meta"], '|') | 5 | date: substring-after(//p[@class="meta"], '|') |
6 | body: //div[@class="content-1 entry space-1 clear"] | 6 | body: //div[@class="content-1 entry space-1 clear"] |
7 | body: //div[@class="storyBody"] | 7 | body: //div[@class="storyBody"] |
8 | 8 | ||
9 | test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920 | 9 | test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920 |
10 | test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/ \ No newline at end of file | 10 | test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/zeit.de.txt b/inc/3rdparty/site_config/standard/zeit.de.txt index 66a7f1ac..9815d478 100644..100755 --- a/inc/3rdparty/site_config/standard/zeit.de.txt +++ b/inc/3rdparty/site_config/standard/zeit.de.txt | |||
@@ -1,44 +1,45 @@ | |||
1 | # 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions | 1 | # 2013.10.30 [rezor92] fixed single_page_link |
2 | # 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section) | 2 | # 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions |
3 | # 2011-12-09 [carlo@...] Removed "related articles" block | 3 | # 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section) |
4 | # 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications. | 4 | # 2011-12-09 [carlo@...] Removed "related articles" block |
5 | # 2011-08-20 [carlo@...] added author, fixed date | 5 | # 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications. |
6 | 6 | # 2011-08-20 [carlo@...] added author, fixed date | |
7 | 7 | ||
8 | single_page_link: //a[@title='Druckversion'] | 8 | |
9 | tidy: no | 9 | single_page_link: //a[@title='Auf einer Seite'] |
10 | 10 | tidy: no | |
11 | title: //title | 11 | |
12 | date: substring-before( //li[@class="date"], " " ) | 12 | title: //title |
13 | author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text() | 13 | date: substring-before( //li[@class="date"], " " ) |
14 | author: substring-after(//li[@class='source first '], 'Quelle: ') | 14 | author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text() |
15 | 15 | author: substring-after(//li[@class='source first '], 'Quelle: ') | |
16 | strip_id_or_class: articleheader | 16 | |
17 | strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"] | 17 | strip_id_or_class: articleheader |
18 | 18 | strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"] | |
19 | #Removes author and date from the start | 19 | |
20 | strip: //ul[@class="tools"] | 20 | #Removes author and date from the start |
21 | #Removes copyright statement - often disturb as first line of the news | 21 | strip: //ul[@class="tools"] |
22 | strip: //p[@class="copyright"] | 22 | #Removes copyright statement - often disturb as first line of the news |
23 | strip: //div[@class="copyright"] | 23 | strip: //p[@class="copyright"] |
24 | #Removes pagination links at the end | 24 | strip: //div[@class="copyright"] |
25 | strip: //div[@class="pagination"] | 25 | #Removes pagination links at the end |
26 | 26 | strip: //div[@class="pagination"] | |
27 | # Fix picture captions | 27 | |
28 | wrap_in(small): //p[@class="caption"]/text() | 28 | # Fix picture captions |
29 | 29 | wrap_in(small): //p[@class="caption"]/text() | |
30 | # Fix sub-headlines | 30 | |
31 | wrap_in(h2): //p/strong | 31 | # Fix sub-headlines |
32 | dissolve: //h2/strong | 32 | wrap_in(h2): //p/strong |
33 | 33 | dissolve: //h2/strong | |
34 | #Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and lead even to problems. These sections are removed here. | 34 | |
35 | strip_id_or_class:"informatives" | 35 | #Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and lead even to problems. These sections are removed here. |
36 | strip_id_or_class:"bottom" | 36 | strip_id_or_class:"informatives" |
37 | strip_id_or_class:"teasermosaic" | 37 | strip_id_or_class:"bottom" |
38 | strip_id_or_class:"comments" | 38 | strip_id_or_class:"teasermosaic" |
39 | strip_id_or_class:"articlefooter af" | 39 | strip_id_or_class:"comments" |
40 | strip_id_or_class:"relateds" | 40 | strip_id_or_class:"articlefooter af" |
41 | strip_id_or_class:"pagination" | 41 | strip_id_or_class:"relateds" |
42 | 42 | strip_id_or_class:"pagination" | |
43 | footnotes: no | 43 | |
44 | test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag \ No newline at end of file | 44 | footnotes: no |
45 | test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag | ||
diff --git a/inc/3rdparty/site_config/standard/zerohedge.com.txt b/inc/3rdparty/site_config/standard/zerohedge.com.txt new file mode 100755 index 00000000..7e76aee5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/zerohedge.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | author: //span[@class='submitted']/a | ||
2 | strip: //div[@class='clear-block clr'] | ||
3 | strip: //div[@class='picture'] | ||
4 | strip: //span[@class='submitted'] | ||
5 | strip: //div[@class='breadcrumb'] | ||
6 | strip: //div[@class='fivestar-static-form-item'] | ||
7 | strip: //div[@class='js-links'] | ||
8 | strip: //div[@class='links clear-block clear'] | ||
9 | strip: //div[@class='block block-block'] | ||
10 | test_url: http://www.zerohedge.com/news/bernankes-columbus-voyage-end-monetary-policy-world \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zerokspot.com.txt b/inc/3rdparty/site_config/standard/zerokspot.com.txt index ea9132aa..afa964db 100644..100755 --- a/inc/3rdparty/site_config/standard/zerokspot.com.txt +++ b/inc/3rdparty/site_config/standard/zerokspot.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: //h1 | 1 | title: //h1 |
2 | body: //div[@id="primarycontent"] | 2 | body: //div[@id="primarycontent"] |
3 | test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/ \ No newline at end of file | 3 | test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/ \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/zhihu.com.txt b/inc/3rdparty/site_config/standard/zhihu.com.txt new file mode 100755 index 00000000..3c9d8c1a --- /dev/null +++ b/inc/3rdparty/site_config/standard/zhihu.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.zhihu.com/question/19587406 | ||
3 | # http://www.zhihu.com/question/20649035 | ||
4 | # http://www.zhihu.com/question/20637942 | ||
5 | |||
6 | author: //h3[@class='zm-item-answer-author-wrap'] | ||
7 | title://h2[@class='zm-item-title'] | ||
8 | date://a[@class='answer-date-link meta-item'] | ||
9 | convert_double_br_tags: yes | ||
10 | |||
11 | wrap_in(blockquote)://div[@class='zm-editable-content'] | ||
12 | wrap_in(blockquote)://sup/text() | ||
13 | dissolve://sup | ||
14 | |||
15 | strip://div[@class='zh-answers-title'] | ||
16 | strip:///div[@class='zm-item-vote-info '] | ||
17 | strip://div[@class='zm-item-answer-author-info'] | ||
18 | strip://div[@class='zu-blue-info-board zg-r3px'] | ||
19 | test_url: http://www.zhihu.com/question/20637942 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zingtrain.com.txt b/inc/3rdparty/site_config/standard/zingtrain.com.txt index 2a2f58a8..188d4dd6 100644..100755 --- a/inc/3rdparty/site_config/standard/zingtrain.com.txt +++ b/inc/3rdparty/site_config/standard/zingtrain.com.txt | |||
@@ -1,3 +1,3 @@ | |||
1 | title: substring-after(id, 'post')/h2 | 1 | title: substring-after(id, 'post')/h2 |
2 | body://div[@class = 'entry'] | 2 | body://div[@class = 'entry'] |
3 | test_url: http://www.zingtrain.com/category/ontrack/january-2007/ \ No newline at end of file | 3 | test_url: http://www.zingtrain.com/category/ontrack/january-2007/ \ No newline at end of file |