diff options
author | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
---|---|---|
committer | tcitworld <thomas.citharet@gmail.com> | 2014-01-04 12:30:31 -0800 |
commit | 7f667839764621b5aa01c9db8ce5dde2a29ef18f (patch) | |
tree | 93d8241ee81c87e18494325ae02f0589a8e328a2 /inc/3rdparty/site_config | |
parent | a84f77d6ba15a64ff00453f5d5190c021ce460ed (diff) | |
parent | 2abcccb37180c17318f5226f5d4bc28f30b621ea (diff) | |
download | wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.gz wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.tar.zst wallabag-7f667839764621b5aa01c9db8ce5dde2a29ef18f.zip |
Merge pull request #1 from inthepoche/dev
Dev
Diffstat (limited to 'inc/3rdparty/site_config')
783 files changed, 7014 insertions, 18 deletions
diff --git a/inc/3rdparty/site_config/README.md b/inc/3rdparty/site_config/README.txt index 0aff456b..e966ee74 100644 --- a/inc/3rdparty/site_config/README.md +++ b/inc/3rdparty/site_config/README.txt | |||
@@ -1,6 +1,6 @@ | |||
1 | Full-Text RSS Site Patterns | 1 | Full-Text RSS Site Patterns |
2 | --------------------------- | 2 | --------------------------- |
3 | 3 | ||
4 | Site patterns allow you to specify what should be extracted from specific sites. | 4 | Site patterns allow you to specify what should be extracted from specific sites. |
5 | 5 | ||
6 | Please see http://help.fivefilters.org/customer/portal/articles/223153-site-patterns for more information. \ No newline at end of file | 6 | Please see http://help.fivefilters.org/customer/portal/articles/223153-site-patterns for more information. \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/custom/dilbert.com.txt b/inc/3rdparty/site_config/custom/dilbert.com.txt new file mode 100644 index 00000000..6c8d95a2 --- /dev/null +++ b/inc/3rdparty/site_config/custom/dilbert.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[contains(@class, 'SB_Title')]//a | ||
2 | body: //div[contains(@class, 'STR_Content')] | ||
3 | |||
4 | test_url: http://dilbert.com/strips/comic/2013-10-22 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/custom/interviewmagazine.com.txt b/inc/3rdparty/site_config/custom/interviewmagazine.com.txt new file mode 100644 index 00000000..a9d4f8ca --- /dev/null +++ b/inc/3rdparty/site_config/custom/interviewmagazine.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //title | ||
2 | body: //div[contains(@class, 'block')] | ||
3 | |||
4 | test_url: http://www.interviewmagazine.com/film/spike-jonze \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/custom/inthepoche.com.txt b/inc/3rdparty/site_config/custom/inthepoche.com.txt deleted file mode 100644 index ede74b97..00000000 --- a/inc/3rdparty/site_config/custom/inthepoche.com.txt +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | title: //title | ||
2 | body: //div[@class='post-content'] | ||
3 | |||
4 | prune: no | ||
5 | tidy: no | ||
6 | |||
7 | test_url: http://www.inthepoche.com/?post/poche-hosting \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/custom/stackexchange.com.txt b/inc/3rdparty/site_config/custom/stackexchange.com.txt new file mode 100755 index 00000000..c9d44b1d --- /dev/null +++ b/inc/3rdparty/site_config/custom/stackexchange.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //title | ||
2 | body: //div[@id='question']//div[contains(@class,'post-text')] | //div[@id='answers-header']//h2 | //div[contains(@class,'accepted-answer')]//div[contains(@class,'post-text')] | ||
3 | |||
4 | test_url: http://cstheory.stackexchange.com/questions/14811/what-is-the-enlightenment-im-supposed-to-attain-after-studying-finite-automata/14818#14818 | ||
diff --git a/inc/3rdparty/site_config/custom/stackoverflow.com.txt b/inc/3rdparty/site_config/custom/stackoverflow.com.txt new file mode 100755 index 00000000..d2eb984d --- /dev/null +++ b/inc/3rdparty/site_config/custom/stackoverflow.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //title | ||
2 | body: //div[@id='question']//div[contains(@class,'post-text')] | //div[@id='answers-header']//h2 | //div[contains(@class,'accepted-answer')]//div[contains(@class,'post-text')] | ||
3 | |||
4 | test_url: http://stackoverflow.com/questions/20302422/calling-a-function-from-a-javascript-object | ||
diff --git a/inc/3rdparty/site_config/index.php b/inc/3rdparty/site_config/index.php index a3d5f739..a1b767fd 100644 --- a/inc/3rdparty/site_config/index.php +++ b/inc/3rdparty/site_config/index.php | |||
@@ -1,3 +1,3 @@ | |||
1 | <?php | 1 | <?php |
2 | // this is here to prevent directory listing over the web | 2 | // this is here to prevent directory listing over the web |
3 | ?> \ No newline at end of file | 3 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/24ways.org.txt b/inc/3rdparty/site_config/standard/24ways.org.txt new file mode 100644 index 00000000..03bd1950 --- /dev/null +++ b/inc/3rdparty/site_config/standard/24ways.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='meta']/h2/a | ||
2 | author: //div[@class='meta']/h2/following-sibling::p/a/text() | ||
3 | date://div[@class='meta']/h2/strong | ||
4 | body: //div[@id='article'] | ||
5 | strip: //div[@class='domore'] | ||
6 | test_url: http://24ways.org/2011/composing-the-new-canon \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/37signals.com.txt b/inc/3rdparty/site_config/standard/37signals.com.txt new file mode 100644 index 00000000..43a10ae5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/37signals.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='post_header']//h2/a | ||
2 | author: //span[@class='author'] | ||
3 | date: //span[@class='date'] | ||
4 | body: //div[@id='Content'] | ||
5 | |||
6 | test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/3quarksdaily.com.txt b/inc/3rdparty/site_config/standard/3quarksdaily.com.txt new file mode 100644 index 00000000..c4e7940f --- /dev/null +++ b/inc/3rdparty/site_config/standard/3quarksdaily.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@class='content'] | ||
2 | date: //div[@class='content']/h2 | ||
3 | strip: //div[@class='content']/h2 | ||
4 | title: //div[@class='content']/h3 | ||
5 | |||
6 | strip: //div[@id='postmenu'] | ||
7 | strip: //div[@class='trackback'] | ||
8 | tidy: no | ||
9 | test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt b/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt new file mode 100644 index 00000000..b846b050 --- /dev/null +++ b/inc/3rdparty/site_config/standard/3voor12.vpro.nl.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@id='main'] | ||
2 | title: //div[@class='intro']/h1 | ||
3 | author: //ul[@class='text-data']/li[@class='author'] | ||
4 | date: //ul[@class='text-data']/li[@class='date'] | ||
5 | convert_double_br_tags: yes | ||
6 | tidy: no | ||
7 | |||
8 | strip: //div[@class='share'] | ||
9 | strip: //*[@class='zoom'] | ||
10 | strip: //div[@id='disqus_thread'] | ||
11 | test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/43folders.com.txt b/inc/3rdparty/site_config/standard/43folders.com.txt new file mode 100644 index 00000000..e8073f6f --- /dev/null +++ b/inc/3rdparty/site_config/standard/43folders.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //*[@class = 'content'] | ||
2 | author: //*[@class = 'submitted']/a | ||
3 | date: substring-after(//*[@class = 'submitted']/text(), '|') | ||
4 | test_url: http://www.43folders.com/2011/04/22/cranking \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/500px.com.txt b/inc/3rdparty/site_config/standard/500px.com.txt new file mode 100644 index 00000000..68e6b2d0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/500px.com.txt | |||
@@ -0,0 +1,27 @@ | |||
1 | # very loose setup for both 500px.com/photo/* and 500px.com/blog/* | ||
2 | # photo page example: http://500px.com/photo/4181666 | ||
3 | # blog page example: http://500px.com/blog/110 | ||
4 | |||
5 | # avoid "no text" error | ||
6 | tidy:no | ||
7 | prune:no | ||
8 | |||
9 | # reorganize photo page elements | ||
10 | #body://div[contains(@class,'container')] | ||
11 | move_into(body)://div[contains(@id,'thephoto')] | ||
12 | move_into(body)://div[contains(@id,'description')] | ||
13 | move_into(body)://div[contains(@id,'tags')] | ||
14 | move_into(body)://div[contains(@id,'photo-info')] | ||
15 | |||
16 | # clean photo page info | ||
17 | strip://span[contains(@id,'copyright')] | ||
18 | strip://*[contains(@id,'store')] | ||
19 | strip://*[contains(@id,'user-info')] | ||
20 | strip://*[contains(@id,'photo-stats')] | ||
21 | strip://*[contains(@id,'voting_controls_container')] | ||
22 | strip://*[contains(@id,'more-photos')] | ||
23 | strip://*[contains(@id,'embed-photo')] | ||
24 | |||
25 | # clean blog page side bar | ||
26 | strip://*[contains(@class,'col d3 clearafter')] | ||
27 | test_url: http://500px.com/photo/3641041?from=editors \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/512pixels.net.txt b/inc/3rdparty/site_config/standard/512pixels.net.txt new file mode 100644 index 00000000..e458980f --- /dev/null +++ b/inc/3rdparty/site_config/standard/512pixels.net.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: substring-before(//title, '—') | ||
2 | test_url: http://512pixels.net/more-on-linked-lists/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/5by5.tv.txt b/inc/3rdparty/site_config/standard/5by5.tv.txt new file mode 100644 index 00000000..dce0df4e --- /dev/null +++ b/inc/3rdparty/site_config/standard/5by5.tv.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //*[@id="episode"] | ||
2 | prune: no | ||
3 | tidy: no | ||
4 | |||
5 | autodetect_next_page: no | ||
6 | strip_id_or_class: player | ||
7 | |||
8 | strip://*[@id="header"] | ||
9 | test_url: http://5by5.tv/buildanalyze/60 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/944.com.txt b/inc/3rdparty/site_config/standard/944.com.txt new file mode 100644 index 00000000..84380e79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/944.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h2[@class='border'] | ||
2 | body: //div[@class='padding'] | ||
3 | |||
4 | convert_double_br_tags: yes | ||
5 | |||
6 | strip: //div[@id='social_sharing'] | ||
7 | strip: //div[@class='socialLinks'] | ||
8 | |||
9 | test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt b/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt new file mode 100644 index 00000000..379592e0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aachener-nachrichten.de.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] | ||
3 | |||
4 | strip_id_or_class: socialshareprivacy1 | ||
5 | strip_id_or_class: zvaFacebookButton | ||
6 | |||
7 | tidy: no | ||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt b/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt new file mode 100644 index 00000000..4d76fac7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aachener-zeitung.de.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")] | ||
3 | |||
4 | strip_id_or_class: socialshareprivacy1 | ||
5 | strip_id_or_class: zvaFacebookButton | ||
6 | |||
7 | tidy: no | ||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/abc.es.txt b/inc/3rdparty/site_config/standard/abc.es.txt new file mode 100644 index 00000000..a99833de --- /dev/null +++ b/inc/3rdparty/site_config/standard/abc.es.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text'] | ||
3 | strip_id_or_class: colB | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/abc.net.au.txt b/inc/3rdparty/site_config/standard/abc.net.au.txt new file mode 100644 index 00000000..5e6269cb --- /dev/null +++ b/inc/3rdparty/site_config/standard/abc.net.au.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1 | ||
2 | author: //div[@class="byline"]/a | ||
3 | date: //span[@class="timestamp"] | ||
4 | |||
5 | strip: //p[@class="topics"] | ||
6 | strip: //h1 | ||
7 | strip: //div[@class="byline"] | ||
8 | strip: //p[@class="published"] | ||
9 | strip: //div[contains(@class,"featured-scroller")] | ||
10 | test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/abcnews.go.com.txt b/inc/3rdparty/site_config/standard/abcnews.go.com.txt new file mode 100644 index 00000000..c515d3e4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/abcnews.go.com.txt | |||
@@ -0,0 +1,27 @@ | |||
1 | title: //h1[@class='headline'] | ||
2 | body: //div[@id='storyText'] | ||
3 | # for video entries | ||
4 | body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')] | ||
5 | author: //div[@class='byline'] | ||
6 | date: //div[@class='date'] | ||
7 | strip: //*[@id='date_partner'] | ||
8 | |||
9 | strip: //div[@class='breadcrumb'] | ||
10 | strip: //div[contains(@class,'show_tools')] | ||
11 | strip: //div[@id='sponsoredByAd'] | ||
12 | strip: //div[contains(@class,'rel_container')] | ||
13 | strip: //p[a[starts-with(@href, 'http://www.twitter.com')]] | ||
14 | strip: //p[a[starts-with(@href, 'http://www.facebook.com')]] | ||
15 | strip: //p[contains(., 'Click here to return to')] | ||
16 | #strip_id_or_class: media | ||
17 | strip_id_or_class: mediaplayer | ||
18 | |||
19 | replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http | ||
20 | |||
21 | prune: no | ||
22 | |||
23 | single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true') | ||
24 | |||
25 | test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744 | ||
26 | # multi-page | ||
27 | test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/accesstoinsight.org.txt b/inc/3rdparty/site_config/standard/accesstoinsight.org.txt new file mode 100644 index 00000000..b5d85079 --- /dev/null +++ b/inc/3rdparty/site_config/standard/accesstoinsight.org.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='H_docTitle'] | ||
2 | |||
3 | body: //div[@id='H_meta' or @id='H_content' or @id='F_footer'] | ||
4 | |||
5 | strip_id_or_class: F_toenail | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/acidcow.com.txt b/inc/3rdparty/site_config/standard/acidcow.com.txt new file mode 100644 index 00000000..60ede6a6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/acidcow.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[starts-with(@id, 'news-id-')] | ||
2 | |||
3 | test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/acquia.com.txt b/inc/3rdparty/site_config/standard/acquia.com.txt new file mode 100644 index 00000000..5ddf542e --- /dev/null +++ b/inc/3rdparty/site_config/standard/acquia.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title://h1[@class="title"] | ||
2 | author://div[@class="submitted"]/span/a | ||
3 | date://div[@class="submitted"]/span | ||
4 | body://div[@class="content-wrapper"] | ||
5 | |||
6 | strip://div[@id="skip-link"] | ||
7 | strip://div[@id="region-content-3-3"] | ||
8 | strip://div[@id="section-footer"] | ||
9 | test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/acroswing.fr.txt b/inc/3rdparty/site_config/standard/acroswing.fr.txt new file mode 100644 index 00000000..57d86d2f --- /dev/null +++ b/inc/3rdparty/site_config/standard/acroswing.fr.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | tidy:no | ||
2 | date: //time[@class='updated'] | ||
3 | dissolve: //ul[@class='video-gallery']/li | ||
4 | dissolve: //ul[@class='video-gallery'] | ||
5 | test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt b/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt new file mode 100644 index 00000000..408e9099 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aht.seriouseats.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body: //div[@id='content'] | ||
2 | |||
3 | # clean up recipe pages | ||
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | ||
5 | |||
6 | #recipe pages | ||
7 | strip_id_or_class: "recipe-feedback" | ||
8 | strip_id_or_class: "comments" | ||
9 | strip_id_or_class: "procedure-number" | ||
10 | strip_id_or_class: "more-with-author" | ||
11 | |||
12 | #slice | ||
13 | strip_id_or_class: "inner" | ||
14 | |||
15 | test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alex.mullr.net.txt b/inc/3rdparty/site_config/standard/alex.mullr.net.txt new file mode 100644 index 00000000..c5f15370 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alex.mullr.net.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class="entry"] | ||
2 | test_url: http://alex.mullr.net/blog/2011/05/on-spotify/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alistapart.com.txt b/inc/3rdparty/site_config/standard/alistapart.com.txt new file mode 100644 index 00000000..090f7eb1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alistapart.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1[@class='title'] | ||
2 | author: //h3[@class='byline']/a | ||
3 | date: //div[@class='ishinfo'] | ||
4 | |||
5 | body: //*[@id='articletext'] | ||
6 | strip_id_or_class: 'ishinfo' | ||
7 | strip_id_or_class: 'metastuff' | ||
8 | strip_id_or_class: 'learnmore' | ||
9 | strip_id_or_class: 'discuss' | ||
10 | |||
11 | prune: no | ||
12 | test_url: http://www.alistapart.com/articles/organizing-mobile/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/aljazeera.com.txt b/inc/3rdparty/site_config/standard/aljazeera.com.txt new file mode 100644 index 00000000..4f0148f4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/aljazeera.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //span[@id='DetailedTitle'] | ||
2 | body: //td[@id='tdTextContent'] | ||
3 | strip_id_or_class: Skyscrapper_Body | ||
4 | date: //span[@id='ctl00_cphBody_lblDate'] | ||
5 | author: //div[@id="dvAuthorInfo"]//a/text() | ||
6 | strip: //table[ tbody/tr/td/object ] | ||
7 | prune: no | ||
8 | test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/allrecipes.com.txt b/inc/3rdparty/site_config/standard/allrecipes.com.txt new file mode 100644 index 00000000..e9767bda --- /dev/null +++ b/inc/3rdparty/site_config/standard/allrecipes.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //h1[@id='itemTitle'] | ||
2 | body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')] | ||
3 | strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right'] | ||
4 | strip: //div[contains(@class, 'rightcoltoolsdiv')] | ||
5 | strip: //div[contains(@class, 'servings-form')] | ||
6 | strip: //p[@class='nutritional-information'] | ||
7 | strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')] | ||
8 | strip: //div[@id='nutri-info']/div[contains(@class, 'title')] | ||
9 | strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter'] | ||
10 | strip_id_or_class: eshaAttribute | ||
11 | strip_id_or_class: eshaParagraph | ||
12 | prune: no | ||
13 | |||
14 | test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/allthingsd.com.txt b/inc/3rdparty/site_config/standard/allthingsd.com.txt new file mode 100644 index 00000000..cd52498f --- /dev/null +++ b/inc/3rdparty/site_config/standard/allthingsd.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title://div[@class="article-title"]/h1[@class="title"] | ||
2 | date: //p[@class="article-date"] | ||
3 | body://*[@class="article-body article-text"] | ||
4 | # Trim out related posts at bottom of article | ||
5 | strip://blockquote[@class="memo"] | ||
6 | |||
7 | # Yup, no idea why author won't work... | ||
8 | author://div[@class="page-header article-header clearfix"]/p[@class="title"] | ||
9 | # [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it. | ||
10 | test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/allyou.com.txt b/inc/3rdparty/site_config/standard/allyou.com.txt new file mode 100644 index 00000000..3c26c682 --- /dev/null +++ b/inc/3rdparty/site_config/standard/allyou.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@id='pageHdr']//h1 | ||
2 | body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint'] | ||
3 | strip: //div[contains(@class, 'infoBox') or @id='infoBox'] | ||
4 | single_page_link: //li[@id='print']/a | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt b/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt new file mode 100644 index 00000000..f5865f89 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alphabeta.argaam.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class = 'entry'] | ||
2 | date: substring-after(//p[@class="date"],'بتاريخ ') | ||
3 | strip_id_or_class: date | ||
4 | strip_id_or_class: follow-single | ||
5 | strip_id_or_class: ratingblock | ||
6 | strip_id_or_class: newRatingHolder | ||
7 | strip_id_or_class: postmetadata | ||
8 | strip_id_or_class: addthis_toolbox | ||
9 | strip_id_or_class: addthis_default_style | ||
10 | strip_id_or_class: size-full | ||
11 | test_url: http://alphabeta.argaam.com/?p=35657 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alriyadh.com.txt b/inc/3rdparty/site_config/standard/alriyadh.com.txt new file mode 100644 index 00000000..d0060000 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alriyadh.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id = "article-view"] | ||
2 | body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')] | ||
3 | author: //p[@class = "author"] | ||
4 | strip: //h1 | ||
5 | strip: //h2 | ||
6 | strip_id_or_class: author | ||
7 | prune: no | ||
8 | test_url: http://www.alriyadh.com/2011/10/10/article674357.html | ||
9 | test_url: http://www.alriyadh.com/net/article/780935 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alseraj.net.txt b/inc/3rdparty/site_config/standard/alseraj.net.txt new file mode 100644 index 00000000..107d82d6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alseraj.net.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: //*[@id='normalfontyellow'] | ||
2 | test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alt1040.com.txt b/inc/3rdparty/site_config/standard/alt1040.com.txt new file mode 100644 index 00000000..4fd45719 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alt1040.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://alt1040.com/2011/09/banda-ancha-en-america-latina-insignificante \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/altfoto.com.txt b/inc/3rdparty/site_config/standard/altfoto.com.txt new file mode 100644 index 00000000..d974cf4a --- /dev/null +++ b/inc/3rdparty/site_config/standard/altfoto.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt b/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt new file mode 100644 index 00000000..7fd47193 --- /dev/null +++ b/inc/3rdparty/site_config/standard/alumni.stanford.edu.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ") | ||
4 | |||
5 | date: //div/a[contains (@href, "issue")] | ||
6 | |||
7 | move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1] | ||
8 | |||
9 | body: //div[@class="enableBullets"] | ||
10 | test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/amazon.com.txt b/inc/3rdparty/site_config/standard/amazon.com.txt new file mode 100644 index 00000000..1a23c4b7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/amazon.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //span[@id = 'btAsinTitle'] | ||
2 | body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div | ||
3 | #strip_id_or_class: quantityDropdownDiv | ||
4 | #strip_id_or_class: addToCartSpan | ||
5 | #strip_id_or_class: oneClickDiv | ||
6 | strip_id_or_class: nocontent | ||
7 | strip_id_or_class: masDynamicConten | ||
8 | strip_id_or_class: dynamic-content | ||
9 | prune: no | ||
10 | |||
11 | find_string: <span id="actualPriceValue"> | ||
12 | replace_string: <span id="actualPriceValue"><br />Price: | ||
13 | |||
14 | strip_id_or_class: collapsePS | ||
15 | strip_id_or_class: expandPS | ||
16 | strip_id_or_class: psPlaceHolde | ||
17 | strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')] | ||
18 | |||
19 | test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/americandrink.net.txt b/inc/3rdparty/site_config/standard/americandrink.net.txt new file mode 100644 index 00000000..dee0e868 --- /dev/null +++ b/inc/3rdparty/site_config/standard/americandrink.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='head']/h2/a | ||
2 | author: //div[@class='head']/a | ||
3 | date: //div[@class='head']/p[@class='date']/a | ||
4 | body: //div[@class='copy'] | ||
5 | strip: //p[@class='meta'] | ||
6 | test_url: http://americandrink.net/post/10567188712/free-the-hooch \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/americascup.com.txt b/inc/3rdparty/site_config/standard/americascup.com.txt new file mode 100644 index 00000000..b1673b6a --- /dev/null +++ b/inc/3rdparty/site_config/standard/americascup.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@class="editorial-content"]/h3 | ||
2 | body: //div[@class="hero-image" or @class="editorial-content"] | ||
3 | |||
4 | strip: //ul[@class="hero-caption"] | ||
5 | strip_id_or_class: footer | ||
6 | |||
7 | prune: no | ||
8 | tidy: no | ||
9 | |||
10 | test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt b/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt new file mode 100644 index 00000000..8bf31ec2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/americastestkitchenfeed.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1[@class="post-title"] | ||
2 | author: //span[@class="author"]/a | ||
3 | date: //span[@class="date"] | ||
4 | body: //div[@class="post-content main"] | ||
5 | test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/anandtech.com.txt b/inc/3rdparty/site_config/standard/anandtech.com.txt new file mode 100644 index 00000000..8067e03c --- /dev/null +++ b/inc/3rdparty/site_config/standard/anandtech.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | author: //a[@class='b'][1] | ||
2 | date: substring-after(substring-before(//div, 'Posted in'), ' on ') | ||
3 | strip_image_src: /content/images/globals/ | ||
4 | strip: //h2[. = 'Page 1']/preceding::p | ||
5 | strip: //h2 | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/')) | ||
10 | |||
11 | test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/andyrutledge.com.txt b/inc/3rdparty/site_config/standard/andyrutledge.com.txt new file mode 100644 index 00000000..f9ffd3c3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/andyrutledge.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h2 | ||
2 | author: string('Andy Rutledge') | ||
3 | date: //div[@class='articledate'] | ||
4 | body: //div[@class='copybody'] | ||
5 | |||
6 | strip: //*[@class='space'] | ||
7 | strip: //*[@class='articleFoot'] | ||
8 | |||
9 | test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt b/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt new file mode 100644 index 00000000..a5c7c08a --- /dev/null +++ b/inc/3rdparty/site_config/standard/annatravelling.wordpress.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h1[@class="title"] | ||
2 | |||
3 | author: ("Anna Manasova") | ||
4 | # is ignored, unfortunately | ||
5 | |||
6 | date: //p[@class="date"] | ||
7 | |||
8 | body: //div[@class="entry"] | ||
9 | test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/applature.com.txt b/inc/3rdparty/site_config/standard/applature.com.txt new file mode 100644 index 00000000..a78a6150 --- /dev/null +++ b/inc/3rdparty/site_config/standard/applature.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //h1[contains(@class, 'title')] | ||
2 | body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer'] | ||
3 | date: //div[@class='date'] | ||
4 | |||
5 | strip_id_or_class: sharethis | ||
6 | strip_id_or_class: stats | ||
7 | strip_id_or_class: apply_form | ||
8 | strip_id_or_class: job_map | ||
9 | strip_id_or_class: respond | ||
10 | strip: //h1//span[@class='type'] | ||
11 | strip: //li[@class='print' or @class='map'] | ||
12 | |||
13 | replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla | ||
14 | |||
15 | prune: no | ||
16 | tidy: no | ||
17 | |||
18 | test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/apple.com.txt b/inc/3rdparty/site_config/standard/apple.com.txt new file mode 100644 index 00000000..4c483955 --- /dev/null +++ b/inc/3rdparty/site_config/standard/apple.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | strip: //p[@class='sosumi'] | ||
2 | # Aren't they witty? | ||
3 | |||
4 | # I can't work out what causes the  before the title. | ||
5 | title: //h1[@class='title'] | ||
6 | strip: //h1[@class='title'] | ||
7 | test_url: http://www.apple.com/pr/library/2011/02/15appstore.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/appleinsider.com.txt b/inc/3rdparty/site_config/standard/appleinsider.com.txt new file mode 100644 index 00000000..279fbce1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/appleinsider.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //p[@class='title'] | ||
2 | |||
3 | author: //p[text() = 'By ']/a/text() | ||
4 | strip: //p[text() = 'By '] | ||
5 | |||
6 | body: //td[@class='bod'] | ||
7 | strip_id_or_class: title | ||
8 | strip_id_or_class: minor | ||
9 | |||
10 | strip_id_or_class: multipagefooter | ||
11 | test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/appleweblog.com.txt b/inc/3rdparty/site_config/standard/appleweblog.com.txt new file mode 100644 index 00000000..023c9ccb --- /dev/null +++ b/inc/3rdparty/site_config/standard/appleweblog.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://appleweblog.com/2011/09/encontrada-vulnerabilidad-grave-en-skype-para-ios \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/archdaily.com.txt b/inc/3rdparty/site_config/standard/archdaily.com.txt new file mode 100644 index 00000000..9476cf56 --- /dev/null +++ b/inc/3rdparty/site_config/standard/archdaily.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | date: //div[@class='post_date'] | ||
2 | |||
3 | body: //div[@class='post_content'] | ||
4 | |||
5 | test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/archiveofourown.org.txt b/inc/3rdparty/site_config/standard/archiveofourown.org.txt new file mode 100644 index 00000000..50ff632d --- /dev/null +++ b/inc/3rdparty/site_config/standard/archiveofourown.org.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | # Description: Fix XPaths to include ALL chapters on 'view_full_work' pages. | ||
2 | # Include: work meta, summary, chapter information, and notes which Instapaper strips out by default. | ||
3 | # Exclude: header, footer, navigation, comments. | ||
4 | # Notes: User is a newbie with XPaths. | ||
5 | |||
6 | title: //h2[@class='title'] | ||
7 | author: //h3[@class='byline'] | ||
8 | author: //a[@class='login author'] | ||
9 | |||
10 | strip_id_or_class:header | ||
11 | strip_id_or_class:navigation | ||
12 | strip_id_or_class:feedback | ||
13 | strip_id_or_class:kudos | ||
14 | strip_id_or_class:add_comment_placeholder | ||
15 | strip_id_or_class:add_comment | ||
16 | strip_id_or_class:globalize | ||
17 | strip_id_or_class:footer | ||
18 | test_url: http://archiveofourown.org/works/229402?view_full_work=true \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/arstechnica.com.txt b/inc/3rdparty/site_config/standard/arstechnica.com.txt new file mode 100644 index 00000000..49bb3dbc --- /dev/null +++ b/inc/3rdparty/site_config/standard/arstechnica.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | author: //p[@class='byline']/a | ||
2 | body: //div[contains(@class,'article-content')] | ||
3 | strip: //h2[@class='title'] | ||
4 | strip_id_or_class: byline | ||
5 | prune: no | ||
6 | |||
7 | date: //div[@class='byline']/span[@class='posted']//abbr/@original-title | ||
8 | date: //div[@class='byline']/span[@class='posted']//abbr | ||
9 | |||
10 | title: //div[@id='story']//h2[@class='title'] | ||
11 | |||
12 | strip: //div[@class='pager'] | ||
13 | next_page_link: //nav//a[span/@class='next']/@href | ||
14 | |||
15 | test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars | ||
16 | test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/articles.boston.com.txt b/inc/3rdparty/site_config/standard/articles.boston.com.txt new file mode 100644 index 00000000..e54423be --- /dev/null +++ b/inc/3rdparty/site_config/standard/articles.boston.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1 | ||
2 | author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ") | ||
3 | date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"] | ||
4 | |||
5 | strip_id_or_class: mod-pagination | ||
6 | test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/articles.courant.com.txt b/inc/3rdparty/site_config/standard/articles.courant.com.txt new file mode 100644 index 00000000..a08f2041 --- /dev/null +++ b/inc/3rdparty/site_config/standard/articles.courant.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1 | ||
2 | date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"] | ||
3 | author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3] | ||
4 | |||
5 | strip_id_or_class: mod-article-byline | ||
6 | strip_id_or_class: mod-article-header | ||
7 | strip_id_or_class: mod-article-subtitle | ||
8 | #This leaves some crud after the article, but it's better than nothing. | ||
9 | #It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element. | ||
10 | |||
11 | test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/asahi.com.txt b/inc/3rdparty/site_config/standard/asahi.com.txt new file mode 100644 index 00000000..2562edb9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/asahi.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='HeadLine'] | ||
2 | strip: //div[@id='utility_right'] | ||
3 | test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ascarter.net.txt b/inc/3rdparty/site_config/standard/ascarter.net.txt new file mode 100644 index 00000000..5236d09e --- /dev/null +++ b/inc/3rdparty/site_config/standard/ascarter.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1[@class='article_title'] | ||
2 | author: //span[@class='author'] | ||
3 | date: //h2[@class='dateline'] | ||
4 | body: //div[@class='article_body'] | ||
5 | test_url: http://ascarter.net/2012/02/20/enough-is-enough.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/astronews.com.txt b/inc/3rdparty/site_config/standard/astronews.com.txt new file mode 100644 index 00000000..33e8153d --- /dev/null +++ b/inc/3rdparty/site_config/standard/astronews.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //span[@class='titel'] | ||
2 | author: //span[@class='metadaten_C']/a//span[@class='metadaten_C'] | ||
3 | date: substring-after(//span[@class='metadaten_C'],'astronews.com') | ||
4 | strip: //span[@class='bu'] | ||
5 | strip_image_src: '/_images/' | ||
6 | |||
7 | test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/asymco.com.txt b/inc/3rdparty/site_config/standard/asymco.com.txt new file mode 100644 index 00000000..adad5f18 --- /dev/null +++ b/inc/3rdparty/site_config/standard/asymco.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | # Johannes Stühler | ||
2 | |||
3 | title://h2 | ||
4 | author://span[@class='meta-content'] | ||
5 | date://abbr[@class='date published']/@title | ||
6 | body://div[@class='entry-content'] | ||
7 | |||
8 | test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/autoblog.com.txt b/inc/3rdparty/site_config/standard/autoblog.com.txt new file mode 100644 index 00000000..58681bf9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/autoblog.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | prune: no | ||
2 | body: //div[@class='post-body'] | ||
3 | author: //p[@class='byline']//a | ||
4 | date: substring-after(//div[@class='about']/p[2], 'Posted') | ||
5 | strip: //div[@class='body']/div[@class='meta'] | ||
6 | test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/avclub.com.txt b/inc/3rdparty/site_config/standard/avclub.com.txt new file mode 100644 index 00000000..776ee108 --- /dev/null +++ b/inc/3rdparty/site_config/standard/avclub.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //*[@id="article_wrapper"]/div[1]/a[1] | ||
2 | body: //*[@id="article_wrapper"]/div[2] | ||
3 | date: //*[@id="article_wrapper"]/div[1]/text()[2] | ||
4 | test_url: http://www.avclub.com/articles/forgetmenot,70904 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/baltimoresun.com.txt b/inc/3rdparty/site_config/standard/baltimoresun.com.txt new file mode 100644 index 00000000..32adff8d --- /dev/null +++ b/inc/3rdparty/site_config/standard/baltimoresun.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | single_page_link: //div[@class='toppaginate']//a[@rel='nofollow'] | ||
2 | convert_double_br_tags: yes | ||
3 | |||
4 | title: //div[@class="story"]/h1 | ||
5 | body: //div[@id="story-body-text"] | ||
6 | author: //span[@class="byline"] | ||
7 | date: //p[@class="date"] | ||
8 | |||
9 | strip: //*[@class='all'] | ||
10 | strip: //*[@class='articlerail'] | ||
11 | |||
12 | test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/basicthinking.de.txt b/inc/3rdparty/site_config/standard/basicthinking.de.txt new file mode 100644 index 00000000..ab583145 --- /dev/null +++ b/inc/3rdparty/site_config/standard/basicthinking.de.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h2 | ||
2 | date: //span[@class='date'] | ||
3 | body: //div[@class='entry'] | ||
4 | |||
5 | strip: //div[@class='zusatz'] | ||
6 | |||
7 | test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bb.is.txt b/inc/3rdparty/site_config/standard/bb.is.txt new file mode 100644 index 00000000..eaafaf18 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bb.is.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20) | ||
2 | |||
3 | |||
4 | date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12) | ||
5 | |||
6 | |||
7 | body: //div[@class='first-article-big'] | ||
8 | strip: //table[@class='newsimagecontainer'] | ||
9 | strip: //h3[@class='headlines'] | ||
10 | strip: //iframe[@class='headlines'] | ||
11 | strip: //a[@class='newslink'] | ||
12 | convert_double_br_tags: yes | ||
13 | test_url: http://bb.is/Pages/82?NewsID=174119 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bbc.co.uk.txt b/inc/3rdparty/site_config/standard/bbc.co.uk.txt new file mode 100644 index 00000000..9c5c3419 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bbc.co.uk.txt | |||
@@ -0,0 +1,32 @@ | |||
1 | body: //div[@class="story-body"] | ||
2 | title: //h1[@class="story-header"] | ||
3 | date: //span[@class="story-date"]/span[@class='date'] | ||
4 | |||
5 | # recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055 | ||
6 | body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1'] | ||
7 | |||
8 | #strip: //div[@class="story-feature narrow"] | ||
9 | #strip: //div[@class="story-feature wide"] | ||
10 | #strip: //div[@class="story-feature dslideshow-enclosure"] | ||
11 | strip: //div[contains(@class, "story-feature")] | ||
12 | strip: //span[@class="story-date"] | ||
13 | #strip: //div[@class="caption body-narrow-width"] | ||
14 | strip: //div[@class="warning"]//p | ||
15 | strip: //div[@id='page-bookmark-links-head'] | ||
16 | strip: //object | ||
17 | strip: //div[contains(@class, "bbccom_advert_placeholder")] | ||
18 | strip: //div[contains(@class, "embedded-hyper")] | ||
19 | strip: //div[contains(@class, 'market-data')] | ||
20 | strip: //a[contains(@class, 'hidden')] | ||
21 | strip: //div[contains(@class, 'hypertabs')] | ||
22 | strip: //div[contains(@class, 'related')] | ||
23 | strip: //form[@id='comment-form'] | ||
24 | strip: //div[contains(@class, 'comment-introduction')] | ||
25 | |||
26 | replace_string(<noscript>): <div> | ||
27 | replace_string(</noscript>): </div> | ||
28 | |||
29 | prune: no | ||
30 | |||
31 | dissolve: //h2 | ||
32 | test_url: http://www.bbc.co.uk/news/business-15060862 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/benoitmaison.org.txt b/inc/3rdparty/site_config/standard/benoitmaison.org.txt new file mode 100644 index 00000000..f341d593 --- /dev/null +++ b/inc/3rdparty/site_config/standard/benoitmaison.org.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | body: //div[@class="entry-content"] | ||
2 | |||
3 | # Remove text ‘Tweet’ | ||
4 | strip: //div[@class="entry-content"]/div[last()] | ||
5 | |||
6 | title: //h1[@class="entry-title"] | ||
7 | |||
8 | # If the Instapaper text parser worked with HTML5 tags, we would use: | ||
9 | date: //time[@class="entry-date"] | ||
10 | |||
11 | # But since it does not, use this more complicated rule: | ||
12 | date: //div[@class="entry-meta"]/a[@rel="bookmark"] | ||
13 | |||
14 | # Unfortunately, the following rule is overridden by the automatically found author. | ||
15 | author: ("Benoit Maison") | ||
16 | test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/berlingske.dk.txt b/inc/3rdparty/site_config/standard/berlingske.dk.txt new file mode 100644 index 00000000..607c998d --- /dev/null +++ b/inc/3rdparty/site_config/standard/berlingske.dk.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1[@class='headline'] | ||
2 | body: //div[contains(@class, 'article-wrapper')] | ||
3 | test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/betabeat.com.txt b/inc/3rdparty/site_config/standard/betabeat.com.txt new file mode 100644 index 00000000..7815cf26 --- /dev/null +++ b/inc/3rdparty/site_config/standard/betabeat.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class="entry-content"] | ||
2 | test_url: http://www.betabeat.com/2011/07/04/sheryl-sandberg-breaks-through-silicon-valleys-boys-club-sort-of/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/betanews.com.txt b/inc/3rdparty/site_config/standard/betanews.com.txt new file mode 100644 index 00000000..0eaf085e --- /dev/null +++ b/inc/3rdparty/site_config/standard/betanews.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | # some articles at this site, like this one, don't | ||
2 | # seem to pick up the article body via normal | ||
3 | # processing, other articles come through fine | ||
4 | # http://www.betanews.com/joewilcox/article | ||
5 | # /Google-is-a-marketing-sensation/1309708375 | ||
6 | body: //*[@id="article"] | ||
7 | test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/biography.com.txt b/inc/3rdparty/site_config/standard/biography.com.txt new file mode 100644 index 00000000..dc071299 --- /dev/null +++ b/inc/3rdparty/site_config/standard/biography.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[contains(@class, 'main-content')]//h1 | ||
2 | body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')] | ||
7 | |||
8 | test_url: http://www.biography.com/print/profile/martin-luther-9389283 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bitelia.com.txt b/inc/3rdparty/site_config/standard/bitelia.com.txt new file mode 100644 index 00000000..7bffae93 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bitelia.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://bitelia.com/2011/09/klout-midiendo-influencia \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bjango.com.txt b/inc/3rdparty/site_config/standard/bjango.com.txt new file mode 100644 index 00000000..6cb04631 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bjango.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@class='articlehead'] | ||
2 | body: //div[@class='column'] | ||
3 | strip: //h1 | ||
4 | strip: //div[@class='help'] | ||
5 | |||
6 | #no author or date/time provided in current layout | ||
7 | test_url: http://bjango.com/articles/actions/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.arsln.org.txt b/inc/3rdparty/site_config/standard/blog.arsln.org.txt new file mode 100644 index 00000000..1f43f490 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.arsln.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | tidy: no | ||
2 | prune: no | ||
3 | date: //article/header/h6/time | ||
4 | title: //article/header/h3 | ||
5 | author: //meta[@name='author']/@content | ||
6 | body: //article//post | ||
7 | |||
8 | test_url: http://blog.arsln.org/aska-ayip-oluyor/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt b/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt new file mode 100644 index 00000000..81c3bda6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.asmartbear.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //title | ||
2 | author: //span[@class='author vcard']/a | ||
3 | date: //p[@class='headline_meta']/abbr[@class='published'] | ||
4 | body: //div[@class='format_text entry-content'] | ||
5 | |||
6 | strip: //div[@id='dd_ajax_float'] | ||
7 | test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt b/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt new file mode 100644 index 00000000..a4c5aaea --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.cloudflare.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | # Instapaper gets this back to front and only gets the blog title instead of the article title. | ||
2 | title: substring-before(//title, '-') | ||
3 | |||
4 | author: //a[ contains(@href, '/people') ] | ||
5 | |||
6 | body: //div[ @class='post' ] | ||
7 | |||
8 | # Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous. | ||
9 | test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.fefe.de.txt b/inc/3rdparty/site_config/standard/blog.fefe.de.txt new file mode 100644 index 00000000..92272b70 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.fefe.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h2 | ||
2 | date: //h3 | ||
3 | body: //ul | ||
4 | |||
5 | test_url: http://blog.fefe.de/?ts=b063bf55 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.instagram.com.txt b/inc/3rdparty/site_config/standard/blog.instagram.com.txt new file mode 100644 index 00000000..3065dd80 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.instagram.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # clean Instagram blog a little bit | ||
2 | |||
3 | tidy:no | ||
4 | prune:no | ||
5 | |||
6 | body://div[contains(@id,'content')] | ||
7 | |||
8 | strip_id_or_class:meta | ||
9 | strip_id_or_class:notes | ||
10 | strip_id_or_class:pagination | ||
11 | test_url: http://blog.instagram.com/post/8757832007/fromwhereistand \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt b/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt new file mode 100644 index 00000000..4e467fe9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.jaysalvat.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | date: //span[contains(@class, 'date-links')] | ||
2 | author: //span[contains(@class, 'author-links')] | ||
3 | body: //div[contains(@class, 'entry-content')] | ||
4 | test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt b/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt new file mode 100644 index 00000000..ac18ad15 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.kaelig.fr.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //*[contains(@class, 'post_content')] | ||
2 | author: string('Kaelig Deloumeau-Prigent') | ||
3 | title: //h1[@class='title'] | ||
4 | date: //span[@class='date'] | ||
5 | test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.naver.com.txt b/inc/3rdparty/site_config/standard/blog.naver.com.txt new file mode 100644 index 00000000..702789ad --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.naver.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //span[@class='pcol1 itemSubjectBoldfont'] | ||
2 | body: //div[@id='postListBody'] | ||
3 | date: //p[@class='date fil5 pcol2'] | ||
4 | single_page_link: /html/frameset/frame[1]/attribute::src | ||
5 | strip: //div[@class='post-btn'] | ||
6 | test_url: http://blog.naver.com/how2invest/110135068757 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.pchome.net.txt b/inc/3rdparty/site_config/standard/blog.pchome.net.txt new file mode 100644 index 00000000..3089001e --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.pchome.net.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | # PCHOME blog, a popular Chinese blog host | ||
2 | # Oct 15, 2011 | ||
3 | # | ||
4 | |||
5 | title://*[contains(@class,'imp')]/h2 | ||
6 | |||
7 | date://*[contains(@class,'imp')]/span | ||
8 | body://div[contains(@id,'blog_content')] | ||
9 | |||
10 | |||
11 | |||
12 | test_url: http://blog.pchome.net/article/462502.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.pinboard.in.txt b/inc/3rdparty/site_config/standard/blog.pinboard.in.txt new file mode 100644 index 00000000..b7afe455 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.pinboard.in.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //a[@class="blog_title"] | ||
2 | date: //p[@class="when"]/a | ||
3 | body: //div[@class="blog_entry"] | ||
4 | strip_id_or_class:blog_title | ||
5 | strip_id_or_class:when | ||
6 | test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt b/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt new file mode 100644 index 00000000..acb9ce81 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.sina.com.cn.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | # Sina blog, the most popular blog host in China. | ||
2 | # Its source code is horrible. | ||
3 | # | ||
4 | # Issue: | ||
5 | # Only the first image in the article is displayed. | ||
6 | # The remaining images are replaced with a 1x1 transparent gif by the sina blog host. | ||
7 | # | ||
8 | |||
9 | title://*[contains(@class,'titName SG_txta')] | ||
10 | author://*[contains(@id,'ownernick')] | ||
11 | date://*[contains(@class,'time SG_txtc')] | ||
12 | body://div[contains(@class,'articalContent')] | ||
13 | |||
14 | # Remove redundant content whose span class starts with "MASS" | ||
15 | # Example <span class="MASSf21674ffeef7"></span> | ||
16 | strip://span[contains(@class,'MASS')] | ||
17 | |||
18 | # Remove comment | ||
19 | strip://div[contains(@class,'allComm')] | ||
20 | |||
21 | # Remove hidden text and link | ||
22 | strip://ins | ||
23 | |||
24 | tidy:no | ||
25 | convert_double_br_tags:yes | ||
26 | test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.spu.edu.txt b/inc/3rdparty/site_config/standard/blog.spu.edu.txt new file mode 100644 index 00000000..68bd4e39 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.spu.edu.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body://div[@class='post'] | ||
2 | test_url: http://blog.spu.edu/lectio/from-the-frying-pan-into-the-fire/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blog.wells.ee.txt b/inc/3rdparty/site_config/standard/blog.wells.ee.txt new file mode 100644 index 00000000..8c8b3838 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blog.wells.ee.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2/a[@class="no-link title"] | ||
2 | author: //h2[@id="blog_owner"] | ||
3 | date: //time | ||
4 | strip: //h2/a[@class="no-link title"] | ||
5 | test_url: http://blog.wells.ee/retina | ||
6 | test_url: http://blog.wells.ee/skeuomorphism \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt b/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt new file mode 100644 index 00000000..f630127b --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.aljazeera.net.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | # 2011-08-23 [carlo@...] Initial version. | ||
2 | |||
3 | author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text() | ||
4 | |||
5 | # why yes, I do feel a bit dirty | ||
6 | date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " ) | ||
7 | |||
8 | test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.forbes.com.txt b/inc/3rdparty/site_config/standard/blogs.forbes.com.txt new file mode 100644 index 00000000..86580d21 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.forbes.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='entry'] | ||
2 | test_url: http://blogs.forbes.com/adamhartung/2011/04/08/apple-is-better-managed-than-microsoft/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.hbr.org.txt b/inc/3rdparty/site_config/standard/blogs.hbr.org.txt new file mode 100644 index 00000000..3664d16c --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.hbr.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[@id='pageFeature']/h1 | ||
2 | body: //div[@id='articleBody'] | ||
3 | strip: //div[@class='module wide'] | ||
4 | test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.msdn.com.txt b/inc/3rdparty/site_config/standard/blogs.msdn.com.txt new file mode 100644 index 00000000..3d3ec020 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.msdn.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h3[@class="post-name"] | ||
2 | author: //span[@class="user-name"] | ||
3 | date: //div[@class="post-date"] | ||
4 | body: //div[@class="post-content user-defined-markup"] | ||
5 | footnotes: no | ||
6 | test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.reuters.com.txt b/inc/3rdparty/site_config/standard/blogs.reuters.com.txt new file mode 100644 index 00000000..6907bcb2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.reuters.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //div[@id='single']/h1 | ||
2 | body: //div[@id='postcontent'] | ||
3 | test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt b/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt new file mode 100644 index 00000000..a7d15081 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.scientificamerican.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | # meta data | ||
2 | title://h1[@class = 'postTitle'] | ||
3 | author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|') | ||
4 | date://span[@class = 'datestamp'] | ||
5 | |||
6 | #body content | ||
7 | body://div[@id = 'singleBlogPost'] | ||
8 | |||
9 | #reclaim author info | ||
10 | move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv'] | ||
11 | strip://p[@class = 'moreLink mobileHide'] | ||
12 | |||
13 | #cleanup comments, there might be some open <div> sections | ||
14 | strip://div[@id = 'comments2'] | ||
15 | strip://h3[a[@href = '#add-comment']] | ||
16 | test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt b/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt new file mode 100644 index 00000000..ba8bc6e7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.smithsonianmag.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | # metadata | ||
2 | author://div[@class = 'post']/div[@class='meta']/a[1] | ||
3 | date://div[@id = 'rap']/h2[1] | ||
4 | body://div[@class = 'post'] | ||
5 | |||
6 | # wrapping caption and image | ||
7 | wrap_in(fieldset)://div[contains(@class, 'wp-caption')] | ||
8 | |||
9 | |||
10 | # clean up | ||
11 | strip://div[@class = 'post']/h3[@class = 'storytitle'] | ||
12 | strip://div[@class = 'post']/div[@class = 'social'] | ||
13 | strip://img[@style = 'display:none;'] | ||
14 | strip://img[@height='0' and @width='0'] | ||
15 | test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/blogs.technet.com.txt b/inc/3rdparty/site_config/standard/blogs.technet.com.txt new file mode 100644 index 00000000..a2909fd1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/blogs.technet.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h3[@class="post-name"] | ||
2 | author: //span[@class="user-name"] | ||
3 | date: //div[@class="post-date"] | ||
4 | body: //div[@class="post-content user-defined-markup"] | ||
5 | footnotes: no | ||
6 | test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bluetouff.com.txt b/inc/3rdparty/site_config/standard/bluetouff.com.txt new file mode 100644 index 00000000..fbe7a5c6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bluetouff.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body://div[@class='entry'] | ||
2 | date://div[@class='meta'] | ||
3 | strip://a[@class='FlattrButton'] | ||
4 | test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/boagworld.com.txt b/inc/3rdparty/site_config/standard/boagworld.com.txt new file mode 100644 index 00000000..91e48fdb --- /dev/null +++ b/inc/3rdparty/site_config/standard/boagworld.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class="entry-title"][2] | ||
2 | author: string("Paul Boag") | ||
3 | date: substring(//span[@class="meta"], 11) | ||
4 | body: //article | ||
5 | strip: //h2 | ||
6 | strip: //h1 | ||
7 | strip: //div[@id="callsToAction"] | ||
8 | test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/boingboing.net.txt b/inc/3rdparty/site_config/standard/boingboing.net.txt new file mode 100644 index 00000000..9169e8fb --- /dev/null +++ b/inc/3rdparty/site_config/standard/boingboing.net.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # This is far from perfect, but so is BoingBoing's markup | ||
2 | title: //h2[@class="headline"] | ||
3 | single_page_link: //h2[@class="headline"]/a | ||
4 | #date: //p[@class="byline"] | ||
5 | body: //div[@class="post"] | ||
6 | |||
7 | strip_id_or_class: shareMe | ||
8 | strip_id_or_class: authorbox | ||
9 | strip_id_or_class: byline | ||
10 | |||
11 | test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt b/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt new file mode 100644 index 00000000..4cc49043 --- /dev/null +++ b/inc/3rdparty/site_config/standard/boldizsar.palotas.eu.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h2[@class='entry-title'] | ||
2 | body: //div[@class='entry-content'] | ||
3 | test_url: http://boldizsar.palotas.eu/blog/?p=1394 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/book.douban.com.txt b/inc/3rdparty/site_config/standard/book.douban.com.txt new file mode 100644 index 00000000..8b958562 --- /dev/null +++ b/inc/3rdparty/site_config/standard/book.douban.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //span[@property='v:description'] | ||
2 | date: //span[@property='v:dtreviewed'] | ||
3 | author: //span[@property='v:reviewer'] | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://book.douban.com/review/2422662/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bookforum.com.txt b/inc/3rdparty/site_config/standard/bookforum.com.txt new file mode 100644 index 00000000..331f415e --- /dev/null +++ b/inc/3rdparty/site_config/standard/bookforum.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | #metadata | ||
2 | title://div[@class = 'Topper']/h1 | ||
3 | author://div[@class = 'Topper']/h3 | ||
4 | date://div[@class = 'Topper']/h6 | ||
5 | body://div[@class = 'Core'] | ||
6 | |||
7 | |||
8 | |||
9 | # clean up | ||
10 | strip://div[@class = 'Topper']/h1 | ||
11 | strip://div[@class = 'Topper']/h3 | ||
12 | strip://div[@class = 'Topper']/h4 | ||
13 | strip://div[@class = 'Topper']/h5 | ||
14 | strip://div[@class = 'Topper']/h6 | ||
15 | strip://br[@clear = 'all'] | ||
16 | strip://div[@class = 'adCore'] | ||
17 | strip://div[@class = 'BookR'] | ||
18 | strip://div[@class = 'InfoBox'] | ||
19 | test_url: http://bookforum.com/inprint/018_04/8595 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/borderhouseblog.com.txt b/inc/3rdparty/site_config/standard/borderhouseblog.com.txt new file mode 100644 index 00000000..190738d5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/borderhouseblog.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title://h1 | ||
2 | author://div[@class="meta"]/span/a | ||
3 | date://div[@class="date"] | ||
4 | body://div[@class="content article"] | ||
5 | strip://div[@class="content article"]/h1 | ||
6 | |||
7 | test_url: http://borderhouseblog.com/?p=7832 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bostonglobe.com.txt b/inc/3rdparty/site_config/standard/bostonglobe.com.txt new file mode 100644 index 00000000..d3e6f43f --- /dev/null +++ b/inc/3rdparty/site_config/standard/bostonglobe.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | # NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com. | ||
2 | |||
3 | title: //div[@class="header"]/h1 | ||
4 | author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ") | ||
5 | date: //div[@class="byline"]/p[last()] | ||
6 | body: //div[@class="article-body"] | ||
7 | |||
8 | strip_id_or_class: aside | ||
9 | strip_id_or_class: promo | ||
10 | strip_id_or_class: skip-nav | ||
11 | strip_id_or_class: article-more | ||
12 | strip_id_or_class: article-bar | ||
13 | |||
14 | # This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed. | ||
15 | strip_id_or_class: figure | ||
16 | test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bostonreview.net.txt b/inc/3rdparty/site_config/standard/bostonreview.net.txt new file mode 100644 index 00000000..68567012 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bostonreview.net.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | #basics | ||
2 | title://h3[@class = 'article_title'] | ||
3 | date://span[@class = 'article_date'] | ||
4 | body://div[@id = 'center_column_article'] | ||
5 | #correct, but author not being picked up in preview | ||
6 | author://span[@class = 'article_author'] | ||
7 | |||
8 | #strips basics from article | ||
9 | strip_id_or_class:article_title | ||
10 | strip_id_or_class:article_date | ||
11 | strip_id_or_class:article_author | ||
12 | |||
13 | #strips pull quotes | ||
14 | strip_id_or_class:pull_quote | ||
15 | test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/boundlessline.org.txt b/inc/3rdparty/site_config/standard/boundlessline.org.txt new file mode 100644 index 00000000..bfc3f3d1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/boundlessline.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: substring-before(//title, '|') | ||
2 | body: //div[@class="entry"] | ||
3 | # Remove the author's picture | ||
4 | strip: //div[@class="entry"]/a[1] | ||
5 | test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brainfacts.org.txt b/inc/3rdparty/site_config/standard/brainfacts.org.txt new file mode 100644 index 00000000..94b0f56d --- /dev/null +++ b/inc/3rdparty/site_config/standard/brainfacts.org.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@class="standard"]/h1 | ||
2 | author: string("BrainFacts.org") | ||
3 | date: //div[@class="meta"]/strong | ||
4 | |||
5 | strip: //p[@class="skip"] | ||
6 | strip: //div[@class="meta"] | ||
7 | strip: //div[@class="standard"]/h1 | ||
8 | strip: //div[@class="modal"] | ||
9 | strip: //div[@class="columnRight"] | ||
10 | test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brandeins.de.txt b/inc/3rdparty/site_config/standard/brandeins.de.txt new file mode 100644 index 00000000..3753ce67 --- /dev/null +++ b/inc/3rdparty/site_config/standard/brandeins.de.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | # set body | ||
2 | body: //div[@id='theContent'] | ||
3 | |||
4 | # set title | ||
5 | title: //div[@id='theContent']/h3 | ||
6 | strip: //div[@id='theContent']/h3 | ||
7 | test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt b/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt new file mode 100644 index 00000000..19504844 --- /dev/null +++ b/inc/3rdparty/site_config/standard/brandingstrategyinsider.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | date://h2[@class="date-header"] | ||
2 | body://div[@class="entry-content"] | ||
3 | test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brettterpstra.com.txt b/inc/3rdparty/site_config/standard/brettterpstra.com.txt new file mode 100644 index 00000000..f6f73778 --- /dev/null +++ b/inc/3rdparty/site_config/standard/brettterpstra.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='post full'] | ||
2 | title: //h1 | ||
3 | author: substring-after(//title, '- ') | ||
4 | date: //span[@class='date'] | ||
5 | test_url: http://brettterpstra.com/byword-for-ios/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt b/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt new file mode 100644 index 00000000..27e6b70c --- /dev/null +++ b/inc/3rdparty/site_config/standard/brisbanetimes.com.au.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='articleBody'] | ||
2 | test_url: http://www.brisbanetimes.com.au/opinion/blogs/blunt-instrument/losing-our-minds--for-24-hours-20120118-1q682.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brookings.edu.txt b/inc/3rdparty/site_config/standard/brookings.edu.txt new file mode 100644 index 00000000..9f4fc4e3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/brookings.edu.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //div[@id='contentheader']/h1 | ||
2 | author: //p[@class='attribution']/span[@class='author']/* | ||
3 | # Is there a way to pull multiple authors? My XPath here is just grabbing the first | ||
4 | |||
5 | date: /html/head/meta[@name="date"]/@content | ||
6 | body: //div[@class='main-content'] | ||
7 | |||
8 | strip: //p[@class='byline'] | ||
9 | strip: //div[@class='img-gallery'] | ||
10 | strip: //div[@class='callout'] | ||
11 | strip: //div[@class='add-your-view'] | ||
12 | convert_double_br_tags: yes | ||
13 | test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/brooksreview.net.txt b/inc/3rdparty/site_config/standard/brooksreview.net.txt new file mode 100644 index 00000000..71cafcdb --- /dev/null +++ b/inc/3rdparty/site_config/standard/brooksreview.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@class='article'] | ||
3 | body: //div[@class='post'] | ||
4 | date: //*[@id='single']/span | ||
5 | prune: no | ||
6 | test_url: http://brooksreview.net/2011/11/readability-agency/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/buquad.com.txt b/inc/3rdparty/site_config/standard/buquad.com.txt new file mode 100644 index 00000000..a75fa046 --- /dev/null +++ b/inc/3rdparty/site_config/standard/buquad.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | author: //h2/a | ||
3 | date: substring-after(//h2, '|') | ||
4 | strip_id_or_class: 'attachment' | ||
5 | strip: //h3 | ||
6 | |||
7 | body: //div[@class='entry'] | ||
8 | test_url: http://buquad.com/2012/04/09/paul-ryan/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/businessinsider.com.txt b/inc/3rdparty/site_config/standard/businessinsider.com.txt new file mode 100644 index 00000000..c773db8b --- /dev/null +++ b/inc/3rdparty/site_config/standard/businessinsider.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title://div[@class="sl-layout-post"]/h1 | ||
2 | body: //div[contains(@class, 'post-content') or contains(@class, 'KonaBody')] | ||
3 | strip: //div[contains(@class, "post-sidebar")] | ||
4 | strip: //div[@id='related-links'] | ||
5 | author://div[@class="byline"]/a | ||
6 | date://div[@class="byline"]/span[@class="date"] | ||
7 | prune: no | ||
8 | |||
9 | strip://*[contains(@class,'sponsored-text')] | ||
10 | strip: //div[@id='post_footer'] | ||
11 | |||
12 | test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/businessnews.com.tn.txt b/inc/3rdparty/site_config/standard/businessnews.com.tn.txt new file mode 100644 index 00000000..714cfc90 --- /dev/null +++ b/inc/3rdparty/site_config/standard/businessnews.com.tn.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@id='article_detail'] | ||
2 | title: //meta[@property='og:title']/@content | ||
3 | date: //div[@id='date_com_art']//a[@class='date'] | ||
4 | author: //div[@id='article_detail']//font[@class='auteur'] | ||
5 | |||
6 | strip_id_or_class: porte_titre_theme | ||
7 | strip_id_or_class: cont_param | ||
8 | strip_id_or_class: date_com_art | ||
9 | |||
10 | prune: no | ||
11 | |||
12 | test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/businessweek.com.txt b/inc/3rdparty/site_config/standard/businessweek.com.txt new file mode 100644 index 00000000..7b3d063b --- /dev/null +++ b/inc/3rdparty/site_config/standard/businessweek.com.txt | |||
@@ -0,0 +1,30 @@ | |||
1 | # story has several pages, should be detected | ||
2 | body: //div[@id='storyBody'] | ||
3 | body: //div[@id='article_body'] | ||
4 | body: //div[@id='story_body'] | ||
5 | |||
6 | title://h1[@id='article_headline'] | ||
7 | |||
8 | # article author | ||
9 | author: //p[@class='author']/a | ||
10 | # story author(s) | ||
11 | author: substring-after(//p[@class='byline'], 'By ') | ||
12 | |||
13 | # article date | ||
14 | date: //span[@class='published_date'] | ||
15 | # story date | ||
16 | date: //span[@class='date'] | ||
17 | |||
18 | date: substring-after(//div[contains(@class,'attributor')],'on') | ||
19 | strip_id_or_class: inset | ||
20 | strip: //p/span[@class='photoCredit'] | ||
21 | strip: //h1 | ||
22 | |||
23 | strip_id_or_class: page_count | ||
24 | strip_id_or_class: tools | ||
25 | strip_id_or_class: pagination | ||
26 | |||
27 | single_page_link: //li[@id='stPrint']/a | ||
28 | |||
29 | test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html | ||
30 | test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/buzzfeed.com.txt b/inc/3rdparty/site_config/standard/buzzfeed.com.txt new file mode 100644 index 00000000..6df8bc47 --- /dev/null +++ b/inc/3rdparty/site_config/standard/buzzfeed.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | # Creator: Greg Leuch <greg@...> | ||
2 | |||
3 | # It can be messy. | ||
4 | tidy:no | ||
5 | |||
6 | # The basic template. | ||
7 | title: //h1[@data-print='title'] | ||
8 | author: //a[@data-print='author'] | ||
9 | date: //time[@data-print='date'] | ||
10 | body: //div[@data-print='body'] | ||
11 | body: //section[@data-print='body'] | ||
12 | |||
13 | # For various things... | ||
14 | strip: //*[@data-print="ignore"] | |
15 | test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/bygonebureau.com.txt b/inc/3rdparty/site_config/standard/bygonebureau.com.txt new file mode 100644 index 00000000..0abb6436 --- /dev/null +++ b/inc/3rdparty/site_config/standard/bygonebureau.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | author: //a[contains(@href, '/author/')] | ||
3 | date: //*[@class='post-date'] | ||
4 | strip: //*[@class='post-date'] | ||
5 | strip: //h1 | ||
6 | test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cardboardconnection.com.txt b/inc/3rdparty/site_config/standard/cardboardconnection.com.txt new file mode 100644 index 00000000..3adc7a35 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cardboardconnection.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class='producttabbed-title'] | ||
2 | body: //div[@class='postTabs_divs postTabs_curr_div'] | ||
3 | strip: //div[@class='ratingblock2'] | ||
4 | strip: //p[@id='breadcrumbs'] | ||
5 | strip: //div[@style='display: none'] | ||
6 | |||
7 | |||
8 | test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/carpeaqua.com.txt b/inc/3rdparty/site_config/standard/carpeaqua.com.txt new file mode 100644 index 00000000..7ba1ed78 --- /dev/null +++ b/inc/3rdparty/site_config/standard/carpeaqua.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2 | ||
2 | body: //div[@class='entry'] | ||
3 | |||
4 | prune: no | ||
5 | # otherwise the footnotes are removed | ||
6 | test_url: http://carpeaqua.com/2011/03/27/the-intersection-of-power-and-portability/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/catb.org.txt b/inc/3rdparty/site_config/standard/catb.org.txt new file mode 100644 index 00000000..8908292c --- /dev/null +++ b/inc/3rdparty/site_config/standard/catb.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@class='article'] | ||
2 | strip: //div[@class='revhistory'] | ||
3 | strip: //div[@class='toc'] | ||
4 | tidy: no | ||
5 | prune: no | ||
6 | |||
7 | test_url: http://catb.org/~esr/faqs/smart-questions.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cbc.ca.txt b/inc/3rdparty/site_config/standard/cbc.ca.txt new file mode 100644 index 00000000..25305109 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cbc.ca.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[contains(@class, 'headline')]/h1 | ||
2 | author: //h5[contains(@class, 'byline')] | ||
3 | date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ') | ||
4 | body: //div[@id="storyboard"] | ||
5 | test_url: http://www.cbc.ca/news/world/story/2012/01/16/cruise-ship-monday.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cbsnews.com.txt b/inc/3rdparty/site_config/standard/cbsnews.com.txt new file mode 100644 index 00000000..4ba3da19 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cbsnews.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | date: //meta[@name="published"]/@content | ||
2 | date: //div[@class="timeLine"] | ||
3 | title: //div[@id='contentBody']//h1 | ||
4 | author: //dl[@class="storyBlogByline"]/dd/a | ||
5 | body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')] | ||
6 | |||
7 | # Content Pruning | ||
8 | strip: //div[@class="scrollingArrows"] | ||
9 | strip: //div[@class="timeLine"] | ||
10 | strip: //dl[@class="storyBlogByline"] | ||
11 | |||
12 | prune: no | ||
13 | |||
14 | test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/chareidi.org.txt b/inc/3rdparty/site_config/standard/chareidi.org.txt new file mode 100644 index 00000000..de34a7d8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/chareidi.org.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: //h1 | ||
2 | test_url: http://www.chareidi.org/archives5772/tetzaveh/TZV72adraft.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/chinamining.org.txt b/inc/3rdparty/site_config/standard/chinamining.org.txt new file mode 100644 index 00000000..ea0df2a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/chinamining.org.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //*[@id='Content']/span[1] | ||
2 | author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(') | ||
3 | date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter') | ||
4 | |||
5 | strip: //*[@id='Content']/span[1] | ||
6 | strip: //*[@id='Content']/span[2] | ||
7 | |||
8 | body: //*[@id='Content'] | ||
9 | |||
10 | test_url: http://www.chinamining.org/News/2011-07-22/1311319069d48087.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/chomsky.info.txt b/inc/3rdparty/site_config/standard/chomsky.info.txt new file mode 100644 index 00000000..1d294109 --- /dev/null +++ b/inc/3rdparty/site_config/standard/chomsky.info.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='title'] | ||
2 | author: //div[@class='author'] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.chomsky.info/onchomsky/2002----.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/christianitytoday.com.txt b/inc/3rdparty/site_config/standard/christianitytoday.com.txt new file mode 100644 index 00000000..44288a46 --- /dev/null +++ b/inc/3rdparty/site_config/standard/christianitytoday.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title://div[@class='title'] | ||
2 | author://div[@class='byline']/b | ||
3 | date:substring-after(//div[@class='byline'], 'posted') | ||
4 | body://div[@id='body'] | ||
5 | wrap_in(h2)://span[@class='subhead'] | ||
6 | wrap_in(i)://p[@class='bio'] | ||
7 | wrap_in(i)://p[@class='copyright'] | ||
8 | strip://div[@class='title'] | ||
9 | strip://div[@class='deck'] | ||
10 | strip://div[@class='byline'] | ||
11 | strip://div[@class='copyright'] | ||
12 | strip://br | ||
13 | test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/christianpf.com.txt b/inc/3rdparty/site_config/standard/christianpf.com.txt new file mode 100644 index 00000000..7f089c55 --- /dev/null +++ b/inc/3rdparty/site_config/standard/christianpf.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | author: //*[@class="author vcard fn"] | ||
3 | date: //*[@class="published"] | ||
4 | body: //div[(@class = "dd_content_wrap")] | ||
5 | test_url: http://christianpf.com/do-ibuys-lead-to-more-buying/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/christies.com.txt b/inc/3rdparty/site_config/standard/christies.com.txt new file mode 100644 index 00000000..5c5889a2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/christies.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | tidy: no | ||
2 | prune: no | ||
3 | date: //article//time[@pubdate] | ||
4 | title: //article/header/h2 | ||
5 | body: //article | ||
6 | test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/chrome.google.com.txt b/inc/3rdparty/site_config/standard/chrome.google.com.txt new file mode 100644 index 00000000..d4cc8581 --- /dev/null +++ b/inc/3rdparty/site_config/standard/chrome.google.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //pre[@id='cx-desc-text'] | ||
2 | body: //div[contains(@class, 'overview-tab-right-bar-info')] | ||
3 | title: //h1[contains(@class, 'detail-dialog-title')] | ||
4 | tidy: no | ||
5 | prune: no | ||
6 | replace_string(<noscript>): <div> | ||
7 | replace_string(</noscript>): </div> | ||
8 | |||
9 | test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/chronicle.com.txt b/inc/3rdparty/site_config/standard/chronicle.com.txt new file mode 100644 index 00000000..0c6c11ed --- /dev/null +++ b/inc/3rdparty/site_config/standard/chronicle.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //h1[contains(@class, "entry-title")] | ||
2 | author: //p[contains(@class, "byline")] | ||
3 | |||
4 | # blog articles (chronicle.com/blogs/*) | ||
5 | body: //div[contains(@class, "abstract")] | ||
6 | date: //p[contains(@class, "time")] | ||
7 | |||
8 | # all (?) other articles | ||
9 | body: //div[@id="article-body"] | ||
10 | date: //p[contains(@class, "dateline")] | ||
11 | |||
12 | # remove sidebars containing images (I assume this is desired for Instapaper) | ||
13 | strip: //div[@id="related"] | ||
14 | strip: //div[contains(@class, "image")] | ||
15 | |||
16 | # Note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that with some crap above and below. Thank goodness for that bookmarklet. | |
17 | test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cicero.de.txt b/inc/3rdparty/site_config/standard/cicero.de.txt new file mode 100644 index 00000000..b9f9a12b --- /dev/null +++ b/inc/3rdparty/site_config/standard/cicero.de.txt | |||
@@ -0,0 +1,33 @@ | |||
1 | # fforst@... | ||
2 | |||
3 | # Use link to print article for single page view | ||
4 | single_page_link: //a[@class="print"] | ||
5 | |||
6 | # set body | ||
7 | tidy: no | ||
8 | body: //div[@class='artikel-content'] | ||
9 | |||
10 | # strip title and subtitle since we got it already | ||
11 | strip: //div[@class='issue'] | ||
12 | strip: //div[@class='artikel-content']/h2 | ||
13 | |||
14 | # some authors are known and have a link, others don't | ||
15 | author: //a[contains(@href, 'autor?')] | ||
16 | |||
17 | #date | ||
18 | date: //span[@class='article-date'] | ||
19 | |||
20 | # Strip author since we got him | ||
21 | strip_id_or_class: author | ||
22 | |||
23 | #strip captions | ||
24 | strip_id_or_class: field-name-field-image-credit | ||
25 | strip_id_or_class: field-name-field-article-image-subtitle | ||
26 | |||
27 | # remove community functions | ||
28 | strip: //div[@class='meta'] | ||
29 | strip: //div[@id='comments'] | ||
30 | |||
31 | # remove "continue on the next page" text | ||
32 | strip: //p[text()="[SEITE]"] | ||
33 | test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ciperchile.cl.txt b/inc/3rdparty/site_config/standard/ciperchile.cl.txt new file mode 100644 index 00000000..4d3ac804 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ciperchile.cl.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //*[(@id = "articlebody")] | ||
2 | strip_id_or_class: rotulo | ||
3 | |||
4 | test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cjr.org.txt b/inc/3rdparty/site_config/standard/cjr.org.txt new file mode 100644 index 00000000..a0c3ea5d --- /dev/null +++ b/inc/3rdparty/site_config/standard/cjr.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body'] | ||
2 | prune: no | ||
3 | |||
4 | single_page_link: //li[@class='print']/a | ||
5 | |||
6 | test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/clientk.com.txt b/inc/3rdparty/site_config/standard/clientk.com.txt new file mode 100644 index 00000000..369e88ad --- /dev/null +++ b/inc/3rdparty/site_config/standard/clientk.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title://div[@class="entrytitle"]/a | ||
2 | author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ") | ||
3 | date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted") | ||
4 | body://div[@class="entrybody"] | ||
5 | strip://div[@class="entrybody"]//p[@class="singleinfo"] | ||
6 | test_url: http://clientk.com/2011/12/19/the-impact-of-more/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/clubic.com.txt b/inc/3rdparty/site_config/standard/clubic.com.txt new file mode 100644 index 00000000..b356bbdf --- /dev/null +++ b/inc/3rdparty/site_config/standard/clubic.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1 | ||
2 | author: //a[@class='auteur'] | ||
3 | body: //div[@class='editorial'] | ||
4 | next_page_link: //a[contains(text(),'Page suivante')] | ||
5 | strip: //a[contains(text(),'Page suivante')] | ||
6 | strip: //a[contains(text(),'Page précédente')] | ||
7 | strip_id_or_class: slideshow | ||
8 | |||
9 | prune: no | ||
10 | |||
11 | test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cmswire.com.txt b/inc/3rdparty/site_config/standard/cmswire.com.txt new file mode 100644 index 00000000..2bc96d2e --- /dev/null +++ b/inc/3rdparty/site_config/standard/cmswire.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[contains(@id,'article-body')] | ||
2 | strip://div[contains(@id,'disqus_count_block')] | ||
3 | strip://div[contains(@id,'col-left')] | ||
4 | strip://div[contains(@id,'col-right')] | ||
5 | |||
6 | test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cnet.com.txt b/inc/3rdparty/site_config/standard/cnet.com.txt new file mode 100644 index 00000000..74f46ba9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cnet.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //div[contains(@class, 'postBody')] | ||
3 | date: //div[@id='nameAndTime']/time | ||
4 | author: //div[@id='nameAndTime']/span[@class='author'] | ||
5 | |||
6 | strip_id_or_class: image-credit | ||
7 | strip_id_or_class: noAutolink | ||
8 | strip_id_or_class: related | ||
9 | |||
10 | prune: no | ||
11 | tidy: no | ||
12 | |||
13 | # early end | ||
14 | replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html> | ||
15 | |||
16 | test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cnn.com.txt b/inc/3rdparty/site_config/standard/cnn.com.txt new file mode 100644 index 00000000..995e2c79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cnn.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //div[@class="cnn_storyarea"]/h1 | ||
2 | author: //div[@class="cnnByline"]/strong | ||
3 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun') | ||
4 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon') | ||
5 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue') | ||
6 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed') | ||
7 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu') | ||
8 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri') | ||
9 | date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat') | ||
10 | strip: //div[@class="cnn_storyarea"]/h1 | ||
11 | strip_id_or_class: cnnByline | ||
12 | strip_id_or_class: cnn_strytmstmp | ||
13 | strip_id_or_class: cnn_strycaptiontxt | ||
14 | strip_id_or_class: cnn_strybtntoolsbttm | ||
15 | strip_id_or_class: cnn_strybtntools | ||
16 | strip_id_or_class: cnn_strybtmcntnt | ||
17 | strip_id_or_class: cnn_containerwht | ||
18 | strip_id_or_class: cnn_stryathrtmp | ||
19 | test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cnnsi.com.txt b/inc/3rdparty/site_config/standard/cnnsi.com.txt new file mode 100644 index 00000000..6a2c2b80 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cnnsi.com.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | # main sportsillustrated.com articles | ||
2 | |||
3 | body: //div[@id="cnnStoryContent"] | ||
4 | title: //div[@id="cnnStoryHeadline"]//h1 | ||
5 | author: //div[@id="cnnSubBanner"]//strong | ||
6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") | ||
7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") | ||
8 | |||
9 | # kill ugly font buttons | ||
10 | strip: //div[@id="cnnSCFontButtons"] | ||
11 | |||
12 | # kill misc filler videos & etc | ||
13 | strip: //div[@class="cnnDivideContent"] | ||
14 | strip: //*[@class="cnnTMbox"] | ||
15 | |||
16 | # si vault articles | ||
17 | # ------------- | ||
18 | body: //div[@class="siv_artPara"] | ||
19 | title: //div[@class="siv_artHeader"]//h1 | ||
20 | author: //div[@class="byline"] | ||
21 | date: //div[@class="date"] | ||
22 | |||
23 | next_page_link: //div[@id='cnnStoryContinue']/a | ||
24 | strip_id_or_class: cnnstorypagination | ||
25 | |||
26 | test_url: http://cnnsi.com/2012/writers/peter_king/01/08/wild.card.round/index.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/code.activestate.com.txt b/inc/3rdparty/site_config/standard/code.activestate.com.txt new file mode 100644 index 00000000..6cf72e23 --- /dev/null +++ b/inc/3rdparty/site_config/standard/code.activestate.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@id='content'] | ||
2 | title: //div[@id='page_header']/h1 | ||
3 | |||
4 | strip_id_or_class: 'lineno' | ||
5 | strip_id_or_class: 'block-toolbar-button' | ||
6 | strip_id_or_class: 'recipe_score' | ||
7 | strip: //div[@id='recipe_tools'] | ||
8 | strip: //div[@id='addcomment'] | ||
9 | |||
10 | test_url: http://code.activestate.com/recipes/500261-named-tuples/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/code.google.com.txt b/inc/3rdparty/site_config/standard/code.google.com.txt new file mode 100644 index 00000000..40a16209 --- /dev/null +++ b/inc/3rdparty/site_config/standard/code.google.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id="gc-pagecontent"] | ||
2 | strip: //a[@class="backtotop"] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/codinghorror.com.txt b/inc/3rdparty/site_config/standard/codinghorror.com.txt new file mode 100644 index 00000000..9c95f107 --- /dev/null +++ b/inc/3rdparty/site_config/standard/codinghorror.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body: //div[@class='blogbody'] | ||
2 | strip: //h3[@class='title'] | ||
3 | date: //h2[@class='date'] | ||
4 | #Should Atwood just be a literal? | ||
5 | author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V') | ||
6 | |||
7 | # tim.kingman@... 2011-07-26 | ||
8 | # Prune:no to retain all-link ULs that are part of the body content like | ||
9 | # http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html | ||
10 | # Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed. | ||
11 | |||
12 | prune: no | ||
13 | strip: //div[@class='posted']/following-sibling::* | ||
14 | strip: //div[@class='posted'] | ||
15 | test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/collegehumor.com.txt b/inc/3rdparty/site_config/standard/collegehumor.com.txt new file mode 100644 index 00000000..9d75d641 --- /dev/null +++ b/inc/3rdparty/site_config/standard/collegehumor.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //h1[@class='title'] | ||
2 | author: //p[@class='byline']/a[1] | ||
3 | date: //*[@class='date'] | ||
4 | |||
5 | body: //div[@class='article_body'] | ||
6 | strip: //p[@class='ca_intro'] | ||
7 | strip: //div[@id='action_bar'] | ||
8 | strip: //div[@class='below_content'] | ||
9 | strip: //div[@id='announcement'] | ||
10 | strip: //div[@id='leftovers'] | ||
11 | strip: //div[@class='form'] | ||
12 | strip: //div[@id='email_overlay'] | ||
13 | strip: //a[@class='close'] | ||
14 | test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt b/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt new file mode 100644 index 00000000..800a907d --- /dev/null +++ b/inc/3rdparty/site_config/standard/communities-dominate.blogs.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class="entry-body"] | ||
2 | test_url: http://communities-dominate.blogs.com/brands/2012/03/brutal-truth-about-lumia-cannot-sustain-even-1-to-1-replacement-of-symbian-windows-phone-strategy-do.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/community.service-now.com.txt b/inc/3rdparty/site_config/standard/community.service-now.com.txt new file mode 100644 index 00000000..10fd2516 --- /dev/null +++ b/inc/3rdparty/site_config/standard/community.service-now.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[@id="center"]//div[@class="node"] | ||
2 | title: //div[@id="center"]//h2 | ||
3 | author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") | ||
4 | date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—") | ||
5 | strip: //div[@id="center"]//h2[1] | ||
6 | strip: //span[@class="submitted"][1] | ||
7 | move_into(//div[@class="node"])://div[@class="breadcrumb"] | ||
8 | test_url: http://community.service-now.com/blog/lawrenceeng/seasons-greetings-servicenow-team \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/computer.org.txt b/inc/3rdparty/site_config/standard/computer.org.txt new file mode 100644 index 00000000..00e6fddf --- /dev/null +++ b/inc/3rdparty/site_config/standard/computer.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | strip_id_or_class:column-3 | ||
2 | strip_id_or_class:portlet-boundary | ||
3 | strip_id_or_class:banner | ||
4 | |||
5 | test_url: http://www.computer.org/portal/web/buildyourcareer/careerwatch/jt19 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/computerbase.de.txt b/inc/3rdparty/site_config/standard/computerbase.de.txt new file mode 100644 index 00000000..29199242 --- /dev/null +++ b/inc/3rdparty/site_config/standard/computerbase.de.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title://h1 | ||
2 | |||
3 | author://div[@id="news-meta"]/a | ||
4 | |||
5 | body://*[@id="main"]/div[1] | ||
6 | |||
7 | strip://*[@id="main"]/div[2] | ||
8 | strip://*[@id="main"]/div[3] | ||
9 | strip://*[@id="page"]//footer | ||
10 | |||
11 | #date: didn't manage to parse it | ||
12 | |||
13 | #Images have to be stripped because the page does it with overlay | ||
14 | strip://img | ||
15 | |||
16 | #figures are not displayed in instapaper... | ||
17 | strip://figure | //figcaption | ||
18 | test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/computerworld.com.txt b/inc/3rdparty/site_config/standard/computerworld.com.txt new file mode 100644 index 00000000..8e1f3e11 --- /dev/null +++ b/inc/3rdparty/site_config/standard/computerworld.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title: //meta[@name='headline']/@content | ||
2 | date: //meta[@name='date']/@content | ||
3 | author: //meta[@name='author']/@content | ||
4 | body: //div[contains(@class, 'article')] | ||
5 | body://div[@id="article_body"] | ||
6 | |||
7 | strip_id_or_class: banner | ||
8 | strip: //noscript | ||
9 | strip: //div[@style='width:1px;height:130px;float:right;'] | ||
10 | strip: //div[@class='storyby'] | ||
11 | strip_image_src: twitter_icon | ||
12 | strip_image_src: rss_bug | ||
13 | |||
14 | tidy: no | ||
15 | prune: no | ||
16 | |||
17 | next_page_link://div[@id="next_page"]/a | ||
18 | |||
19 | single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/')) | ||
20 | |||
21 | test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware | ||
22 | test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/computerworld.dk.txt b/inc/3rdparty/site_config/standard/computerworld.dk.txt new file mode 100644 index 00000000..a83f366f --- /dev/null +++ b/inc/3rdparty/site_config/standard/computerworld.dk.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | strip: //div[contains(@class, 'articleAdtechAd')] | ||
2 | title: //div[@id='article']/h1 | ||
3 | title: //div[contains(@class, 'article')]/h1 | ||
4 | body: //div[@id='articleText'] | ||
5 | test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/contemporist.com.txt b/inc/3rdparty/site_config/standard/contemporist.com.txt new file mode 100644 index 00000000..d2b289a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/contemporist.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | # get author from string like "Posted by <author> on <date>" | ||
2 | author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on') | ||
3 | |||
4 | # get date from string like "Posted by <author> on <date>" | ||
5 | date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on') | ||
6 | |||
7 | # this keeps thumbnail images | ||
8 | prune: no | ||
9 | test_url: http://www.contemporist.com/2011/11/02/landing-200-lamp-by-kim-hyunjoo \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt b/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt new file mode 100644 index 00000000..9bad2c84 --- /dev/null +++ b/inc/3rdparty/site_config/standard/conversaciones.nokia.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='article_header']/h1 | ||
2 | body: //div[@class='article_header']/p | //div[@class='article_body'] | ||
3 | strip_id_or_class: share_this | ||
4 | strip_id_or_class: sociable | ||
5 | prune: no | ||
6 | |||
7 | test_url: http://conversaciones.nokia.com/2011/10/07/cinco-atajos-en-el-nokia-n8/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/core77.com.txt b/inc/3rdparty/site_config/standard/core77.com.txt new file mode 100644 index 00000000..a24374d8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/core77.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id="permalink"]/div[@class="post"] | ||
2 | |||
3 | strip: //div[@id='backArrow'] | ||
4 | strip: //div[@id='fwdArrow'] | ||
5 | strip: //div[@class="post-title"] | ||
6 | strip: //div[@class="sharing"] | ||
7 | test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/counterpunch.org.txt b/inc/3rdparty/site_config/standard/counterpunch.org.txt new file mode 100644 index 00000000..c9e92287 --- /dev/null +++ b/inc/3rdparty/site_config/standard/counterpunch.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='main']//h1[contains(@class, 'article-title')] | ||
2 | author: //div[@class='mainauthorstyle'] | ||
3 | body: //div[@class='main']//div[@class='main-text'] | ||
4 | strip: //td[@width='140'] | ||
5 | |||
6 | test_url: http://www.counterpunch.org/johnstone05172011.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/crazybutable.com.txt b/inc/3rdparty/site_config/standard/crazybutable.com.txt new file mode 100644 index 00000000..d25cd05d --- /dev/null +++ b/inc/3rdparty/site_config/standard/crazybutable.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title://h2 | ||
2 | body://div[contains(@class, 'entrytext')] | ||
3 | test_url: http://www.crazybutable.com/weblog/archives/2010/07/01/house-ideas-that-worked/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/crimemagazine.com.txt b/inc/3rdparty/site_config/standard/crimemagazine.com.txt new file mode 100644 index 00000000..9cf0bccc --- /dev/null +++ b/inc/3rdparty/site_config/standard/crimemagazine.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | autodetect_next_page: no | ||
2 | test_url: http://www.crimemagazine.com/son-sam \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/crimethinc.com.txt b/inc/3rdparty/site_config/standard/crimethinc.com.txt new file mode 100644 index 00000000..74bc6db9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/crimethinc.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class="readingtext"] | ||
2 | title: substring-after(substring-after(//title, ':'), ':') | ||
3 | test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/crn.de.txt b/inc/3rdparty/site_config/standard/crn.de.txt new file mode 100644 index 00000000..7fa950af --- /dev/null +++ b/inc/3rdparty/site_config/standard/crn.de.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: //p[contains(@class,'author')]/a | ||
2 | date: //div[contains(@class,'date')] | ||
3 | test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/csmonitor.com.txt b/inc/3rdparty/site_config/standard/csmonitor.com.txt new file mode 100644 index 00000000..d4dbc5c8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/csmonitor.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //h1[contains(@class, 'head')] | ||
2 | |||
3 | # standard page | ||
4 | body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')] | ||
5 | # print page | ||
6 | body: //div[@id='mainColumn'] | ||
7 | |||
8 | author: //a[contains(@class, 'ui-author')] | ||
9 | |||
10 | single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')] | ||
11 | |||
12 | strip_id_or_class: storyToolbar | ||
13 | strip_id_or_class: promotion-tag | ||
14 | |||
15 | tidy: no | ||
16 | prune: no | ||
17 | |||
18 | test_url: http://www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/csnbayarea.com.txt b/inc/3rdparty/site_config/standard/csnbayarea.com.txt new file mode 100644 index 00000000..131a923b --- /dev/null +++ b/inc/3rdparty/site_config/standard/csnbayarea.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@id='csn_blogST_headline']/h1 | ||
2 | |||
3 | body: //div[@id='csn_blogST_main'] | ||
4 | strip_id_or_class: ipfootnotes | ||
5 | strip: //div[@id='csn_blogST_main']/p[1]/img | ||
6 | strip: //div[@id='csn_blogST_sidebar'] | ||
7 | test_url: http://www.csnbayarea.com/blog/giants-talk/post/-?blog%2Fgiants-talk%2Fpost%2F-=&blockID=578902&feedID=5987 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/csnphilly.com.txt b/inc/3rdparty/site_config/standard/csnphilly.com.txt new file mode 100644 index 00000000..0df72c32 --- /dev/null +++ b/inc/3rdparty/site_config/standard/csnphilly.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | # The author's name is not wrapped in an element of its own, so no author rule is defined. | ||
2 | convert_double_br_tags: yes | ||
3 | body: //csn_blogST_main | ||
4 | |||
5 | #junk above and around the article | ||
6 | strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div | ||
7 | strip: /html/body/div[4]/header | ||
8 | strip_id_or_class: article-right-sidebar | ||
9 | strip_id_or_class: rsn-gigya-sharebar-container | ||
10 | strip_id_or_class: article-bottom | ||
11 | strip_id_or_class: hider | ||
12 | strip_id_or_class: footer | ||
13 | strip_id_or_class: masthead | ||
14 | strip_id_or_class: block-menu-menu-rsn-login-or-register | ||
15 | strip_id_or_class: block-menu-menu-header-links | ||
16 | strip_id_or_class: block-rsn-follow-bar-follow-bar | ||
17 | strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard | ||
18 | strip_id_or_class: logo | ||
19 | strip_id_or_class: element-invisible | ||
20 | strip_id_or_class: site-name | ||
21 | strip: //div[contains(@style, 'none')] | ||
22 | test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/cucharasonica.com.txt b/inc/3rdparty/site_config/standard/cucharasonica.com.txt new file mode 100644 index 00000000..e691fe83 --- /dev/null +++ b/inc/3rdparty/site_config/standard/cucharasonica.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://cucharasonica.com/2011/09/queen-busca-candidatos-para-su-propia-banda-tributo \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/da.feedsportal.com.txt b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt new file mode 100644 index 00000000..4a00ef44 --- /dev/null +++ b/inc/3rdparty/site_config/standard/da.feedsportal.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | single_page_link: //a | ||
2 | tidy: no | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dailydot.com.txt b/inc/3rdparty/site_config/standard/dailydot.com.txt new file mode 100644 index 00000000..61013993 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dailydot.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | tidy: no | ||
2 | body: //article | ||
3 | |||
4 | test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dailykos.com.txt b/inc/3rdparty/site_config/standard/dailykos.com.txt new file mode 100644 index 00000000..124675cb --- /dev/null +++ b/inc/3rdparty/site_config/standard/dailykos.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@id='article-1']//div[contains(@class, 'article-body')] | ||
2 | title: //div[@class='meta']//a[@id='titleHref'] | ||
3 | date: //div[@class='meta']//p[@class='date'] | ||
4 | |||
5 | strip_id_or_class: invisible | ||
6 | strip_id_or_class: divider-doodle | ||
7 | |||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrichs-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his-ex-wife \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dailymail.co.uk.txt b/inc/3rdparty/site_config/standard/dailymail.co.uk.txt new file mode 100644 index 00000000..c83dbdb0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dailymail.co.uk.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@id='js-article-text'] | ||
2 | strip: //div[@class='explore-links'] | ||
3 | strip: //div[@id='js-article-text']/br[position()=1] | ||
4 | strip_id_or_class: print-or-mail-links | ||
5 | strip_id_or_class: shareArticles | ||
6 | strip_id_or_class: googleAds | ||
7 | strip_id_or_class: digg-button | ||
8 | strip_id_or_class: article-icon-links-container | ||
9 | strip_id_or_class: clickToEnlarge | ||
10 | tidy: no | ||
11 | |||
12 | test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dansdata.com.txt b/inc/3rdparty/site_config/standard/dansdata.com.txt new file mode 100644 index 00000000..96a2bc41 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dansdata.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | autodetect_next_page: no | ||
2 | tidy: no | ||
3 | prune: no | ||
4 | body: //div[@class='NoOverflow'] | ||
5 | test_url: http://www.dansdata.com/gz129.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/daringfireball.net.txt b/inc/3rdparty/site_config/standard/daringfireball.net.txt new file mode 100644 index 00000000..dca8ade7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/daringfireball.net.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class="article"]/h1 | ||
2 | author: //div[@id="Sidebar"]/p/strong | ||
3 | date: //h6[@class="dateline"] | ||
4 | body: //div[@class="article"] | ||
5 | strip: //h6[@class="dateline"] | ||
6 | strip: //div[@class="article"]/h1 | ||
7 | test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/datanami.com.txt b/inc/3rdparty/site_config/standard/datanami.com.txt new file mode 100644 index 00000000..3534002a --- /dev/null +++ b/inc/3rdparty/site_config/standard/datanami.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id="article"] | ||
2 | date: //p[@class="date"] | ||
3 | author: //p[@class="byline"] | ||
4 | test_url: http://www.datanami.com/datanami/2011-12-07/new_path_for_sap:_in_memory_computing,_predictive_analysis_converge.html?featured=top \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dcurt.is.txt b/inc/3rdparty/site_config/standard/dcurt.is.txt new file mode 100644 index 00000000..7d11c6e1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dcurt.is.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: (//article//h2)[1] | ||
2 | body: //article[contains(@class, 'post')] | ||
3 | date: //time[@id='top_time']/@datetime | ||
4 | |||
5 | prune: no | ||
6 | tidy: no | ||
7 | |||
8 | test_url: http://dcurt.is/predictions-txt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/delong.typepad.com.txt b/inc/3rdparty/site_config/standard/delong.typepad.com.txt new file mode 100644 index 00000000..84fd4f79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/delong.typepad.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | strip_id_or_class: banner | ||
2 | strip_id_or_class: gamma | ||
3 | strip_id_or_class: module-list | ||
4 | test_url: http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/derstandard.at.txt b/inc/3rdparty/site_config/standard/derstandard.at.txt new file mode 100644 index 00000000..48722ebd --- /dev/null +++ b/inc/3rdparty/site_config/standard/derstandard.at.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //div[@id='artikelHeader']/h1 | ||
2 | author: //span[@class='author'] | ||
3 | date: //span[@class='date'] | ||
4 | body: //div[@class='copytext'] | ||
5 | strip: //ul[@class='lookupLinksArtikel'] | ||
6 | |||
7 | strip: //div[@id='pageTop'] | ||
8 | strip: //div[@id='toolbar'] | ||
9 | strip: //div[@id='articleTools'] | ||
10 | strip: //div[@id='weiterlesen'] | ||
11 | strip: //div[@id='communityCanvas'] | ||
12 | |||
13 | test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/designtagebuch.de.txt b/inc/3rdparty/site_config/standard/designtagebuch.de.txt new file mode 100644 index 00000000..6096db0b --- /dev/null +++ b/inc/3rdparty/site_config/standard/designtagebuch.de.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | tidy: no | ||
2 | body: //div[@class='main'] | ||
3 | |||
4 | author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am') | ||
5 | date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ') | ||
6 | |||
7 | strip_id_or_class: pagelink | ||
8 | strip_id_or_class: wp-polls | ||
9 | |||
10 | next_page_link: //div[@class='post-page-next']/a | ||
11 | test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/desitvforum.net.txt b/inc/3rdparty/site_config/standard/desitvforum.net.txt new file mode 100644 index 00000000..a6dac5fd --- /dev/null +++ b/inc/3rdparty/site_config/standard/desitvforum.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: (//blockquote[contains(@class, 'postcontent')])[1] | ||
2 | body: (//div[starts-with(@id, 'post_message')])[1] | ||
3 | |||
4 | prune: no | ||
5 | tidy: no \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/details.com.txt b/inc/3rdparty/site_config/standard/details.com.txt new file mode 100644 index 00000000..548cabad --- /dev/null +++ b/inc/3rdparty/site_config/standard/details.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class="content-headline"] | ||
2 | body: //div[@class="headers-container"] | //div[@class="content-container"] | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | single_page_link: //li[@class='utility-print']/a | ||
7 | |||
8 | test_url: http://www.details.com/culture-trends/critical-eye/201108/best-new-designers-innovations \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/developers.facebook.com.txt b/inc/3rdparty/site_config/standard/developers.facebook.com.txt new file mode 100644 index 00000000..43a8f0a0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/developers.facebook.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //div[@class="bodyText"]/h1 | ||
2 | author: //div[@class="picture"]/a/img/@alt | ||
3 | test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt b/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt new file mode 100644 index 00000000..b960b37e --- /dev/null +++ b/inc/3rdparty/site_config/standard/devlinsangle.blogspot.co.at.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | date: //h2[@class='date-header'] | ||
2 | body: //div[@class='post hentry'] | ||
3 | title: //h3 | ||
4 | strip: //div[@class='post-footer'] | ||
5 | |||
6 | test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dictionary.reference.com.txt b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt new file mode 100644 index 00000000..a1172024 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dictionary.reference.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@id='query_h1'] | ||
2 | body: //div[contains(@class, 'lunatext results_content')] | ||
3 | strip_id_or_class: spl_unshd | ||
4 | #replace_string(<div class="dicTl">): <div class="dicTl">------------------<br /> | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/diepresse.com.txt b/inc/3rdparty/site_config/standard/diepresse.com.txt new file mode 100644 index 00000000..7e825a91 --- /dev/null +++ b/inc/3rdparty/site_config/standard/diepresse.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='article']/h1 | ||
2 | date: substring-before(//p[@class='articletime'],'|') | ||
3 | body: //div[@id='articletext'] | ||
4 | strip: //div[@class='inlineDiashow'] | ||
5 | |||
6 | test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt b/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt new file mode 100644 index 00000000..2d2ae2c2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/digiphoto.techbang.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | # default parser works great | ||
2 | # only add "author" and "next page link" reference | ||
3 | # 2012-04-13 | ||
4 | |||
5 | next_page_link: //div[@class = 'pagination']/a[@class = 'next_page'] | ||
6 | |||
7 | author: //*[@class = 'author metadata']/a | ||
8 | test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/digital-photography-school.com.txt b/inc/3rdparty/site_config/standard/digital-photography-school.com.txt new file mode 100644 index 00000000..37192ac0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/digital-photography-school.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='post-title']/h1 | ||
2 | author: //a[@href='#author'] | ||
3 | body: //div[@class='post-content'] | ||
4 | strip: //div[@class='post-meta'] | ||
5 | |||
6 | test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt b/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt new file mode 100644 index 00000000..b21431d7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/digitalspy.co.uk.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class="article_header"]/h1 | ||
2 | date: //div[@class="article_pub"]/span[@class="time"] | ||
3 | author: //div[@class="article_pub"]/span[@class="editors"]/a/text() | ||
4 | body: //div[@class="article_body clear_left"] | ||
5 | test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dilbert.com.txt b/inc/3rdparty/site_config/standard/dilbert.com.txt new file mode 100644 index 00000000..413e5506 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dilbert.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | convert_double_br_tags: yes | ||
2 | |||
3 | title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10) | ||
4 | body: //*[contains(@class, 'SB_Content')] | ||
5 | author: string('Scott Adams') | ||
6 | date: //*[contains(@class, 'SB_Detail')]/text()[1] | ||
7 | |||
8 | test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dinamalar.com.txt b/inc/3rdparty/site_config/standard/dinamalar.com.txt new file mode 100644 index 00000000..9ef198c9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dinamalar.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //div[@class='newsdetbd'] | ||
2 | body: //div[@id='innerleft'] | ||
3 | #//p[@class = 'plnht'] | ||
4 | strip_image_src: /albums/ | ||
5 | strip: //div[@class='mrrt'] | ||
6 | prune: yes | ||
7 | strip_id_or_class: 'fdpd' | ||
8 | strip_id_or_class: 'epapt' | ||
9 | strip_id_or_class: 'newsrtwd' | ||
10 | strip_id_or_class: 'padtp' | ||
11 | strip_id_or_class: 'newdt' | ||
12 | strip_id_or_class: 'newdlt' | ||
13 | strip: //div[@id='selNotes'] | ||
14 | strip_id_or_class: 'clsNotes' | ||
15 | strip_id_or_class: 'clear' | ||
16 | strip_id_or_class: 'cmtwrap' | ||
17 | strip_id_or_class: 'sess' | ||
18 | strip_id_or_class: 'parents' | ||
19 | test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dn.se.txt b/inc/3rdparty/site_config/standard/dn.se.txt new file mode 100644 index 00000000..86bb3b8d --- /dev/null +++ b/inc/3rdparty/site_config/standard/dn.se.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | # Since this element has class="clear", the Instapaper stylesheets (at least in this text parser preview) will render it unreadable, with a 1px font size and line height. | |
2 | |||
3 | body: //div[@id="article-content"] | ||
4 | |||
5 | |||
6 | # Ads | ||
7 | strip_id_or_class: advert-space | ||
8 | |||
9 | # Read more, recommend, comments etc | ||
10 | strip_id_or_class: fbc-recommend | ||
11 | strip_id_or_class: recommend | ||
12 | strip_id_or_class: article-readers | ||
13 | strip_id_or_class: article-addons | ||
14 | strip_id_or_class: hook | ||
15 | strip_id_or_class: right | ||
16 | strip_id_or_class: footer | ||
17 | |||
18 | # Other news | ||
19 | strip: //div[@id="mirrors"] | ||
20 | |||
21 | # Author | ||
22 | author: //div[@id="byline"]/div/p/strong | ||
23 | |||
24 | # Date | ||
25 | date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11) | ||
26 | test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/doctac.com.txt b/inc/3rdparty/site_config/standard/doctac.com.txt new file mode 100644 index 00000000..9f65ea9b --- /dev/null +++ b/inc/3rdparty/site_config/standard/doctac.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | strip: //*[(@id = "featured")] | ||
2 | |||
3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') | ||
4 | |||
5 | date: concat(//div[@class='month'],' ',//div[@class='day']) | ||
6 | |||
7 | #doctac doesn't provide a year, but month/day is better than nothing | ||
8 | test_url: http://www.doctac.com/mac/iphone/instapaper-update-app/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/domusweb.it.txt b/inc/3rdparty/site_config/standard/domusweb.it.txt new file mode 100644 index 00000000..81683f02 --- /dev/null +++ b/inc/3rdparty/site_config/standard/domusweb.it.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | # TODO: clean up the extra junk at the end of articles | ||
2 | |||
3 | # general text formatting | ||
4 | prune: no | ||
5 | convert_double_br_tags:yes | ||
6 | |||
7 | # where to find the basic metadata | ||
8 | author://a[@class='articleauthor'] | ||
9 | date://a[starts-with(@href,'/en/search/published/')] | ||
10 | title:substring-before(//h2[@class='title'],'—') | ||
11 | body://div[@id='maincontainer'] | ||
12 | |||
13 | dissolve://div[starts-with(@id,'commentableblock')] | ||
14 | |||
15 | # clean up the crap | ||
16 | strip://div[contains(@class,'domusnetwork')] | ||
17 | strip://div[contains(@class,'relative_wrapper')] | ||
18 | |||
19 | strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')] | ||
20 | wrap_in(em): //div[contains(@class,'captionsubimage')]/span | ||
21 | test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dou.ua.txt b/inc/3rdparty/site_config/standard/dou.ua.txt new file mode 100644 index 00000000..22907c22 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dou.ua.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@itemprop="name"] | ||
2 | |||
3 | author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a | ||
4 | |||
5 | date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')] | ||
6 | |||
7 | body: //div[contains(@class, 'b-typo')] | ||
8 | test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/douban.com.txt b/inc/3rdparty/site_config/standard/douban.com.txt new file mode 100644 index 00000000..99d7e5dc --- /dev/null +++ b/inc/3rdparty/site_config/standard/douban.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.douban.com/note/215003067/ | ||
3 | # http://www.douban.com/note/213540049/ | ||
4 | # http://www.douban.com/group/topic/31140104/ | ||
5 | |||
6 | title: //div[@class='note-header']/h1 | ||
7 | title: //div[@id='content']/h1 | ||
8 | |||
9 | author: //div[@class='info']/ul/li/a | ||
10 | author: //h3/span/a | ||
11 | |||
12 | date://div[@class='note-header']/div/span | ||
13 | date://h3/span[contains(@class, 'color-green')] | ||
14 | |||
15 | body://div[contains(@class, 'note')] | ||
16 | body://div[contains(@class, 'topic-content')] | ||
17 | |||
18 | strip://h3 | ||
19 | |||
20 | convert_double_br_tags: yes | ||
21 | test_url: http://www.douban.com/group/topic/31140104/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dpreview.com.txt b/inc/3rdparty/site_config/standard/dpreview.com.txt new file mode 100644 index 00000000..30179a3b --- /dev/null +++ b/inc/3rdparty/site_config/standard/dpreview.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | # next_page_link for product review | ||
2 | # example: http://www.dpreview.com/reviews/lytro/ | ||
3 | next_page_link: //img[@alt = 'Next page']/../@href | ||
4 | |||
5 | # next_page_link for other articles | ||
6 | # example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 | ||
7 | next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a | ||
8 | single_page_link: //a[contains(.,'Print view')] | ||
9 | test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dr.dk.txt b/inc/3rdparty/site_config/standard/dr.dk.txt new file mode 100644 index 00000000..7e46b0d6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dr.dk.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | author: //div[@class='articleFunctions']//a | ||
3 | date: //meta[@name='pubdate']/@content | ||
4 | |||
5 | # Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason) | ||
6 | body: //div[@class='articleContent'] | ||
7 | |||
8 | tidy: no | ||
9 | test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dramasonline.com.txt b/inc/3rdparty/site_config/standard/dramasonline.com.txt new file mode 100644 index 00000000..659d0443 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dramasonline.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@class='postext'] | ||
2 | |||
3 | strip_id_or_class: ratingblock | ||
4 | strip_id_or_class: hreview-aggregate | ||
5 | strip: //div[contains(@style, 'display: none;')] | ||
6 | |||
7 | tidy: no | ||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.dramasonline.com/jago-pakistan-jago-7th-december-2012-ali-gul-pir/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/drdobbs.com.txt b/inc/3rdparty/site_config/standard/drdobbs.com.txt new file mode 100644 index 00000000..b1a9db6f --- /dev/null +++ b/inc/3rdparty/site_config/standard/drdobbs.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | single_page_link: //a[contains(@href, '/article/print')] | ||
2 | test_url: http://www.drdobbs.com/architecture-and-design/240001128 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/drive2.ru.txt b/inc/3rdparty/site_config/standard/drive2.ru.txt new file mode 100644 index 00000000..6125ce79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/drive2.ru.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@class = "description"] | ||
2 | body: //div[@id = "post"] | ||
3 | |||
4 | strip_id_or_class: vcard | ||
5 | strip_id_or_class: journallist | ||
6 | strip_id_or_class: infobox | ||
7 | strip_id_or_class: terms | ||
8 | strip_id_or_class: replieslist | ||
9 | strip_id_or_class: communityside | ||
10 | |||
11 | |||
12 | test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/drupal.org.txt b/inc/3rdparty/site_config/standard/drupal.org.txt new file mode 100644 index 00000000..ffb77e4d --- /dev/null +++ b/inc/3rdparty/site_config/standard/drupal.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title://h1 | ||
2 | author://div[@class="submitted"]/a | ||
3 | date:substring-after(//div[@class="meta"],'modified: ') | ||
4 | date:substring-after(//div[@class="submitted"],'on ') | ||
5 | body://div[@class="node-content"] | ||
6 | strip://div[@class="meta"] | ||
7 | strip_id_or_class:book-navigation | ||
8 | test_url: http://drupal.org/node/1327354 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt b/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt new file mode 100644 index 00000000..418c9f62 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dukebasketballreport.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h2/a | ||
2 | author: substring-before(substring-after(//span[@class='byline'], 'by'), ',') | ||
3 | date: substring-before(substring-after(//span[@class='byline'], ','), '|') | ||
4 | body: //div[@class='entry'] | ||
5 | |||
6 | |||
7 | # strip out auction stuff at the end of posts | ||
8 | # tidy kills the center tag, so disable it | ||
9 | tidy: no | ||
10 | strip: //center//table | ||
11 | test_url: http://www.dukebasketballreport.com/articles/?p=42660 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/dvice.com.txt b/inc/3rdparty/site_config/standard/dvice.com.txt new file mode 100644 index 00000000..c8163680 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dvice.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | strip://*[@id = 'blog_top_stories'] | ||
2 | strip://*[@id = 'takeover_off'] | ||
3 | strip://*[@id = 'right_gray_box'] | ||
4 | strip://*[@class = 'blog_topics'] | ||
5 | strip://*[@class = 'section_titles'] | ||
6 | |||
7 | author://div[@class = 'post_author_info']/a | ||
8 | date://div[@class = 'post_date_info'] | ||
9 | test_url: http://dvice.com/archives/2012/05/is-nfc-and-smar.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eamesinerudition.com.txt b/inc/3rdparty/site_config/standard/eamesinerudition.com.txt new file mode 100644 index 00000000..908a1b51 --- /dev/null +++ b/inc/3rdparty/site_config/standard/eamesinerudition.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div [@class="post contain"]/h1 | ||
2 | strip: //div [@class="post contain"]/h1 | ||
3 | body: //div [@class="post contain"] | ||
4 | author: substring-before(//title, ':') | ||
5 | author: substring-before(//title, ' ') | ||
6 | |||
7 | |||
8 | test_url: http://eamesinerudition.com/2012/03/hospital-numbers-are-bad-for-you \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eandt.theiet.org.txt b/inc/3rdparty/site_config/standard/eandt.theiet.org.txt new file mode 100644 index 00000000..c4c38f25 --- /dev/null +++ b/inc/3rdparty/site_config/standard/eandt.theiet.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | date: //div[@class="et_dateUnderTitle"] | ||
3 | author: substring-after(//div[@class="et_authorUnderTitle"], 'By ') | ||
4 | body: //div[@id="et_leftCol640split"] | ||
5 | |||
6 | strip: //div[@id="et_leftCol640splitRight"] | ||
7 | strip: //div[@class="et_light_greybgboxlower"] | ||
8 | test_url: http://eandt.theiet.org/magazine/2011/12/this-festive-waste.cfm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eastoftheweb.com.txt b/inc/3rdparty/site_config/standard/eastoftheweb.com.txt new file mode 100644 index 00000000..d762091c --- /dev/null +++ b/inc/3rdparty/site_config/standard/eastoftheweb.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //div[@class='title_text'] | ||
2 | |||
3 | author: //div[@class='author_text'] | ||
4 | |||
5 | body: //div[@class='story_text']/.. | ||
6 | |||
7 | strip: //b | ||
8 | |||
9 | strip_id_or_class: back_to_top | ||
10 | strip_id_or_class: author_text | ||
11 | strip_id_or_class: title_text | ||
12 | |||
13 | wrap_in(center): //a | ||
14 | |||
15 | dissolve: //a | ||
16 | |||
17 | footnotes: no | ||
18 | test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ebay.com.txt b/inc/3rdparty/site_config/standard/ebay.com.txt new file mode 100644 index 00000000..5fa18ff3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ebay.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum'] | ||
2 | |||
3 | strip_image_src: imgLoading_30x30.gif | ||
4 | |||
5 | test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ecetia.com.txt b/inc/3rdparty/site_config/standard/ecetia.com.txt new file mode 100644 index 00000000..d67e9103 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ecetia.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://ecetia.com/2011/09/vida-de-jugon-vii-las-tres-es \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/econlog.econlib.org.txt b/inc/3rdparty/site_config/standard/econlog.econlib.org.txt new file mode 100644 index 00000000..ebafc197 --- /dev/null +++ b/inc/3rdparty/site_config/standard/econlog.econlib.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class="title"] | ||
2 | author: //div[@class="hosted"]/a | ||
3 | date: substring-after(//div[@class="dateline"]/text(), '|') | ||
4 | |||
5 | strip: //a[@class="top" and @href="#"] | ||
6 | test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt b/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt new file mode 100644 index 00000000..b59f554e --- /dev/null +++ b/inc/3rdparty/site_config/standard/economia.estadao.com.br.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | date: //div[@class="bb-md-noticia-fecha"] | ||
2 | body: //div[@class="corpo"] | ||
3 | dissolve: //div[@class="bb-md-noticia-extras"] | ||
4 | strip: //strong | ||
5 | strip_id_or_class: bb-md-noticia-foto-autor | ||
6 | strip_id_or_class: bb-md-noticia-foto-bajada | ||
7 | test_url: http://economia.estadao.com.br/noticias/economia,cmn-aprova-r-67-bi-em-credito-para-20-setores-da-economia,118501,0.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/economist.com.txt b/inc/3rdparty/site_config/standard/economist.com.txt new file mode 100644 index 00000000..71dd62f5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/economist.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@class='ec-blog-headline'] | ||
2 | body: //div[@class='ec-blog-body'] | ||
3 | body: //div[@class='ec-article-content clear'] | ||
4 | strip: //div[@class='related-items'] | ||
5 | date: substring-before(//p[@class='ec-article-info'], '|') | ||
6 | prune: no | ||
7 | |||
8 | autodetect_next_page: no | ||
9 | |||
10 | test_url: http://www.economist.com/node/21528429 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/edge-online.com.txt b/inc/3rdparty/site_config/standard/edge-online.com.txt new file mode 100644 index 00000000..461d909c --- /dev/null +++ b/inc/3rdparty/site_config/standard/edge-online.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')] | ||
3 | date: //time[@pubdate]/@datetime | ||
4 | author: //span[@class='author-name'] | ||
5 | prune: no | ||
6 | tidy: no | ||
7 | strip: //footer | ||
8 | |||
9 | replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak --> | ||
10 | |||
11 | single_page_link: //a[contains(@href, '?page=show')] | ||
12 | |||
13 | test_url: http://www.edge-online.com/features/telling-modern-warfares-story \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/edge.org.txt b/inc/3rdparty/site_config/standard/edge.org.txt new file mode 100644 index 00000000..9980000d --- /dev/null +++ b/inc/3rdparty/site_config/standard/edge.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='HomeLeftPannel IMGCTRL']/h2 | ||
2 | body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc'] | ||
3 | tidy: no | ||
4 | |||
5 | test_url: http://edge.org/print/conversation.php?cid=the-argumentative-theory \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/edition.cnn.com.txt b/inc/3rdparty/site_config/standard/edition.cnn.com.txt new file mode 100644 index 00000000..dc8ebe14 --- /dev/null +++ b/inc/3rdparty/site_config/standard/edition.cnn.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')] | ||
2 | strip: //div[@id='cnnCVP2'] | ||
3 | strip_id_or_class: cnn_strylftcexpbx | ||
4 | strip_id_or_class: cnn_strylctcqrelt | ||
5 | strip_id_or_class: cnn_strybtntoolsbttm | ||
6 | strip_id_or_class: cnn_stryftsbttm | ||
7 | strip_id_or_class: cnn_strybtmcntnt | ||
8 | prune: no | ||
9 | test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ekultura.hu.txt b/inc/3rdparty/site_config/standard/ekultura.hu.txt new file mode 100644 index 00000000..59f6a711 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ekultura.hu.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[@class='style6 nevek'] | ||
2 | |||
3 | body: //div[@class='bal3'] | ||
4 | |||
5 | |||
6 | prune: yes | ||
7 | |||
8 | tidy: yes | ||
9 | convert_double_br_tags: yes | ||
10 | |||
11 | test_url: http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/elance.com.txt b/inc/3rdparty/site_config/standard/elance.com.txt new file mode 100644 index 00000000..52ffe2d0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/elance.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='jobDesc-bd']/p | ||
2 | |||
3 | test_url: http://www.elance.com/j/xml-technical-intergration/23687172/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/elektroniknet.de.txt b/inc/3rdparty/site_config/standard/elektroniknet.de.txt new file mode 100644 index 00000000..07664719 --- /dev/null +++ b/inc/3rdparty/site_config/standard/elektroniknet.de.txt | |||
@@ -0,0 +1,27 @@ | |||
1 | title: //h1 | ||
2 | date: //div[@class='datum'] | ||
3 | single_page_link: //a[contains(@href, '?type=99')] | ||
4 | |||
5 | # this hack preserves the intro text, which would otherwise be stripped when the title is set to //h1 | |
6 | dissolve: //div[@class='artikelMeldung'] | ||
7 | |||
8 | |||
9 | strip_id_or_class: anzeige | ||
10 | strip_id_or_class: top_page_navigation | ||
11 | strip_id_or_class: cr_image_container | ||
12 | strip_id_or_class: cr_image_reference | ||
13 | strip_id_or_class: cr_image_icon | ||
14 | strip_id_or_class: _close_txt | ||
15 | strip_id_or_class: _close_ico | ||
16 | strip_id_or_class: clearer | ||
17 | |||
18 | strip://h1 | ||
19 | strip://h6 | ||
20 | strip://div[contains(@id, 'plista')] | ||
21 | strip://img[contains(@id,'tiny')] | ||
22 | strip://img[@class='cr_image'] | ||
23 | |||
24 | # strip url at the top | ||
25 | strip: //p[@style='font-size: 10px;'] | ||
26 | |||
27 | test_url: http://www.elektroniknet.de/automotive/technik-know-how/sicherheitselektronik/article/87717/0/Besser_als_die_Wirklichkeit/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/elmalpensante.com.txt b/inc/3rdparty/site_config/standard/elmalpensante.com.txt new file mode 100644 index 00000000..9fecd663 --- /dev/null +++ b/inc/3rdparty/site_config/standard/elmalpensante.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | single_page_link: //a[contains(@href, 'print_contenido')] | ||
2 | title: //h2 | ||
3 | author: //div[@class="autor"] | ||
4 | test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/elpais.com.txt b/inc/3rdparty/site_config/standard/elpais.com.txt new file mode 100644 index 00000000..32f9fc3f --- /dev/null +++ b/inc/3rdparty/site_config/standard/elpais.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title: //meta[@name='DC.title']/@content | ||
2 | title: //div[contains(@class, 'cabecera_noticia')]//h1 | ||
3 | date: //meta[@name='DC.date']/@content | ||
4 | date: //meta[@name='date']/@content | ||
5 | body: //div[@class='columna_texto'] | ||
6 | body: //div[@id='cuerpo_noticia'] | ||
7 | body: //div[@class='estructura_2col_1zq']//div[@class='margen_n'] | ||
8 | |||
9 | prune: no | ||
10 | |||
11 | strip_id_or_class: disposicion_vertical | ||
12 | strip_id_or_class: ampliar_foto | ||
13 | strip_id_or_class: utilidades | ||
14 | strip_id_or_class: info_relacionada | ||
15 | strip_id_or_class: m-kiosko | ||
16 | strip_id_or_class: info_complementa | ||
17 | |||
18 | strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')] | ||
19 | strip: //div[@id='coment' or @id='foros_not'] | ||
20 | |||
21 | test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html | ||
22 | test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/en.espnf1.com.txt b/inc/3rdparty/site_config/standard/en.espnf1.com.txt new file mode 100644 index 00000000..c1a91063 --- /dev/null +++ b/inc/3rdparty/site_config/standard/en.espnf1.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@id='content'] | ||
2 | strip: //div[@class='rl'] | ||
3 | strip: //p[@class='authdesc'] | ||
4 | strip: //p[@class='strybtm'] | ||
5 | strip: //div[@id='stryFtrLft'] | ||
6 | strip: //div[@id='f1Conversation'] | ||
7 | strip: //div[@id='cmtSpncrRuler'] | ||
8 | strip: //div[@id='stryComments'] | ||
9 | strip: //div[@id='athrData'] | ||
10 | test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/engadget.com.txt b/inc/3rdparty/site_config/standard/engadget.com.txt new file mode 100644 index 00000000..6cc6b14e --- /dev/null +++ b/inc/3rdparty/site_config/standard/engadget.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //div[@class='post_body'] | ||
3 | date: //*[@class='post_time'] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt b/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt new file mode 100644 index 00000000..35ace467 --- /dev/null +++ b/inc/3rdparty/site_config/standard/engineering.tumblr.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h2 | ||
2 | body: //div[@class="post_content"] | ||
3 | author: //p[@class="author"]/a | ||
4 | date: //p[@class="date"] | ||
5 | strip: //h2 | ||
6 | strip: //header | ||
7 | test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/english.aljazeera.net.txt b/inc/3rdparty/site_config/standard/english.aljazeera.net.txt new file mode 100644 index 00000000..aed3a5f9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/english.aljazeera.net.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //span[@id='DetailedTitle'] | ||
2 | body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary'] | ||
3 | strip_id_or_class: sidebar | ||
4 | strip_id_or_class: Skyscrapper_Body | ||
5 | strip: //td[@class='DetailedSummary']/table[position() != 1] | ||
6 | prune: no | ||
7 | test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/enikos.gr.txt b/inc/3rdparty/site_config/standard/enikos.gr.txt new file mode 100644 index 00000000..e2b99bfc --- /dev/null +++ b/inc/3rdparty/site_config/standard/enikos.gr.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='article']//div[contains(@class, 'inside')] | ||
2 | |||
3 | strip_id_or_class: tags | ||
4 | strip_id_or_class: actions | ||
5 | strip_id_or_class: google-ads | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt b/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt new file mode 100644 index 00000000..3e7fba09 --- /dev/null +++ b/inc/3rdparty/site_config/standard/entertainment.timesonline.co.uk.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | author://div[@class = 'article-author']/span[@class = 'byline'] | ||
2 | title://h1[@class = 'heading'] | ||
3 | body://div[@id = 'related-article-links'] | ||
4 | strip://div[@id = 'comment-sort-order'] | ||
5 | strip://div[@id = 'my-profile'] | ||
6 | strip://div[@class = 'article-author'] | ||
7 | strip://div[@class = 'bg-f8f1d8 width-385 text-left'] | ||
8 | strip://div[@id = 'login-status'] | ||
9 | strip://div[@class = 'puff-padding'] | ||
10 | test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/es.hu.txt b/inc/3rdparty/site_config/standard/es.hu.txt new file mode 100644 index 00000000..19a1e9dd --- /dev/null +++ b/inc/3rdparty/site_config/standard/es.hu.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: concat(//div[@class='doc_author'], ' - ', upper-case(//div[@class='doc_title'])) | ||
2 | |||
3 | body: //div[@class='doc'] | ||
4 | |||
5 | prune: yes | ||
6 | |||
7 | tidy: yes | ||
8 | convert_double_br_tags: yes | ||
9 | |||
10 | strip: //a[contains(@href, 'www.facebook.com/pages/Elet-es-Irodalom/')] | ||
11 | test_url: http://www.es.hu/2010-12-08_vissza-a-partpenzt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/escapistmagazine.com.txt b/inc/3rdparty/site_config/standard/escapistmagazine.com.txt new file mode 100644 index 00000000..7e17a04d --- /dev/null +++ b/inc/3rdparty/site_config/standard/escapistmagazine.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_comments: no | ||
2 | test_url: http://www.escapistmagazine.com/articles/view/columns/extraconsideration/8717-Extra-Consideration-The-Story \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/espn.go.com.txt b/inc/3rdparty/site_config/standard/espn.go.com.txt new file mode 100644 index 00000000..319d352b --- /dev/null +++ b/inc/3rdparty/site_config/standard/espn.go.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 | ||
2 | body: //div[contains(@class, 'article')] | ||
3 | strip: //div[contains(@class, 'mod-inline')] | ||
4 | strip: //*/span[@class='page-actions'] | ||
5 | strip: //div[@class='page-actions']/* | ||
6 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 | ||
7 | strip: //div[@class='mod-blog-navigation'] | ||
8 | strip: //div[@class='monthday'] | ||
9 | strip: //div[@class='time'] | ||
10 | strip: //div[@class='timeofday'] | ||
11 | strip: //div[contains(@class, 'mod-conversations')] | ||
12 | test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/esquire.com.txt b/inc/3rdparty/site_config/standard/esquire.com.txt new file mode 100644 index 00000000..7566e8cc --- /dev/null +++ b/inc/3rdparty/site_config/standard/esquire.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1 | ||
2 | author: //div[@id='byline'] | ||
3 | |||
4 | body: //div[@id='printBody'] | ||
5 | |||
6 | single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/')) | ||
7 | |||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt b/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt new file mode 100644 index 00000000..88c8c560 --- /dev/null +++ b/inc/3rdparty/site_config/standard/essentialpublicradio.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //*[@itemprop='headline'] | ||
2 | author: //*[@itemprop='author'] | ||
3 | date: //*[@itemprop='datePublished'] | ||
4 | body: //*[@itemprop='articleBody'] | ||
5 | strip: //*[contains(@class, 'instapaper_ignore')] | ||
6 | test_url: http://www.essentialpublicradio.org/story/2011-11-14/volunteers-sought-federal-tax-assistance-program-pennsylvania-9421 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/etc.se.txt b/inc/3rdparty/site_config/standard/etc.se.txt new file mode 100644 index 00000000..58da5ef7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/etc.se.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | strip_id_or_class: 'left' | ||
2 | strip_id_or_class: 'right' | ||
3 | strip_id_or_class: 'block-belowcontent' | ||
4 | author: //span[@class = 'name']/a | ||
5 | date: //div[@class= 'datum'] | ||
6 | test_url: http://www.etc.se/intervju/lonsamt-att-radda-jorden \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt b/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt new file mode 100644 index 00000000..bfa2c5dc --- /dev/null +++ b/inc/3rdparty/site_config/standard/eternabuenosaires.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://eternabuenosaires.com/2011/09/calle-adolfo-bioy-casares \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/eurogamer.net.txt b/inc/3rdparty/site_config/standard/eurogamer.net.txt new file mode 100644 index 00000000..6ecdf6bd --- /dev/null +++ b/inc/3rdparty/site_config/standard/eurogamer.net.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[ @class='content' ] | //div[ @class='blog-entry' ] | ||
2 | |||
3 | strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')] | ||
4 | |||
5 | date://p[ @class='timestamp' ] | ||
6 | |||
7 | author://a[ @class='eurogamer-author' ] | ||
8 | test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/evo.co.uk.txt b/inc/3rdparty/site_config/standard/evo.co.uk.txt new file mode 100644 index 00000000..07162513 --- /dev/null +++ b/inc/3rdparty/site_config/standard/evo.co.uk.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | author: substring-after(//div[@class='articleauthor'],'By ') | ||
2 | |||
3 | # Blog posts | ||
4 | date: //div[@class='articledate'] | ||
5 | # News | ||
6 | date: //div[@class='articledate_b'] | ||
7 | |||
8 | body: //div[@class='articletext'] | ||
9 | |||
10 | convert_double_br_tags: yes | ||
11 | test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/expressen.se.txt b/inc/3rdparty/site_config/standard/expressen.se.txt new file mode 100644 index 00000000..d0cb283e --- /dev/null +++ b/inc/3rdparty/site_config/standard/expressen.se.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='article']/div[contains(@class, 'content')]/h1 | ||
2 | body: //div[@id='article']/div[contains(@class, 'content')] | ||
3 | date: //div[contains(@class, 'article-slot')]/descendant::div[contains(@id, 'articledates')] | ||
4 | |||
5 | strip: //img[contains(@src, 'img/px.gif')] | ||
6 | prune: no | ||
7 | # remove Facebook banner and obtrusive ad | ||
8 | strip: //div[@id='article']/div[contains(@class, 'content')]/div[contains(@class, 'art-right')] | ||
9 | test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/extracine.com.txt b/inc/3rdparty/site_config/standard/extracine.com.txt new file mode 100644 index 00000000..52b598da --- /dev/null +++ b/inc/3rdparty/site_config/standard/extracine.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://extracine.com/2011/09/straw-dogs-la-original \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/f1actual.com.txt b/inc/3rdparty/site_config/standard/f1actual.com.txt new file mode 100644 index 00000000..6ef2738a --- /dev/null +++ b/inc/3rdparty/site_config/standard/f1actual.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://f1actual.com/2011/09/previo-gran-premio-de-singapur \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/facta.co.jp.txt b/inc/3rdparty/site_config/standard/facta.co.jp.txt new file mode 100644 index 00000000..c17e0b8c --- /dev/null +++ b/inc/3rdparty/site_config/standard/facta.co.jp.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='content'] | ||
2 | |||
3 | test_url: http://facta.co.jp/blog/archives/20111026001026.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/falter.at.txt b/inc/3rdparty/site_config/standard/falter.at.txt new file mode 100644 index 00000000..b941b740 --- /dev/null +++ b/inc/3rdparty/site_config/standard/falter.at.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //h2[@class='related relatedTitle'] | ||
2 | author: //a[contains(@href, 'liste.php?author_id')] | ||
3 | |||
4 | # can't think of a better way unfortunately, really bad markup on this site | ||
5 | date: substring-after(//td[@style='width:85%;'], 'vom') | ||
6 | |||
7 | # not sure why, but instapaper seems to suck up the teaser paragraph | ||
8 | # not solved! | ||
9 | body: //div[contains(@class, 'teaser')] | ||
10 | body: //div[@id='content'] | ||
11 | |||
12 | # cleanup | ||
13 | strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif'] | ||
14 | strip: //div[@class='servicebox'] | ||
15 | strip: //h1 | ||
16 | strip: //br | ||
17 | strip: //td[@id='adcol'] | ||
18 | test_url: http://www.falter.at/web/print/detail.php?id=1634 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fanfiction.net.txt b/inc/3rdparty/site_config/standard/fanfiction.net.txt new file mode 100644 index 00000000..8d0c4daf --- /dev/null +++ b/inc/3rdparty/site_config/standard/fanfiction.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //*[@id = 'storytext'] | ||
2 | author: //a[starts-with(@href, '/u/')] | ||
3 | next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") | ||
4 | autodetect_next_page:yes | ||
5 | strip_id_or_class: 'a2a_kit' | ||
6 | test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fastcompany.com.txt b/inc/3rdparty/site_config/standard/fastcompany.com.txt new file mode 100644 index 00000000..5547a76c --- /dev/null +++ b/inc/3rdparty/site_config/standard/fastcompany.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //h1 | ||
2 | author: //h5[@class='byline']//a | ||
3 | date: //h5[@class='date'] | ||
4 | body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")] | ||
5 | strip_id_or_class: article-top-wrapper | ||
6 | strip_id_or_class: footer-message | ||
7 | strip_id_or_class: print-logo | ||
8 | strip: //cite | ||
9 | strip://*[@class='timestamp'] | ||
10 | strip://div[@id='page_right'] | ||
11 | strip://section[@id='header_region'] | ||
12 | strip://h1[@class='node-title'] | ||
13 | strip://div[@class='node-submitted'] | ||
14 | strip_id_or_class: skipnav | ||
15 | test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity | ||
16 | test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt new file mode 100644 index 00000000..4fe5968b --- /dev/null +++ b/inc/3rdparty/site_config/standard/faz.net.txt | |||
@@ -0,0 +1,30 @@ | |||
1 | # Title | ||
2 | title: //p[@class='Content HeadlineShort'] | ||
3 | |||
4 | # Authors | ||
5 | # some are known and have a link, others don't | ||
6 | author: substring-after(//span[@class='Autor'], 'Von') | ||
7 | |||
8 | # Date | ||
9 | date: //span[@class='Datum'] | ||
10 | |||
11 | # Body | ||
12 | body: //div[@class='Artikel'] | ||
13 | |||
14 | # Removals before body text | ||
15 | strip: //div[@class='Breadcrumbs'] | ||
16 | strip: //div[@class='QuickSearchBox'] | ||
17 | strip: //div[@class='FAZArtikelEinleitung'] | ||
18 | strip: //div[@class='FAZArtikelReiter'] | ||
19 | strip: //div[@class='clear'] | ||
20 | |||
21 | # General removals | ||
22 | strip: //span[@class='Bildnachweis'] | ||
23 | |||
24 | # Removals after body text | ||
25 | strip: //div[@class='ArtikelAbbinder'] | ||
26 | strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] | ||
27 | strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] | ||
28 | strip: //div[@class='FAZArtikelFunktionen'] | ||
29 | strip: //div[@id='FAZContentRight'] | ||
30 | test_url: http://www.faz.net/aktuell/gesellschaft/ehe-haltbarkeitsformel-verliebe-dich-oft-verlobe-dich-selten-heirate-vielleicht-11685306.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fictionpress.com.txt b/inc/3rdparty/site_config/standard/fictionpress.com.txt new file mode 100644 index 00000000..4a04e832 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fictionpress.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: id('storytext') | ||
2 | author: //a[starts-with(@href, '/u/')] | ||
3 | #next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='") | ||
4 | strip_id_or_class: 'a2a_kit' | ||
5 | test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ficwad.com.txt b/inc/3rdparty/site_config/standard/ficwad.com.txt new file mode 100644 index 00000000..3dbfe76f --- /dev/null +++ b/inc/3rdparty/site_config/standard/ficwad.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h4 | ||
2 | author: //span[@class="author"] | ||
3 | body: //div[@id="story"] | ||
4 | strip_id_or_class: summary | ||
5 | strip_id_or_class: meta | ||
6 | strip_id_or_class: storyfoot | ||
7 | convert_double_br_tags: yes | ||
8 | prune: no | ||
9 | |||
10 | # Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface. | ||
11 | |||
12 | test_url: http://www.ficwad.com/story/158977 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/finance.yahoo.com.txt b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt new file mode 100644 index 00000000..81c18fd3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/finance.yahoo.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | body: //div[@id='y-article-bd'] | ||
3 | body: //div[contains(@class, 'yom-art-content')] | ||
4 | strip: //div[contains(@class, 'related-companies')] | ||
5 | strip: //div[@id='y-article-related'] | ||
6 | strip: //div[@id='ypf-article-related'] | ||
7 | prune: no | ||
8 | |||
9 | single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')] | ||
10 | |||
11 | test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1 | ||
12 | test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt b/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt new file mode 100644 index 00000000..1a5cd2e1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/findtheswagger.tumblr.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | date: //div[@class='notes']/a | ||
2 | body: //div[@id='content'] | ||
3 | |||
4 | strip_id_or_class: tags | ||
5 | strip_id_or_class: permalink | ||
6 | strip_id_or_class: notes | ||
7 | strip_id_or_class: post_nav | ||
8 | strip: //div[@id='content']//h2 | ||
9 | strip_id_or_class: right_column | ||
10 | test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/firstthings.com.txt b/inc/3rdparty/site_config/standard/firstthings.com.txt new file mode 100644 index 00000000..dd56da22 --- /dev/null +++ b/inc/3rdparty/site_config/standard/firstthings.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='articleTitle'] | ||
2 | author: //div[@class='articleAuthor'] | ||
3 | body: //div[@class='articleContent'] | ||
4 | prune: no | ||
5 | convert_double_br_tags: yes | ||
6 | |||
7 | test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fivechapters.com.txt b/inc/3rdparty/site_config/standard/fivechapters.com.txt new file mode 100644 index 00000000..d9c5e42e --- /dev/null +++ b/inc/3rdparty/site_config/standard/fivechapters.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='entry'] | ||
2 | test_url: http://www.fivechapters.com/2010/paris-part-one/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fivefilters.org.txt b/inc/3rdparty/site_config/standard/fivefilters.org.txt new file mode 100644 index 00000000..dc1db432 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fivefilters.org.txt | |||
@@ -0,0 +1 @@ | |||
prune: no \ No newline at end of file | |||
diff --git a/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt b/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt new file mode 100644 index 00000000..3d7b45a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fivethirtyeight.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: substring-after(//title, 'Right:') | ||
2 | body: //div[@class = 'post-body'] | ||
3 | author: substring-after(//*[@class='post-author'], 'by') | ||
4 | date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a) | ||
5 | convert_double_br_tags: yes | ||
6 | |||
7 | test_url: http://www.fivethirtyeight.com/2010/07/does-rnc-have-structural-problems.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fm4.orf.at.txt b/inc/3rdparty/site_config/standard/fm4.orf.at.txt new file mode 100644 index 00000000..32d44c87 --- /dev/null +++ b/inc/3rdparty/site_config/standard/fm4.orf.at.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | author: //div[@class='authorDescription']/h2 | ||
2 | body: //div[@id='story'] | ||
3 | date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-') | ||
4 | title: //h1[@class='detail'] | ||
5 | strip: //div[@class='fact'] | ||
6 | |||
7 | test_url: http://fm4.orf.at/stories/1689156/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fnal.gov.txt b/inc/3rdparty/site_config/standard/fnal.gov.txt new file mode 100644 index 00000000..7faa6bfc --- /dev/null +++ b/inc/3rdparty/site_config/standard/fnal.gov.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: normalize-space(//h1) | ||
2 | |||
3 | author: //td/p[position()=last()]/em | ||
4 | |||
5 | # I swear, this is really the best way to do this | ||
6 | date: normalize-space(//td[contains(@style, "color: #ffffff")]) | ||
7 | |||
8 | # my god, it's full of tables | ||
9 | body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td | ||
10 | strip: //h1 | ||
11 | |||
12 | # the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output. | ||
13 | strip: //p[position()=last()]/em | ||
14 | strip: //p[position()=last()]/child::text() | ||
15 | test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/focus.de.txt b/inc/3rdparty/site_config/standard/focus.de.txt new file mode 100644 index 00000000..3ad5cabf --- /dev/null +++ b/inc/3rdparty/site_config/standard/focus.de.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] | ||
4 | |||
5 | date: //div[@class='articleHead']/span[@class='created'] | ||
6 | |||
7 | body: //div[@id='article'] | ||
8 | |||
9 | strip: //span[@class='markerText'] | ||
10 | strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created'] | ||
11 | strip: //div[@class='sidebar'] | ||
12 | strip: //div[@class='starbar'] | ||
13 | strip: //div[@class='actions clearfix'] | ||
14 | strip: //div[@id='commentForm'] | ||
15 | strip: //div[@id='commentSent'] | ||
16 | strip: //div[@id='comments'] | ||
17 | strip: //div[@class='similarityBlock'] | ||
18 | |||
19 | test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fool.com.txt b/inc/3rdparty/site_config/standard/fool.com.txt new file mode 100644 index 00000000..69867ccb --- /dev/null +++ b/inc/3rdparty/site_config/standard/fool.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class='entry-content'] | ||
2 | date: //meta[@name="date"]/@content | ||
3 | author: //meta[@name="author"]/@content | ||
4 | |||
5 | strip_id_or_class: ecapShell | ||
6 | strip_id_or_class: noindent | ||
7 | strip_id_or_class: targetedPromotion | ||
8 | |||
9 | prune: no | ||
10 | |||
11 | test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/forbes.com.txt b/inc/3rdparty/site_config/standard/forbes.com.txt new file mode 100644 index 00000000..2381b56a --- /dev/null +++ b/inc/3rdparty/site_config/standard/forbes.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //hgroup//h1 | ||
2 | title: //span[@class='mainarttitle'] | ||
3 | |||
4 | body: //div[@id='leftRail']//div[contains(@class, 'body')] | ||
5 | |||
6 | author: //meta[@name="author"]/@content | ||
7 | author: //span[@class='mainartauthor'] | ||
8 | |||
9 | date: substring-before(//hgroup//h6, '@') | ||
10 | date: //span[@class='mainartdate'] | ||
11 | |||
12 | prune: no | ||
13 | |||
14 | single_page_link: //a[contains(@href, '/print/')] | ||
15 | |||
16 | test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/foreignpolicy.com.txt b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt new file mode 100644 index 00000000..6ab7a091 --- /dev/null +++ b/inc/3rdparty/site_config/standard/foreignpolicy.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@id='art-mast']//h1 | ||
2 | author: substring-after(//span[@id='by-line'], 'BY ') | ||
3 | date: //span[@id='pub-date'] | ||
4 | body: //div[@id='art-mast']//h2 | //div[@id='art-mast']/h3 | //div[@id='art-body']//div[@class='translateBody'] | ||
5 | strip: //div[@id='share-box'] | ||
6 | prune: no | ||
7 | |||
8 | single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')] | ||
9 | |||
10 | test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me | ||
11 | test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/forsvaret.no.txt b/inc/3rdparty/site_config/standard/forsvaret.no.txt new file mode 100644 index 00000000..3085c8f2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/forsvaret.no.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@class="articleHeader"]/h1 | ||
2 | author: //p[@class="byline"] | ||
3 | date: //p[contains(@class,"publishedDate")]/span | ||
4 | # remove the right menu | ||
5 | strip: //div[contains(@class,"aside")] | ||
6 | # remove some SharePoint webpart label junk | ||
7 | strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"] | ||
8 | strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"] | ||
9 | test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/foxnews.com.txt b/inc/3rdparty/site_config/standard/foxnews.com.txt new file mode 100644 index 00000000..f1ee4851 --- /dev/null +++ b/inc/3rdparty/site_config/standard/foxnews.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | prune: no | ||
2 | |||
3 | author: //meta[@name="dc.publisher"]/@content | ||
4 | date: //meta[@name="dc.date"]/@content | ||
5 | strip: //p[contains(@class, 'contributor vcard')] | ||
6 | replace_string(<ul><li><div class="photo">): <div class="photo"> | ||
7 | strip: //p[a[contains(., 'Click here to read more on this story ')]] | ||
8 | |||
9 | test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/freelancer.com.txt b/inc/3rdparty/site_config/standard/freelancer.com.txt new file mode 100644 index 00000000..f3d5425c --- /dev/null +++ b/inc/3rdparty/site_config/standard/freelancer.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id="projectDetailsContent"]//td | ||
2 | |||
3 | test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/freytag-film.com.txt b/inc/3rdparty/site_config/standard/freytag-film.com.txt new file mode 100644 index 00000000..8dc0dabc --- /dev/null +++ b/inc/3rdparty/site_config/standard/freytag-film.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class = 'instapaperbody'] | ||
2 | convert_double_br_tags: no | ||
3 | date: //div[@class='instadate'] | ||
4 | title: //h2[@class = 'instatitle'] | ||
5 | test_url: http://freytag-film.com/blog/artikel/shooting_a_feature_film_in_10_days \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/friendskorner.com.txt b/inc/3rdparty/site_config/standard/friendskorner.com.txt new file mode 100644 index 00000000..39a9973f --- /dev/null +++ b/inc/3rdparty/site_config/standard/friendskorner.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | ||
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | ||
3 | body: (//div[starts-with(@id, 'post_message')])[1] | ||
4 | |||
5 | prune: no | ||
6 | tidy: no | ||
7 | |||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | ||
9 | #replace_string(</iframe>): </iframe> </div> | ||
10 | |||
11 | test_url: http://www.friendskorner.com/forum/f137/debate-personal-lives-leaders-west-vs-pakistan-must-read-297989/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ft.com.txt b/inc/3rdparty/site_config/standard/ft.com.txt new file mode 100644 index 00000000..38d9d326 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ft.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[contains(@class, 'ft-story-body')] | ||
2 | |||
3 | author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ') | ||
4 | date: substring-before(substring-after(//div[contains(@class, 'ft-story-header')]/p[2], 'Published:'), '|') | ||
5 | test_url: http://www.ft.com/cms/s/2/e1be4b5a-620c-11e0-8ee4-00144feab49a.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ftd.de.txt b/inc/3rdparty/site_config/standard/ftd.de.txt new file mode 100644 index 00000000..a58765b0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ftd.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft'] | ||
2 | single_page_link: //a[@class='icon print'] | ||
3 | |||
4 | test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html | ||
5 | test_url: http://www.ftd.de/it-medien/medien-internet/:verkauf-von-warner-music-musikbranche-auf-dem-sprung/60048185.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/fubiz.net.txt b/inc/3rdparty/site_config/standard/fubiz.net.txt new file mode 100644 index 00000000..8e6356bf --- /dev/null +++ b/inc/3rdparty/site_config/standard/fubiz.net.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class = 'entry'] | ||
2 | |||
3 | test_url: http://www.fubiz.net/2011/05/31/world-press-photo-2011/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/futurezone.at.txt b/inc/3rdparty/site_config/standard/futurezone.at.txt new file mode 100644 index 00000000..50fc144a --- /dev/null +++ b/inc/3rdparty/site_config/standard/futurezone.at.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | date: //span[@class='date'] | ||
2 | strip: //div[@class='postsidebar'] | ||
3 | body: //div[@class='singlepost'] | ||
4 | title: //div[@class='singlepost']/h1 | ||
5 | move_into(//div[@class='singlepost']): //div[@class='info'] | ||
6 | strip: //div[@class='gallery'] | ||
7 | strip: //div[@class='biggallery'] | ||
8 | strip: //ul[@class='social'] | ||
9 | strip: //ul[@class='social_mail'] | ||
10 | |||
11 | test_url: http://futurezone.at/future/5502-erste-galileo-satelliten-starten-ins-all.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gamasutra.com.txt b/inc/3rdparty/site_config/standard/gamasutra.com.txt new file mode 100644 index 00000000..35a8762a --- /dev/null +++ b/inc/3rdparty/site_config/standard/gamasutra.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | # default view title | ||
2 | title: //span[@class='newsTitle'] | ||
3 | # print view title | ||
4 | title: //h3[@class='title'] | ||
5 | |||
6 | # default view author | ||
7 | author: //span[@class='newsAuth']/a | ||
8 | author: substring-after(//span[@class='newsAuth'], 'by ') | ||
9 | |||
10 | # default view date | ||
11 | date: //td[@class='newsDate'] | ||
12 | |||
13 | # default view body | ||
14 | body: //td[@class='featureText'] | ||
15 | body: //td[@class='newsText'] | ||
16 | |||
17 | strip: //h3[@class='title'] | ||
18 | |||
19 | single_page_link: //a[contains(@href, '?print=1')] | ||
20 | test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gameblog.fr.txt b/inc/3rdparty/site_config/standard/gameblog.fr.txt new file mode 100644 index 00000000..2cc4b378 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gameblog.fr.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | strip_id_or_class: noprint | ||
7 | strip: //div[@id='gbNewsTextContent']/following-sibling::* | ||
8 | |||
9 | test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video | ||
10 | test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/garythink.com.txt b/inc/3rdparty/site_config/standard/garythink.com.txt new file mode 100644 index 00000000..1791e816 --- /dev/null +++ b/inc/3rdparty/site_config/standard/garythink.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | tidy: no | ||
2 | |||
3 | test_url: http://www.garythink.com/eft/testing.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gasteroprod.com.txt b/inc/3rdparty/site_config/standard/gasteroprod.com.txt new file mode 100644 index 00000000..ef68082a --- /dev/null +++ b/inc/3rdparty/site_config/standard/gasteroprod.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | # These should work, but don't. They were given by Firefox XPather extension | ||
2 | title: //article//header//a//h1 | ||
3 | body: //article//section | ||
4 | test_url: http://gasteroprod.com/blog/faut-il-continuer-a-supporter-internet-explorer-6.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gatopardo.com.txt b/inc/3rdparty/site_config/standard/gatopardo.com.txt new file mode 100644 index 00000000..74346328 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gatopardo.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[@class='panel'] | ||
2 | strip: //div[@style='float:right'] | ||
3 | strip: //span[@class='titulosHomePublicidad'] | ||
4 | strip: //div[@id='TitTop5Der'] | ||
5 | strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png'] | ||
6 | |||
7 | prune: yes | ||
8 | test_url: http://www.gatopardo.com/ReportajesGP.php?R=95 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gawker.com.txt b/inc/3rdparty/site_config/standard/gawker.com.txt new file mode 100644 index 00000000..6531d81a --- /dev/null +++ b/inc/3rdparty/site_config/standard/gawker.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class="post-body"] | ||
2 | |||
3 | # Remove 'content is restricted' | ||
4 | strip: //div[@id='agegate_IDHERE'] | ||
5 | |||
6 | test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/geeksofdoom.com.txt b/inc/3rdparty/site_config/standard/geeksofdoom.com.txt new file mode 100644 index 00000000..55586e1c --- /dev/null +++ b/inc/3rdparty/site_config/standard/geeksofdoom.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: substring-after(//span[@class='storyauthor'],'Posted by') | ||
2 | date: //span[@class='storydate'] | ||
3 | test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/geenstijl.nl.txt b/inc/3rdparty/site_config/standard/geenstijl.nl.txt new file mode 100644 index 00000000..f6dccf48 --- /dev/null +++ b/inc/3rdparty/site_config/standard/geenstijl.nl.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id = 'article'] | ||
2 | strip: //div[@id = 'klasbox'] | ||
3 | test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/getnews.jp.txt b/inc/3rdparty/site_config/standard/getnews.jp.txt new file mode 100644 index 00000000..537b4c2e --- /dev/null +++ b/inc/3rdparty/site_config/standard/getnews.jp.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='post'] | ||
2 | strip: //ul[@id='bookmark_single'] | ||
3 | test_url: http://getnews.jp/archives/117312 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/giantbomb.com.txt b/inc/3rdparty/site_config/standard/giantbomb.com.txt new file mode 100644 index 00000000..8a54bc07 --- /dev/null +++ b/inc/3rdparty/site_config/standard/giantbomb.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # 2011-11-19 - carlo@... - Initial setup. | ||
2 | |||
3 | strip_id_or_class: user-review-detail | ||
4 | strip: //h1 | ||
5 | |||
6 | body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"] | ||
7 | |||
8 | author: //span[@class="reviewer"] | //p[@class="byline"]/a/text() | ||
9 | date: //span[@class="dtreviewed"] | ||
10 | |||
11 | test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/giga.de.txt b/inc/3rdparty/site_config/standard/giga.de.txt new file mode 100644 index 00000000..f60199ad --- /dev/null +++ b/inc/3rdparty/site_config/standard/giga.de.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | tidy:no | ||
2 | title://h2[@class="title"] | ||
3 | # author:"Ben Miller" | ||
4 | date://div[@id="stats"]/span | ||
5 | strip_id_or_class:stats | ||
6 | strip_id_or_class:breadcrumbs | ||
7 | strip_id_or_class:gn-why-content | ||
8 | strip_id_or_class:single-social | ||
9 | strip_id_or_class:sidebar-ads | ||
10 | strip_id_or_class:sidebar-top | ||
11 | strip_id_or_class:footer | ||
12 | strip_id_or_class:post_meta | ||
13 | # strip_id_or_class: | ||
14 | # strip_id_or_class: | ||
15 | # strip_id_or_class: | ||
16 | # strip_id_or_class: | ||
17 | # strip_id_or_class: | ||
18 | # strip_id_or_class: | ||
19 | |||
20 | test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gigaom.com.txt b/inc/3rdparty/site_config/standard/gigaom.com.txt new file mode 100644 index 00000000..348bdf23 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gigaom.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | date: //meta[@name='DC.date.issued']/@content | ||
2 | date: //span[@class='post-meta the-date'] | ||
3 | |||
4 | title: //meta[@property='og:title']/@content | ||
5 | |||
6 | author: //meta[@name='DC.creator']/@content | ||
7 | |||
8 | body: //div[contains(@class, 'post-sub-head') or starts-with(@id, 'post-content-')] | ||
9 | |||
10 | find_string: id="content" | ||
11 | replace_string: id="content-ignore" | ||
12 | |||
13 | strip_id_or_class: sharedaddy | ||
14 | |||
15 | prune: no | ||
16 | |||
17 | test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gihyo.jp.txt b/inc/3rdparty/site_config/standard/gihyo.jp.txt new file mode 100644 index 00000000..478b23a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gihyo.jp.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //p[@id='skip']//a[contains(@href, 'skip')] | ||
2 | |||
3 | test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gist.github.com.txt b/inc/3rdparty/site_config/standard/gist.github.com.txt new file mode 100644 index 00000000..53095b34 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gist.github.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class="highlight"]/pre | ||
2 | |||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | test_url: https://gist.github.com/1258908 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt b/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt new file mode 100644 index 00000000..144ce045 --- /dev/null +++ b/inc/3rdparty/site_config/standard/givemesomethingtoread.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //div[@id="content"]//h2/a | ||
2 | |||
3 | test_url: http://givemesomethingtoread.com/post/6285838917/the-baddest-lawyer-in-the-history-of-jersey \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt b/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt new file mode 100644 index 00000000..285e76c0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gizmodo.co.uk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id="leadimage" or @class="postcontent"] | ||
2 | author: //div[@class="contentauthor"] | ||
3 | date: //div[@class="timestamp"] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gizmodo.com.txt b/inc/3rdparty/site_config/standard/gizmodo.com.txt new file mode 100644 index 00000000..c9536255 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gizmodo.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@class="post-body" or contains(@class, 'illustration top')] | ||
2 | author: (//cite//span[@class="plus-icon"])[1] | ||
3 | date: //span[@class="date"] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gizmologia.com.txt b/inc/3rdparty/site_config/standard/gizmologia.com.txt new file mode 100644 index 00000000..d2c7c9f9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gizmologia.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://gizmologia.com/2011/09/amd-trinity-el-sucesor-de-llano-en-una-demostracion-muy-interesante \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gizmovil.com.txt b/inc/3rdparty/site_config/standard/gizmovil.com.txt new file mode 100644 index 00000000..5fc204b8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gizmovil.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://gizmovil.com/2011/09/hipertextual-labs-receptor-bluetooth-nokia-bh-214 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/global.txt b/inc/3rdparty/site_config/standard/global.txt new file mode 100644 index 00000000..135ed500 --- /dev/null +++ b/inc/3rdparty/site_config/standard/global.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | # Look for Open Graph data - http://ogp.me | ||
2 | title: //meta[@property="og:title"]/@content | ||
3 | date: //meta[@property="article:published_time"]/@content | ||
4 | # article:author is sometimes a URL, e.g. on guardian.co.uk \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/globalissues.org.txt b/inc/3rdparty/site_config/standard/globalissues.org.txt new file mode 100644 index 00000000..95d4becf --- /dev/null +++ b/inc/3rdparty/site_config/standard/globalissues.org.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body: //div[@id='content'] | ||
2 | |||
3 | strip: //p[@class='top'] | ||
4 | strip: //h2[.='Where next?'] | ||
5 | strip_id_or_class: where-next | ||
6 | strip_id_or_class: social-bookmarks | ||
7 | strip_id_or_class: link-to-here | ||
8 | strip_id_or_class: options-heading | ||
9 | strip_id_or_class: page-options-content | ||
10 | strip_id_or_class: page-info-bottom | ||
11 | |||
12 | tidy: no | ||
13 | prune: no | ||
14 | |||
15 | test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/goal.com.txt b/inc/3rdparty/site_config/standard/goal.com.txt new file mode 100644 index 00000000..075c4d2b --- /dev/null +++ b/inc/3rdparty/site_config/standard/goal.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //div[@id='article_headline']//h1 | ||
2 | date: //div[contains(@class, 'articleDate')]//h4 | ||
3 | body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content'] | ||
4 | |||
5 | strip_id_or_class: relatedLinksBox | ||
6 | strip_id_or_class: betting-widget | ||
7 | strip_image_src: install_flash.gif | ||
8 | |||
9 | strip: //table[contains(@style, 'float: right; width: 285px;')] | ||
10 | strip: //div[@class='caption'] | ||
11 | |||
12 | tidy: no | ||
13 | prune: no | ||
14 | |||
15 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and- | ||
16 | test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to# \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/golem.de.txt b/inc/3rdparty/site_config/standard/golem.de.txt new file mode 100644 index 00000000..6c5d1c4f --- /dev/null +++ b/inc/3rdparty/site_config/standard/golem.de.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | # Jens Kohl, jens.kohl@... | ||
2 | # - Added publication date | ||
3 | # - Stripped pagination block | ||
4 | # - Added single page link | ||
5 | # - Added XPath queries for the printer-friendly version | ||
6 | |||
7 | title: //h1 | ||
8 | body: //div[@class='formatted'] | ||
9 | prune: no | ||
10 | |||
11 | date: substring-after(//li[2][@class="text1"], 'Datum:') | ||
12 | strip: //ol[@class="list-chapters"] | ||
13 | strip_comments: yes | ||
14 | |||
15 | # next: commands for printer friendly pages | ||
16 | single_page_link: //a[contains(@href, 'print.php?a=')]/@href | ||
17 | title: //body/h3 | ||
18 | strip_image_src: staticrl/images/logo.jpg | ||
19 | strip_image_src: http://cpx.golem.de/cpx.php?class=7 | ||
20 | strip: //body/h3 | ||
21 | strip: //body/b[1] | ||
22 | strip: //body/b[2] | ||
23 | strip: //body/b[3] | ||
24 | strip: //div[1] | ||
25 | test_url: http://www.golem.de/1112/88696.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/good.is.txt b/inc/3rdparty/site_config/standard/good.is.txt new file mode 100644 index 00000000..5cf67011 --- /dev/null +++ b/inc/3rdparty/site_config/standard/good.is.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[@class="title"]/div/h1 | ||
2 | body: //div[@class="body"] | ||
3 | date: //li[@class="date-time"] | ||
4 | test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gossip-tv.gr.txt b/inc/3rdparty/site_config/standard/gossip-tv.gr.txt new file mode 100644 index 00000000..c2fe4e40 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gossip-tv.gr.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | date: //meta[@name='og:article:published_time']/@value | ||
2 | |||
3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] | ||
4 | |||
5 | strip_id_or_class: itemImageGallery | ||
6 | |||
7 | # remove extras at end of post content | ||
8 | find_string: <div style="margin:5px 0 10px;"> | ||
9 | replace_string: </div></body></html><!-- | ||
10 | |||
11 | prune: no | ||
12 | |||
13 | test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous | ||
14 | test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gothamist.com.txt b/inc/3rdparty/site_config/standard/gothamist.com.txt new file mode 100644 index 00000000..5179fc12 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gothamist.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='entry-header'] | ||
2 | author: //span[@class='vcard author'] | ||
3 | date: //abbr[@class='published'] | ||
4 | #move_into(//div[@class='entry-body']): //img[@id='photo_1'] | ||
5 | body: //div[@class='entry-body'] | ||
6 | strip: //div[@class='galleryEaseThumbs'] | ||
7 | test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gotomanager.com.txt b/inc/3rdparty/site_config/standard/gotomanager.com.txt new file mode 100644 index 00000000..7fb0ee03 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gotomanager.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | title: //span[@id="showTitle"] | ||
2 | author: //span[@id="showAuthor"] | ||
3 | date: //span[@id="showRefDate"] | ||
4 | |||
5 | strip: //span[@class="black_bold"] | ||
6 | strip: //div[@id="sectionName"] | ||
7 | strip: //div[@id="storyHeader"] | ||
8 | |||
9 | body: //div[@id="newsBodyText"] | ||
10 | |||
11 | strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif" | ||
12 | strip_image_src: "http://www.gotomanager.com/images/separator.gif" | ||
13 | strip_image_src: "http://www.gotomanager.com/images/spaces.gif" | ||
14 | |||
15 | convert_double_br_tags: yes | ||
16 | tidy: yes | ||
17 | |||
18 | strip: //div[@id="smallLeadImage"] | ||
19 | strip: //div[@id="truehitsSurvey"] | ||
20 | strip: //table[@id="relatedInfoTable"] | ||
21 | test_url: http://www.gotomanager.com/news/details.aspx?id=86759 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gq.com.txt b/inc/3rdparty/site_config/standard/gq.com.txt new file mode 100644 index 00000000..233c4a7f --- /dev/null +++ b/inc/3rdparty/site_config/standard/gq.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a | ||
2 | strip_id_or_class: utility | ||
3 | strip_id_or_class: keywords | ||
4 | strip_id_or_class: pagination | ||
5 | strip_id_or_class: position2_content | ||
6 | body: //div[@class='article'] | ||
7 | title: //h1[@class='content-headline'] | ||
8 | author: //span[@class='contributor']//a | ||
9 | test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/grantland.com.txt b/inc/3rdparty/site_config/standard/grantland.com.txt new file mode 100644 index 00000000..3269e086 --- /dev/null +++ b/inc/3rdparty/site_config/standard/grantland.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | # this is fragile with footnotes -- leave it for now | ||
2 | |||
3 | #tidy: no | ||
4 | #prune: no | ||
5 | #move_into(//article): //aside[@id='footnotes'] | ||
6 | author: //cite/a | ||
7 | date: //time | ||
8 | |||
9 | strip: //a[text()='Grantland'] | ||
10 | strip_id_or_class: ad-wrapper | ||
11 | strip_id_or_class: fb-connect-link | ||
12 | strip_id_or_class: fb-status | ||
13 | strip: //li[@class='print'] | ||
14 | strip: //cite | ||
15 | strip: //a[contains(text(), '[+]')] | ||
16 | strip: //a[@id='jump-nav-link'] | ||
17 | strip: //h1[text()='Share This'] | ||
18 | strip: //h1[text()='Top Stories'] | ||
19 | strip: //div[@id="update-text-size"] | ||
20 | test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt b/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt new file mode 100644 index 00000000..a5258030 --- /dev/null +++ b/inc/3rdparty/site_config/standard/greatergreaterwashington.org.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@class="blogpost"]/h2 | ||
2 | author: //div[@class="blogpost"]/p[@class="byline"]/a | ||
3 | date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"] | ||
4 | body: //div[@class="blogpost"] | ||
5 | strip_id_or_class: flag | ||
6 | strip_id_or_class: byline | ||
7 | strip_id_or_class: post_footer | ||
8 | strip_id_or_class: related_posts | ||
9 | strip_id_or_class: post_author_bios | ||
10 | strip: //h2 | ||
11 | test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/groups.drupal.org.txt b/inc/3rdparty/site_config/standard/groups.drupal.org.txt new file mode 100644 index 00000000..7e15a5c1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/groups.drupal.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title://h1 | ||
2 | author://span[@class="submitted"]/a | ||
3 | date:substring-after(//span[@class="submitted"],'on ') | ||
4 | body://div[@class="content"] | ||
5 | test_url: http://groups.drupal.org/node/36816 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/guardian.co.uk.txt b/inc/3rdparty/site_config/standard/guardian.co.uk.txt new file mode 100644 index 00000000..71d84306 --- /dev/null +++ b/inc/3rdparty/site_config/standard/guardian.co.uk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@id='main-article-info']//h1 | ||
2 | body: //div[@id='article-wrapper'] | ||
3 | date: //li[@class='publication']//time[@pubdate] | //li[@class='publication']//data[@pubdate] | ||
4 | author: //li[@class='byline'] | ||
5 | prune: no | ||
6 | tidy: no | ||
7 | test_url: http://www.guardian.co.uk/business/2011/oct/06/quantitative-easing-75bn-bank-of-england \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/gulfnews.com.txt b/inc/3rdparty/site_config/standard/gulfnews.com.txt new file mode 100644 index 00000000..e69044b3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/gulfnews.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article'] | ||
2 | strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1] | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/guokr.com.txt b/inc/3rdparty/site_config/standard/guokr.com.txt new file mode 100644 index 00000000..00255eb8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/guokr.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | # To administrator: | ||
2 | # Please change the hostname to "www.guokr.com/article/*" | ||
3 | # Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com | ||
4 | |||
5 | # This filter is tested on: | ||
6 | # http://www.guokr.com/article/274325/ | ||
7 | # http://www.guokr.com/article/275013/ | ||
8 | |||
9 | title://h1 | ||
10 | author://div[contains(@class, 'content-th-info')]/a | ||
11 | date://div[contains(@class, 'content-th-info')]/span | ||
12 | body://div[contains(@class, 'Content')] | ||
13 | |||
14 | strip://div[contains(@class, 'bottom-i')] | ||
15 | strip://div[contains(@class, 'copyright')] | ||
16 | strip://div[contains(@class, 'fr')] | ||
17 | strip://div[contains(@class, 'content-th-info')] | ||
18 | strip://h1[contains(@id, 'articleTitle')] | ||
19 | strip://div[contains(@class, 'side')] | ||
20 | strip://div[contains(@class, 'top-wp')] | ||
21 | test_url: http://www.guokr.com/article/275013/ | ||
22 | test_url: http://www.guokr.com/article/338387/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/haberler.com.txt b/inc/3rdparty/site_config/standard/haberler.com.txt new file mode 100644 index 00000000..bc1ce689 --- /dev/null +++ b/inc/3rdparty/site_config/standard/haberler.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@id="habermetni"]/h1[@id="haber_baslik"] | ||
2 | body: //div[@id="habermetni"]/p | ||
3 | strip: //img[@class='newsDetailLeft'] | ||
4 | strip_image_src: /haber-resimleri/ | ||
5 | test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/halo.bungie.org.txt b/inc/3rdparty/site_config/standard/halo.bungie.org.txt new file mode 100644 index 00000000..7989d09f --- /dev/null +++ b/inc/3rdparty/site_config/standard/halo.bungie.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title:substring-before(id("maincontent")/table, 'Posted') | ||
2 | body:id("maincontent")/p | ||
3 | # eventually convert linebreaks better | ||
4 | |||
5 | test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt b/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt new file mode 100644 index 00000000..747f90a1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hammers.theoffside.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | # Remove right column | ||
2 | strip: //*[(@class = 'right_col')] | ||
3 | |||
4 | # Remove comments etc. | ||
5 | strip: //*[(@class = 'category')] | ||
6 | strip: /html/body/div[1][@class='absolute_content_high']/div[1][@class='wrapper']/div[1][@class='main_col']/div[@class='main_content']/h3 | ||
7 | test_url: http://hammers.theoffside.com/carling-cup/a-funny-thing-happened-on-the-way-to-4-nil.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hanselman.com.txt b/inc/3rdparty/site_config/standard/hanselman.com.txt new file mode 100644 index 00000000..d3ffeab1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hanselman.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | date: //span[@class="item-date"] | ||
2 | body: //div[@class="item-content"] | ||
3 | strip_comments: no | ||
4 | test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hardware.fr.txt b/inc/3rdparty/site_config/standard/hardware.fr.txt new file mode 100644 index 00000000..318885c8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hardware.fr.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | author: //a[@class='a_aut'] | ||
3 | body: //div[@class='content_dossier'] | ||
4 | strip: //div[@id='pagination'] | ||
5 | next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href | ||
6 | test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hbr.org.txt b/inc/3rdparty/site_config/standard/hbr.org.txt new file mode 100644 index 00000000..fd6145e7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hbr.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@id='article-title'] | ||
2 | author: //div[@id='articleAuthors'] | ||
3 | body: //div[@id='article'] | ||
4 | strip: //div[@class='module wide'] | ||
5 | next_page_link: //a[@title='Next Page'] | ||
6 | test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/healthland.time.com.txt b/inc/3rdparty/site_config/standard/healthland.time.com.txt new file mode 100644 index 00000000..204d8da0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/healthland.time.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | date: //span[@class = 'date'] | ||
2 | body: //div[@class = 'entry-content'] | ||
3 | strip://div[@class='more-ways'] | ||
4 | strip://div[@id = 'stayConnected'] | ||
5 | strip://p[child::a[@rel = 'bookmark']] | ||
6 | strip://p[starts-with(string(.),'(MORE:')] | ||
7 | strip://p[starts-with(string(.),'(PHOTOS:')] | ||
8 | move_into(//p[../@class = 'entry-content'][position() = last()])://div[@id = 'featbox'] | ||
9 | |||
10 | test_url: http://healthland.time.com/2011/07/24/amy-winehouse-and-the-pain-of-addiction/?preview=true&preview_id=39210&preview_nonce=0777d4e408 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/heise-online.mobi.txt b/inc/3rdparty/site_config/standard/heise-online.mobi.txt new file mode 100644 index 00000000..1da82ac7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/heise-online.mobi.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='content']/div | ||
2 | date: //p[@class='author_date']/span[@class='date'] | ||
3 | test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/heise.de.txt b/inc/3rdparty/site_config/standard/heise.de.txt new file mode 100644 index 00000000..5f19d3f8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/heise.de.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | single_page_link: //p[@class='news_option']/a | ||
2 | |||
3 | date: //p[@class='news_datum'] | ||
4 | title: //h1 | ||
5 | body: //div[@class='meldung_wrapper'] | ||
6 | |||
7 | test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hespress.com.txt b/inc/3rdparty/site_config/standard/hespress.com.txt new file mode 100644 index 00000000..d866f629 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hespress.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body'] | ||
2 | |||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://hespress.com/videos/73684.html | ||
7 | test_url: http://hespress.com/permalink/73678.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/highscalability.com.txt b/inc/3rdparty/site_config/standard/highscalability.com.txt new file mode 100644 index 00000000..fd50b6ad --- /dev/null +++ b/inc/3rdparty/site_config/standard/highscalability.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='journal-entry-text'] | ||
2 | |||
3 | test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hiperpop.com.txt b/inc/3rdparty/site_config/standard/hiperpop.com.txt new file mode 100644 index 00000000..b5eb062e --- /dev/null +++ b/inc/3rdparty/site_config/standard/hiperpop.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://hiperpop.com/2011/09/marc-anthony-celebra-su-cumpleanos-con-jennifer-lopez \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt b/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt new file mode 100644 index 00000000..c57c1aa9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hiphopleeft.nl.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class = 'pd'] | ||
2 | strip: //div[@id = 'overzicht-albumrecensies'] | ||
3 | strip: //div[@id = 'jc'] | ||
4 | test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/historytoday.com.txt b/inc/3rdparty/site_config/standard/historytoday.com.txt new file mode 100644 index 00000000..dc687f3f --- /dev/null +++ b/inc/3rdparty/site_config/standard/historytoday.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body://div[@id = 'content'] | ||
2 | author://span[@class = 'authors'] | ||
3 | author://span[@class = 'ht-vtag'][1] | ||
4 | date:substring-before(//meta[@name = 'dc.date']/@content,'T') | ||
5 | strip://div[contains(@class, 'region-ubercontent')] | ||
6 | strip://h1 | ||
7 | strip://div[@id = 'ht-author'] | ||
8 | strip://ul[@class = 'links inline'] | ||
9 | strip://div[@id = 'ht-tools'] | ||
10 | test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hmercer.com.txt b/inc/3rdparty/site_config/standard/hmercer.com.txt new file mode 100644 index 00000000..eeee1594 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hmercer.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //*[@class='ptitle'] | ||
2 | date: //span[@class='date'] | ||
3 | body: //div[@class='body'] | ||
4 | prune: no | ||
5 | test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hometheaterreview.com.txt b/inc/3rdparty/site_config/standard/hometheaterreview.com.txt new file mode 100644 index 00000000..d43e6448 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hometheaterreview.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id='entry-body'] | ||
2 | strip_id_or_class: paginate | ||
3 | strip: //p[contains(., 'Additional Resources')] | ||
4 | test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hosted.ap.org.txt b/inc/3rdparty/site_config/standard/hosted.ap.org.txt new file mode 100644 index 00000000..e19dd526 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hosted.ap.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content'] | ||
2 | tidy: no | ||
3 | strip_image_src: analytics.apnewsregistry | ||
4 | |||
5 | test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hs.fi.txt b/inc/3rdparty/site_config/standard/hs.fi.txt new file mode 100644 index 00000000..67125fb5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hs.fi.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | prune: yes | ||
2 | tidy: yes | ||
3 | test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ht.ly.txt b/inc/3rdparty/site_config/standard/ht.ly.txt new file mode 100644 index 00000000..a8412d2a --- /dev/null +++ b/inc/3rdparty/site_config/standard/ht.ly.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //iframe[@id='hootFrame']/@src | ||
2 | |||
3 | test_url: http://ht.ly/bOiZV \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/huffingtonpost.com.txt b/inc/3rdparty/site_config/standard/huffingtonpost.com.txt new file mode 100644 index 00000000..d40513b2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/huffingtonpost.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')] | ||
3 | date: //meta[@name="publish_date"]/@content | ||
4 | author: //a[@rel="author"] | ||
5 | author: //meta[@name="author"]/@content | ||
6 | prune: no | ||
7 | tidy: no | ||
8 | strip: //footer | ||
9 | strip_id_or_class: ps-slideshow | ||
10 | strip_id_or_class: fs-slideshow | ||
11 | strip: //p[contains(., 'Related on HuffPost:')] | ||
12 | # end early | ||
13 | replace_string(<div class="sbm-main): </body></html><div class="not-interested | ||
14 | |||
15 | test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html | ||
16 | test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/humantransit.org.txt b/inc/3rdparty/site_config/standard/humantransit.org.txt new file mode 100644 index 00000000..ec7d3c06 --- /dev/null +++ b/inc/3rdparty/site_config/standard/humantransit.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h3[@class="entry-header"] | ||
2 | date: //h2[@class="date-header"] | ||
3 | body: //div[contains(@class, 'entry')] | ||
4 | |||
5 | test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt b/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt new file mode 100644 index 00000000..ccf09dcc --- /dev/null +++ b/inc/3rdparty/site_config/standard/hurriyet.com.tr.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='HaberDetayTitleHold Title']/h1 | ||
2 | body: //div[@id='YazarDetayText'] | ||
3 | author: //div[@class='HaberDetayTitleHold Title']/h1 | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp | ||
7 | test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hvg.hu.txt b/inc/3rdparty/site_config/standard/hvg.hu.txt new file mode 100644 index 00000000..06fa98d8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hvg.hu.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='pg-content']//h1 | ||
2 | body: //div[@id='articleBody0'] | ||
3 | replace_string(</table>): </table><br /><br /> | ||
4 | |||
5 | single_page_link: //div[@class="up-header"]/a | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://hvg.hu/w/20111125_sparta \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/hypebeast.com.txt b/inc/3rdparty/site_config/standard/hypebeast.com.txt new file mode 100644 index 00000000..49b46da5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/hypebeast.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1] | ||
2 | author: //span[@class='author']/a | ||
3 | |||
4 | strip_id_or_class: disqus | ||
5 | strip_id_or_class: paginator | ||
6 | strip_id_or_class: photo-number | ||
7 | |||
8 | prune: no | ||
9 | |||
10 | test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/idlewords.com.txt b/inc/3rdparty/site_config/standard/idlewords.com.txt new file mode 100644 index 00000000..e1badef7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/idlewords.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //a[@class='post_title'] | ||
2 | body: //div[@class='entrybox'] | ||
3 | strip_id_or_class: post_title | ||
4 | date: //div[@class='entrybox']/b[1] | ||
5 | strip: //div[@class='entrybox']/b[1] | ||
6 | author: string('Maciej Cegłowski') | ||
7 | test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/igeneration.fr.txt b/inc/3rdparty/site_config/standard/igeneration.fr.txt new file mode 100644 index 00000000..d7ec2da1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/igeneration.fr.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ') | ||
2 | date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- '))) | ||
3 | body: //div[@class='content clear-block zoneApple'] | ||
4 | |||
5 | test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt b/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt new file mode 100644 index 00000000..f74178a9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ignoredbydinosaurs.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title://h1[@class='page-title'] | ||
2 | body://*[@id='content']//div[contains(@class,'node-content')] | ||
3 | |||
4 | author://*[@id='content']//div[contains(@class,'node-submitted')]/a | ||
5 | |||
6 | date:substring-after(//div[contains(@class,'node-submitted')],' on ') | ||
7 | test_url: http://ignoredbydinosaurs.com/2011/09/great-lie-lorem-ipsum \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ilounge.com.txt b/inc/3rdparty/site_config/standard/ilounge.com.txt new file mode 100644 index 00000000..ca1e54a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ilounge.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | # Get proper Title, Author and Date info | ||
2 | title: substring-before(//title, '|') | ||
3 | author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By') | ||
4 | date: //span[@class='instapaper_date'] | ||
5 | |||
6 | # For Reviews & First Looks, get the intro paragraph and put it in front of the main body. | ||
7 | move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body'] | ||
8 | body: //div[@id='instapaper_para1'] | ||
9 | strip: //div[@class='reviewinfo'] | ||
10 | |||
11 | # We don't use footnotes, so why bother checking for them? | ||
12 | footnotes: no | ||
13 | test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ilyabirman.ru.txt b/inc/3rdparty/site_config/standard/ilyabirman.ru.txt new file mode 100644 index 00000000..da6a60f6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ilyabirman.ru.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='published visible e2-smart-title']//span | ||
2 | author: //span[@id='e2-blog-title'] | ||
3 | date: //p[@class='super-h'] | ||
4 | body: //div[@class='text published visible'] | ||
5 | test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/inc.com.txt b/inc/3rdparty/site_config/standard/inc.com.txt new file mode 100644 index 00000000..0589aaae --- /dev/null +++ b/inc/3rdparty/site_config/standard/inc.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | author: substring-after(substring-before(//div[@id='byline'],'|'),'By') | ||
2 | author: //div[@class='byline']/a | ||
3 | date: //span[@class='pubdate'] | ||
4 | # print friendly page | ||
5 | body: //div[@id='text'] | ||
6 | # regular page | ||
7 | body: //div[@id= 'articlecontent'] | ||
8 | |||
9 | strip: //div[@id= 'articlecontent']/h1 | ||
10 | strip: //div[@id='articlecontent']/p[@class='deck'] | ||
11 | strip: //div[@id='articlecontent']/div[@class='byline'] | ||
12 | strip: //div[@id='articlespacer'] | ||
13 | strip: //div[@id='incsharebox'] | ||
14 | strip: //div[@id='articlesidebar'] | ||
15 | |||
16 | prune: no | ||
17 | |||
18 | single_page_link: //a[contains(@href, 'Printer_Friendly.html')] | ||
19 | strip: //a[contains(., 'Dig Deeper')] | ||
20 | test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html | ||
21 | test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/independent.co.uk.txt b/inc/3rdparty/site_config/standard/independent.co.uk.txt new file mode 100644 index 00000000..47baf36b --- /dev/null +++ b/inc/3rdparty/site_config/standard/independent.co.uk.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | body: //div[contains(@class, 'articleContent')] | ||
3 | date: //meta[@property='article:published_time']/@content | ||
4 | author: //div[@id='main']//div[@class='byline']//span[@class='authorName'] | ||
5 | |||
6 | strip_id_or_class: RelatedArtTag | ||
7 | |||
8 | tidy: no | ||
9 | test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/index.php b/inc/3rdparty/site_config/standard/index.php index a3d5f739..a1b767fd 100644 --- a/inc/3rdparty/site_config/standard/index.php +++ b/inc/3rdparty/site_config/standard/index.php | |||
@@ -1,3 +1,3 @@ | |||
1 | <?php | 1 | <?php |
2 | // this is here to prevent directory listing over the web | 2 | // this is here to prevent directory listing over the web |
3 | ?> \ No newline at end of file | 3 | ?> \ No newline at end of file |
diff --git a/inc/3rdparty/site_config/standard/indiatimes.com.txt b/inc/3rdparty/site_config/standard/indiatimes.com.txt new file mode 100644 index 00000000..e7a35e84 --- /dev/null +++ b/inc/3rdparty/site_config/standard/indiatimes.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //figure[@class='mainVideo'] | ||
2 | strip: //figcaption | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/inessential.com.txt b/inc/3rdparty/site_config/standard/inessential.com.txt new file mode 100644 index 00000000..312cec4b --- /dev/null +++ b/inc/3rdparty/site_config/standard/inessential.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='weblogPost']/h3[1] | ||
2 | author: ("Brent Simmons") | ||
3 | date: //span[@class="weblogPostDisplayDate"] | ||
4 | body: //div[@class='weblogPostBody'] | ||
5 | test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/info.abril.com.br.txt b/inc/3rdparty/site_config/standard/info.abril.com.br.txt new file mode 100644 index 00000000..64cf3c8e --- /dev/null +++ b/inc/3rdparty/site_config/standard/info.abril.com.br.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title://h1 | ||
2 | body://div[@id='texto_link'] | ||
3 | |||
4 | test_url: http://info.abril.com.br/noticias/internet/filme-do-youtube-vai-estrear-nos-cinemas-22042011-6.shl \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/infoq.com.txt b/inc/3rdparty/site_config/standard/infoq.com.txt new file mode 100644 index 00000000..3a4e402d --- /dev/null +++ b/inc/3rdparty/site_config/standard/infoq.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | body: //div[@id="intTranscript"] | ||
2 | body: //div[@class="box-content"] | ||
3 | title: //div[@class="box-content"]//h1[1] | ||
4 | author: //p[@class="info"]/strong | ||
5 | date: substring-before(substring-after(//p[@class="info"], "on"), "Length") | ||
6 | strip: //div[@class="box-content"]//h1[1] | ||
7 | strip: //div[@class="box-content"]//p[@class="info"] | ||
8 | strip_id_or_class: vendor-content-box | ||
9 | strip_id_or_class: tags2 | ||
10 | strip_id_or_class: instructions | ||
11 | strip_id_or_class: comments | ||
12 | strip_id_or_class: forum-list-tree | ||
13 | strip: //div[@class="addthis_toolbox addthis_default_style"] | ||
14 | test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/informador.com.mx.txt b/inc/3rdparty/site_config/standard/informador.com.mx.txt new file mode 100644 index 00000000..eedec24f --- /dev/null +++ b/inc/3rdparty/site_config/standard/informador.com.mx.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@class='tituloInt'] | ||
2 | body: //div[@class='notaPortada'] | ||
3 | strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota'] | ||
4 | date: //span[@class='publi'] | ||
5 | author: //span[@class='autor'] | ||
6 | tidy: no | ||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/information.dk.txt b/inc/3rdparty/site_config/standard/information.dk.txt new file mode 100644 index 00000000..6e3c3b1a --- /dev/null +++ b/inc/3rdparty/site_config/standard/information.dk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | author: //*[@property='dc:creator'] | ||
3 | date: //*[@property='dc:date']/@content | ||
4 | body: //div[@id='page-content']//div[contains(@class, 'article-body')] | ||
5 | |||
6 | tidy: no | ||
7 | test_url: http://www.information.dk/282307 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/informationarchitects.net.txt b/inc/3rdparty/site_config/standard/informationarchitects.net.txt new file mode 100644 index 00000000..134306cd --- /dev/null +++ b/inc/3rdparty/site_config/standard/informationarchitects.net.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title://h1[@class="post_title"] | ||
2 | body://article[@class="post"] | ||
3 | date://h1[@class="section_separator"] | ||
4 | author://span[@class="post_author"] | ||
5 | strip://nav[@class="arrow_nav"] | ||
6 | strip://section[@id="contact"] | ||
7 | strip_id_or_class:post_title | ||
8 | strip_id_or_class:post_author | ||
9 | strip_id_or_class:section_separator | ||
10 | test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt b/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt new file mode 100644 index 00000000..0879e9e6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/informationclearinghouse.info.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //head/title | ||
2 | body: //table[@id='table3']//div[@class='postContent'] | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://www.informationclearinghouse.info/article28238.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/informit.com.txt b/inc/3rdparty/site_config/standard/informit.com.txt new file mode 100644 index 00000000..84c1fdcf --- /dev/null +++ b/inc/3rdparty/site_config/standard/informit.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@id='content']/h1 | ||
2 | body: //div[@id="content"] | ||
3 | strip: //img[contains(@src, 'informit_printer.png')] | ||
4 | single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')] | ||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.informit.com/articles/article.aspx?p=1729268 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/infoworld.com.txt b/inc/3rdparty/site_config/standard/infoworld.com.txt new file mode 100644 index 00000000..dd588ed8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/infoworld.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@id='main_text'] | ||
2 | title: //div[@id='main_text']/h1 | ||
3 | strip: //div[@id='main_text']/h1 | ||
4 | strip: //div[@id='main_text']/h2 | ||
5 | strip_id_or_class: tools | ||
6 | strip_id_or_class: articleTools | ||
7 | strip_id_or_class: pagination | ||
8 | strip_id_or_class: byline | ||
9 | strip_id_or_class: tweet | ||
10 | date: //div[@class='date'] | ||
11 | strip: //div[@class='date'] | ||
12 | test_url: http://www.infoworld.com/d/the-industry-standard/it-jobs-the-rise-both-offshore-and-in-us-187689 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/infzm.com.txt b/inc/3rdparty/site_config/standard/infzm.com.txt new file mode 100644 index 00000000..012c873f --- /dev/null +++ b/inc/3rdparty/site_config/standard/infzm.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | # This filter is tested on: | ||
2 | # http://www.infzm.com/content/71068 | ||
3 | # http://www.infzm.com/content/41577 | ||
4 | |||
5 | author://em[contains(@class, 'toAuthor')] | ||
6 | date:substring(//em[contains(@class, 'pubTime')],1) | ||
7 | body://section[contains(@id, 'articleContent')] | ||
8 | title://h1[contains(@class ,'articleHeadline clearfix')] | ||
9 | test_url: http://www.infzm.com/content/41577 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/inhabitat.com.txt b/inc/3rdparty/site_config/standard/inhabitat.com.txt new file mode 100644 index 00000000..6629dafe --- /dev/null +++ b/inc/3rdparty/site_config/standard/inhabitat.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | # set body | ||
2 | body: //div[@class='post-listing'] | ||
3 | |||
4 | # remove clutter | ||
5 | strip: //a/big | ||
6 | strip: //a/em | ||
7 | strip: //p/em | ||
8 | test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/instagr.am.txt b/inc/3rdparty/site_config/standard/instagr.am.txt new file mode 100644 index 00000000..ad9e8214 --- /dev/null +++ b/inc/3rdparty/site_config/standard/instagr.am.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='caption'] | ||
2 | author: //p[@class='username'] | ||
3 | |||
4 | strip: //div[@class='contents']/h3 | ||
5 | strip: //div[@class='location'] | ||
6 | test_url: http://instagr.am/p/G-s_aciyDJ/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/interest.co.nz.txt b/inc/3rdparty/site_config/standard/interest.co.nz.txt new file mode 100644 index 00000000..28c3310a --- /dev/null +++ b/inc/3rdparty/site_config/standard/interest.co.nz.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@id='content'] | ||
2 | test_url: http://www.interest.co.nz/opinion/opinion-when-our-fear-corporate-way-and-our-love-small-business-man-dangerous-thing \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/iolanguage.com.txt b/inc/3rdparty/site_config/standard/iolanguage.com.txt new file mode 100644 index 00000000..231875ad --- /dev/null +++ b/inc/3rdparty/site_config/standard/iolanguage.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //center/table | ||
2 | test_url: http://www.iolanguage.com/scm/io/docs/IoGuide.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ipadclub.nl.txt b/inc/3rdparty/site_config/standard/ipadclub.nl.txt new file mode 100644 index 00000000..d196059e --- /dev/null +++ b/inc/3rdparty/site_config/standard/ipadclub.nl.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | ||
2 | strip: //div[@class = 'postinfo'] | ||
3 | strip: //div[@id = 'postmetanew'] | ||
4 | strip: //div[@class = 'paginator'] | ||
5 | strip: //div[@class = 'col-2'] | ||
6 | strip: //div[@id = 'adfactor-label'] | ||
7 | test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ipadplanet.nl.txt b/inc/3rdparty/site_config/standard/ipadplanet.nl.txt new file mode 100644 index 00000000..a2e49005 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ipadplanet.nl.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | ||
2 | strip: //div[@class = 'postinfo'] | ||
3 | strip: //div[@id = 'postmetanew'] | ||
4 | strip: //div[@class = 'paginator'] | ||
5 | strip: //div[@class = 'col-2'] | ||
6 | strip: //div[@id = 'adfactor-label'] | ||
7 | test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/iphoneclub.nl.txt b/inc/3rdparty/site_config/standard/iphoneclub.nl.txt new file mode 100644 index 00000000..f8d4f6a6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/iphoneclub.nl.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id = 'post'] | ||
2 | strip: //div[@class = 'postinfo'] | ||
3 | strip: //div[@id = 'postmetanew'] | ||
4 | strip: //div[@class = 'paginator'] | ||
5 | strip: //div[@class = 'col-2'] | ||
6 | strip: //div[@id = 'adfactor-label'] | ||
7 | test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/iphonehacks.com.txt b/inc/3rdparty/site_config/standard/iphonehacks.com.txt new file mode 100644 index 00000000..c97ff43c --- /dev/null +++ b/inc/3rdparty/site_config/standard/iphonehacks.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //meta[@name='og:title']/@content | ||
2 | body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')] | ||
3 | |||
4 | strip: //span[@vanilla-identifier] | ||
5 | |||
6 | prune: no | ||
7 | tidy: no | ||
8 | |||
9 | test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/iplaysoft.com.txt b/inc/3rdparty/site_config/standard/iplaysoft.com.txt new file mode 100644 index 00000000..4a944768 --- /dev/null +++ b/inc/3rdparty/site_config/standard/iplaysoft.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@id='content']//div[@class='entry-banner' or @class='entry-content'] | ||
2 | test_url: http://www.iplaysoft.com/webbrowserpassview.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/isource.com.txt b/inc/3rdparty/site_config/standard/isource.com.txt new file mode 100644 index 00000000..a1c16a16 --- /dev/null +++ b/inc/3rdparty/site_config/standard/isource.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | # Remove social buttons | ||
2 | strip: //div[@id='temp_Content_Right'] | ||
3 | |||
4 | # Remove duplicate article title | ||
5 | strip: //*[(@class='storytitle')] | ||
6 | test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/itavisen.no.txt b/inc/3rdparty/site_config/standard/itavisen.no.txt new file mode 100644 index 00000000..8da78cb0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/itavisen.no.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | author: //p[@class = 'writer'] | ||
2 | |||
3 | date: //p[@class = 'published-time'] | ||
4 | |||
5 | body: //div[@class = 'text main'] | ||
6 | test_url: http://www.itavisen.no/899786/old-republic-blir-gratis \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/itstactical.com.txt b/inc/3rdparty/site_config/standard/itstactical.com.txt new file mode 100644 index 00000000..550875ec --- /dev/null +++ b/inc/3rdparty/site_config/standard/itstactical.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | body: //div[@class='format_text entry-content'] | ||
3 | author: //span[@class="author vcard"]/a | ||
4 | date: //abbr[@class="published"] | ||
5 | |||
6 | strip_id_or_class: related-posts | ||
7 | strip_id_or_class: membershipbox | ||
8 | strip_id_or_class: share_this_compact_bt | ||
9 | |||
10 | |||
11 | footnotes: no | ||
12 | test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/itworld.com.txt b/inc/3rdparty/site_config/standard/itworld.com.txt new file mode 100644 index 00000000..d4fa604e --- /dev/null +++ b/inc/3rdparty/site_config/standard/itworld.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //*[@id="article-title"] | ||
2 | author: //*[@id="article-info"]/strong | ||
3 | date: //*[@class="article-dateline"]/strong | ||
4 | body: //*[@id="article-content"] | ||
5 | test_url: http://www.itworld.com/open-source/140916/android-sued-microsoft-not-linux \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/izismile.com.txt b/inc/3rdparty/site_config/standard/izismile.com.txt new file mode 100644 index 00000000..af3f299a --- /dev/null +++ b/inc/3rdparty/site_config/standard/izismile.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[starts-with(@id, 'news-id-')] | ||
2 | prune: no | ||
3 | |||
4 | test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jalopnik.com.txt b/inc/3rdparty/site_config/standard/jalopnik.com.txt new file mode 100644 index 00000000..fc2eef8e --- /dev/null +++ b/inc/3rdparty/site_config/standard/jalopnik.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | author: //span[@class='plus-icon'] | ||
2 | test_url: http://jalopnik.com/5892124/1955-porsche-550-spyder-sells-for-record-3685-million/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jandan.net.txt b/inc/3rdparty/site_config/standard/jandan.net.txt new file mode 100644 index 00000000..f1dd3d17 --- /dev/null +++ b/inc/3rdparty/site_config/standard/jandan.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='content']//div[@class = 'post f'] | ||
2 | strip_id_or_class: comment-big | ||
3 | strip_id_or_class: avatar | ||
4 | strip: //div[@class='time_s'] | ||
5 | |||
6 | test_url: http://jandan.net/2011/04/03/iphone-5-sony.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt new file mode 100644 index 00000000..6e8af934 --- /dev/null +++ b/inc/3rdparty/site_config/standard/jetzt.sueddeutsche.de.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title: //h1 | ||
2 | author: //p[contains(@class, 'author')]/a | ||
3 | date: //p[contains(@class, 'time')] | ||
4 | body: //div[@class='content']/div[contains(@class, 'text')] | ||
5 | |||
6 | # prevent "no text" errors on multi-page articles | ||
7 | tidy: no | ||
8 | |||
9 | # we use a custom next-link detector instead of the print view because | ||
10 | # it's pretty hard to strip out the unwanted parts in the print view | ||
11 | autodetect_next_page: no | ||
12 | next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more '] | ||
13 | |||
14 | strip: //h1 | ||
15 | |||
16 | strip_id_or_class: meta | ||
17 | strip_id_or_class: author | ||
18 | strip_id_or_class: paging | ||
19 | |||
20 | # prevent "Report an Error" from being recognized as footnote | ||
21 | footnotes: no | ||
22 | test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jjahnke.net.txt b/inc/3rdparty/site_config/standard/jjahnke.net.txt new file mode 100644 index 00000000..95c45ee7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/jjahnke.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class='entry'] | ||
2 | prune: no | ||
3 | |||
4 | test_url: http://www.jjahnke.net/rundbr87.html#2514 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt b/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt new file mode 100644 index 00000000..af8d7d17 --- /dev/null +++ b/inc/3rdparty/site_config/standard/jobbank.gc.ca.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id='formatCont_en'] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/joelonsoftware.com.txt b/inc/3rdparty/site_config/standard/joelonsoftware.com.txt new file mode 100644 index 00000000..75fbee5a --- /dev/null +++ b/inc/3rdparty/site_config/standard/joelonsoftware.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | # Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html | ||
2 | |||
3 | author: substring-after(//div[@class="author"], 'by ') | ||
4 | date: //div[@class="date"] | ||
5 | |||
6 | ## Clean stuff at top ## | ||
7 | |||
8 | strip: //h1[1] | ||
9 | strip: //h2[1] | ||
10 | strip: //div[@class="date"] | ||
11 | strip: //div[@class="author"] | ||
12 | |||
13 | ## Clean stuff at bottom ## | ||
14 | |||
15 | strip: //blockquote[@class="textmessage"] | ||
16 | strip: //div[@style="width:500px"]/p[last()] | ||
17 | strip: //div[@style="width:500px"]/p[last()-1] | ||
18 | strip: //div[@style="width:500px"]/h4[last()] | ||
19 | strip: //div[@style="width:500px"]/h4[last()-1] | ||
20 | strip: //div[@style="width:500px"]/div[last()] | ||
21 | test_url: http://www.joelonsoftware.com/items/2011/09/15.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/jouire.com.txt b/inc/3rdparty/site_config/standard/jouire.com.txt new file mode 100644 index 00000000..535a501e --- /dev/null +++ b/inc/3rdparty/site_config/standard/jouire.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: //h1 | ||
2 | date: //p[contains(@class,'date')] | ||
3 | test_url: http://jouire.com/2011/01/exquisite-whispers/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/joystiq.com.txt b/inc/3rdparty/site_config/standard/joystiq.com.txt new file mode 100644 index 00000000..7fbd467d --- /dev/null +++ b/inc/3rdparty/site_config/standard/joystiq.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | author: //a[@class="byline-author"] | ||
2 | title: //h1[@class="headline"] | ||
3 | strip: //div[@id="info-card"] | ||
4 | strip: //div[@id="breaking-news"] | ||
5 | strip: //div[@class="rmod list-post-mod"] | ||
6 | strip: //div[@id="footer"] | ||
7 | strip: //div[@id="GH_strip"] | ||
8 | test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt b/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt new file mode 100644 index 00000000..be844e57 --- /dev/null +++ b/inc/3rdparty/site_config/standard/juedische-allgemeine.de.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | body: //div[@id='article_container'] | ||
2 | author: //h4//a[@class='author'] | ||
3 | title: //h1 | ||
4 | |||
5 | replace_string(lang="en"): lang="de" | ||
6 | replace_string(/>1</a>):/></a> | ||
7 | |||
8 | strip_id_or_class: share_toolbox | ||
9 | strip_id_or_class: article_header | ||
10 | strip_id_or_class: phototext | ||
11 | |||
12 | strip_image_src: icon_author.gif | ||
13 | |||
14 | strip: //img[@src=''] | ||
15 | strip: //h4[@id='author'] | ||
16 | |||
17 | prune: no | ||
18 | |||
19 | test_url: http://www.juedische-allgemeine.de/article/view/id/13366 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/juppy.org.txt b/inc/3rdparty/site_config/standard/juppy.org.txt new file mode 100644 index 00000000..e2d07f24 --- /dev/null +++ b/inc/3rdparty/site_config/standard/juppy.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | convert_double_br_tags: yes | ||
2 | |||
3 | title: //div[@id="storycredits"]/p/span[@class="title"] | ||
4 | author: //div[@id="storycredits"]/p/br[1]/following-sibling::text() | ||
5 | |||
6 | strip: //div[@id="storycredits"] | ||
7 | |||
8 | test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kachestvo.ru.txt b/inc/3rdparty/site_config/standard/kachestvo.ru.txt new file mode 100644 index 00000000..34404e96 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kachestvo.ru.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[contains(@class, 'inner_content')] | ||
2 | |||
3 | test_url: http://kachestvo.ru/promtovar/odezhda/denim.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kenrockwell.com.txt b/inc/3rdparty/site_config/standard/kenrockwell.com.txt new file mode 100644 index 00000000..e6d100ea --- /dev/null +++ b/inc/3rdparty/site_config/standard/kenrockwell.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | # Ads | ||
2 | strip: //table[@align="right"][@width="120"] | ||
3 | |||
4 | # Affiliate link paragraphs | ||
5 | strip: //a[.="Adorama"]/parent::p[contains(., "goodies")] | ||
6 | strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")] | ||
7 | test_url: http://www.kenrockwell.com/tech/composition.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kicker.de.txt b/inc/3rdparty/site_config/standard/kicker.de.txt new file mode 100644 index 00000000..7d5daa4b --- /dev/null +++ b/inc/3rdparty/site_config/standard/kicker.de.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | # set body | ||
2 | body: //div[@id='ovArtikel'] | ||
3 | |||
4 | # set title | ||
5 | title: //div[@id='ovArtikel']/h1 | ||
6 | # strip main title and leave sub title | ||
7 | strip: //div[@id='ovArtikel']/h1 | ||
8 | |||
9 | date: //div[@class='publicdate'] | ||
10 | |||
11 | #remove captions | ||
12 | strip: //*/div[@class='bu'] | ||
13 | strip: //*/div[@class='credit'] | ||
14 | |||
15 | #remove ads | ||
16 | strip: //*/div[@class='ad-head'] | ||
17 | strip: //*/div[@class='linksebay'] | ||
18 | |||
19 | # remove video content | ||
20 | strip: //*/div[@class='ovVideo'] | ||
21 | test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kickstarter.com.txt b/inc/3rdparty/site_config/standard/kickstarter.com.txt new file mode 100644 index 00000000..c055659f --- /dev/null +++ b/inc/3rdparty/site_config/standard/kickstarter.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@id='name'] | ||
2 | body: //*[@id='leftcol'] | ||
3 | |||
4 | strip_id_or_class: 'share-box' | ||
5 | strip_id_or_class: 'project-faqs' | ||
6 | strip_id_or_class: 'report-issue-wrap' | ||
7 | test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kingarthurflour.com.txt b/inc/3rdparty/site_config/standard/kingarthurflour.com.txt new file mode 100644 index 00000000..2f6783a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kingarthurflour.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[@class='post']/h2 | ||
2 | body: //div[@class='entry'] | ||
3 | strip: //p[contains(.,'Tags:')] | ||
4 | test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kotaku.com.txt b/inc/3rdparty/site_config/standard/kotaku.com.txt new file mode 100644 index 00000000..be439d75 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kotaku.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | author: //span[@class="plus-icon"] | ||
2 | test_url: http://kotaku.com/5920211/save-the-furries-on-your-wii-in-this-weeks-nintendo-download \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kottke.org.txt b/inc/3rdparty/site_config/standard/kottke.org.txt new file mode 100644 index 00000000..f93a61e7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kottke.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2 | ||
2 | author: //*[@id='main']/div/a[1] | ||
3 | date: substring-before(substring-after(//div[@class='meta'],'•'),'•') | ||
4 | body: //div[@id='main'] | ||
5 | strip: //div[@class='meta'] | ||
6 | test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kumailplus.com.txt b/inc/3rdparty/site_config/standard/kumailplus.com.txt new file mode 100644 index 00000000..9e15cc34 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kumailplus.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class = "entry-full"] | ||
2 | |||
3 | test_url: http://www.kumailplus.com/2011/12/02/24308 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kumb.com.txt b/inc/3rdparty/site_config/standard/kumb.com.txt new file mode 100644 index 00000000..3f0d2369 --- /dev/null +++ b/inc/3rdparty/site_config/standard/kumb.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@id='centrediv']/h1 | ||
2 | |||
3 | author: substring-after(//div[@id='centrediv']/h3,'By: ') | ||
4 | |||
5 | date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ') | ||
6 | |||
7 | body: //div[@class='KonaBody'] | ||
8 | |||
9 | convert_double_br_tags: yes | ||
10 | test_url: http://www.kumb.com/story.php?id=126084 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/kwerfeldein.de.txt b/inc/3rdparty/site_config/standard/kwerfeldein.de.txt new file mode 100644 index 00000000..879b4d6c --- /dev/null +++ b/inc/3rdparty/site_config/standard/kwerfeldein.de.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | date: //span[@class='datum'] | ||
2 | title: //div[@class='artikel']/h2 | ||
3 | body: //div[@class='entry'] | ||
4 | strip: //p[@class='tags'] | ||
5 | author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ') | ||
6 | strip: //div[@class='authorinfo'] | ||
7 | strip: //div[@class='authorpic'] | ||
8 | |||
9 | test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt b/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt new file mode 100644 index 00000000..a34e39dd --- /dev/null +++ b/inc/3rdparty/site_config/standard/laphamsquarterly.org.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //h1[@class='headline'] | ||
2 | body: //div[@class='article'] | ||
3 | strip: //div[@class='article']//h3[contains(@class, 'section')] | ||
4 | strip: //div[@class='article']//ul[contains(@class, 'article-actions')] | ||
5 | strip: //div[@id='syndication-upper'] | ||
6 | strip: //a[@id='syndication'] | ||
7 | strip: //dl[@id='article-tags'] | ||
8 | strip: //div[@id='article-like'] | ||
9 | prune: no | ||
10 | |||
11 | single_page_link: //li[@class='single-page']/a | ||
12 | |||
13 | test_url: http://www.laphamsquarterly.org/essays/balanced-diets.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/laprensagrafica.com.txt b/inc/3rdparty/site_config/standard/laprensagrafica.com.txt new file mode 100644 index 00000000..e771f81f --- /dev/null +++ b/inc/3rdparty/site_config/standard/laprensagrafica.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | tidy: no | ||
2 | |||
3 | test_url: http://www.laprensagrafica.com/opinion/editorial/229252-reflexiones-sobre-la-educacion-que-necesitamos.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/laquadrature.net.txt b/inc/3rdparty/site_config/standard/laquadrature.net.txt new file mode 100644 index 00000000..5bad8e65 --- /dev/null +++ b/inc/3rdparty/site_config/standard/laquadrature.net.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@id='content-content']//div[@class='content'] | ||
2 | title: //h1[@class='title'] | ||
3 | date: substring-after(//*[@class='submitted'],'Submitted on') | ||
4 | tidy: no | ||
5 | strip: //div[@class='terms terms-inline'] | ||
6 | strip: //div[@class='more'] | ||
7 | strip: //div[@class='share-links'] | ||
8 | strip: //table[@id='attachments'] | ||
9 | |||
10 | test_url: http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt b/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt new file mode 100644 index 00000000..504dbea1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lareviewofbooks.org.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | #meta data | ||
2 | title:substring-after(//title,'|') | ||
3 | |||
4 | author:substring-before( substring-after(//meta[@name = 'description']/@content, normalize-space(substring-after(//title,'|'))),' respond ') | ||
5 | date://h5[@class = 'postDate'] | ||
6 | |||
7 | #text | ||
8 | body://div[@class = 'articleBody'] | ||
9 | |||
10 | #clean up | ||
11 | strip://center | ||
12 | test_url: http://lareviewofbooks.org/post/14066007115/literary-transactions-and-their-vicissitudes \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/latimes.com.txt b/inc/3rdparty/site_config/standard/latimes.com.txt new file mode 100644 index 00000000..0d6ac851 --- /dev/null +++ b/inc/3rdparty/site_config/standard/latimes.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | strip: //div[@id="tugs_story_display"] | ||
2 | strip: //div[@id="search_overlay"] | ||
3 | strip: //div[@id="adv_search"] | ||
4 | body: //div[@class='story'] | ||
5 | tidy: no | ||
6 | convert_double_br_tags: yes | ||
7 | single_page_link: //a[contains(@href, ',print.')] | ||
8 | strip: //p[starts-with(., 'latimes.com')] | ||
9 | strip: //h1[starts-with(., 'latimes.com')] | ||
10 | strip_id_or_class: cubead | ||
11 | test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/laughingsquid.com.txt b/inc/3rdparty/site_config/standard/laughingsquid.com.txt new file mode 100644 index 00000000..1814988a --- /dev/null +++ b/inc/3rdparty/site_config/standard/laughingsquid.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | body: //div[@class='entry-content'] | ||
3 | test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/leancrew.com.txt b/inc/3rdparty/site_config/standard/leancrew.com.txt new file mode 100644 index 00000000..0a4c84ba --- /dev/null +++ b/inc/3rdparty/site_config/standard/leancrew.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id="content"]/h1[1] | ||
2 | date: substring-before(//p[@class="postdate"], ' at ') | ||
3 | author: ("Dr. Drang") | ||
4 | |||
5 | strip: //div[@id="content"]/h1[1] | ||
6 | strip: //p[@class="postdate"] | ||
7 | strip: //h2[@id="respond"] | ||
8 | strip: //blockquote[@class="bbpTweet"]/p/span/a/img | ||
9 | test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lefigaro.fr.txt b/inc/3rdparty/site_config/standard/lefigaro.fr.txt new file mode 100644 index 00000000..f5494b96 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lefigaro.fr.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //meta[@name='title']/@content | ||
2 | author: //span[@class='sign']//a[@class='journaliste'] | ||
3 | author: //meta[@name='author']/@content | ||
4 | body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte'] | ||
5 | date: //time[@pubdate]/@datetime | ||
6 | prune: no | ||
7 | test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php | ||
8 | test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lemonde.fr.txt b/inc/3rdparty/site_config/standard/lemonde.fr.txt new file mode 100644 index 00000000..eb205275 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lemonde.fr.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | # they have a single component containing both author and date | ||
4 | #author: //p[@class='source'] | ||
5 | #date: //p[@class='source'] | ||
6 | |||
7 | body: //div[@class='contenu_article'] | ||
8 | #Shoot the insane "conjugaison.lemonde.fr" links : | ||
9 | strip: //a[contains(@class, 'listLink')] | ||
10 | |||
11 | prune: no | ||
12 | |||
13 | test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lesnumeriques.com.txt b/inc/3rdparty/site_config/standard/lesnumeriques.com.txt new file mode 100644 index 00000000..9b57f726 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lesnumeriques.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h1/following::span[@class='fn'] | ||
2 | # Author: parsing should stop once the <br> is reached, but I don't know how to do this. | ||
3 | author: //following::div[@class='PDate2'] | ||
4 | date: //following::div[@class='PDate2']/strong | ||
5 | |||
6 | body: //div[@class='ArTexte'] | ||
7 | body: //div[@id='prod_txt_b'] | ||
8 | body: //div[@class='ArPhotoP'] | ||
9 | test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/letemps.ch.txt b/inc/3rdparty/site_config/standard/letemps.ch.txt new file mode 100644 index 00000000..c4bee7ec --- /dev/null +++ b/inc/3rdparty/site_config/standard/letemps.ch.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h2 | ||
2 | strip_image_src: logo.gif | ||
3 | test_url: http://www.letemps.ch/Facet/print/Uuid/7c9f912c-07c9-11e0-9b50-4d96c9eca37f \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lifeandculture.fr.txt b/inc/3rdparty/site_config/standard/lifeandculture.fr.txt new file mode 100644 index 00000000..c3888aa8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lifeandculture.fr.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h2[@class="entry-title"] | ||
2 | body: //div[@class="entry-content"] | ||
3 | test_url: http://www.lifeandculture.fr/digital/facebook-and-the-epiphanator-an-end-to-endings/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lifehacker.com.txt b/inc/3rdparty/site_config/standard/lifehacker.com.txt new file mode 100644 index 00000000..32ade14a --- /dev/null +++ b/inc/3rdparty/site_config/standard/lifehacker.com.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | # Adds author text: Gawker sites commonly show as "Author: View Profile" | ||
2 | author://a[@class="plus-icon modfont"] | ||
3 | |||
4 | # Add date and time | ||
5 | date: //span[@class="date"] | ||
6 | |||
7 | # Remove date and time from article text | ||
8 | strip: //span[@class="date"] | ||
9 | |||
10 | # Remove login/comment text | ||
11 | strip: //*[(@class="presence_control_external smalltype")] | ||
12 | |||
13 | strip: //div[@class="nodebyline modfont"] | ||
14 | |||
15 | # Remove right sidebar | ||
16 | strip: //div[@id="rightwrapper"] | ||
17 | |||
18 | # Remove print header | ||
19 | strip: //div[@id='printhead']/h1 | ||
20 | |||
21 | # Remove 'content is restricted' | ||
22 | strip: //div[@id='agegate_IDHERE'] | ||
23 | |||
24 | # Remove follow text | ||
25 | strip: //*[(@class="permalink_ads")] | ||
26 | |||
27 | # Remove view/comment count | ||
28 | strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line'] | ||
29 | |||
30 | # Remove contact text | ||
31 | strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo'] | ||
32 | |||
33 | # Remove medium duplicates of the article image | ||
34 | strip_image_src: medium.jpg | ||
35 | |||
36 | # Remove "arrow" class at bottom of page | ||
37 | strip: //p[@class="arrow"] | ||
38 | |||
39 | # Remove "track" image from article body | ||
40 | strip: //img[@alt="track"] | ||
41 | test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos | ||
42 | test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/linkedin.com.txt b/inc/3rdparty/site_config/standard/linkedin.com.txt new file mode 100644 index 00000000..37e83cf6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/linkedin.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | single_page_link: //ul[@class='util-nav']//a[@class='close'] | ||
2 | test_url: http://www.linkedin.com/news?actionBar=&articleID=894735221&ids=0Rdj4Qe3wQejwIczAOc3sRdzwUb3wScPoPdzkVe2MNcz8RcPsQejwIcPASdjwTcjwU&aag=true&freq=weekly \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/longform.org.txt b/inc/3rdparty/site_config/standard/longform.org.txt new file mode 100644 index 00000000..48d5e1a7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/longform.org.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //div[@class="post"]/div[@class="title"]/a | ||
2 | |||
3 | test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/loopinsight.com.txt b/inc/3rdparty/site_config/standard/loopinsight.com.txt new file mode 100644 index 00000000..08ad90c3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/loopinsight.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@class='container_16']//div[@class='grid_11'] | ||
2 | strip: //h2[@class='mast'] | ||
3 | strip: //div[@class='container_16']//div[@class='grid_11']/h1 | ||
4 | strip: //div[@class='container_16']//div[@class='grid_11']/p[1] | ||
5 | strip: //div[@class='container_16']//div[@class='grid_11']/div | ||
6 | author: //a[starts-with(@title, 'Posts by')] | ||
7 | date: substring-before(substring-after(//time, 'Posted on '), ' at') | ||
8 | test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/ | ||
9 | test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lostgarden.com.txt b/inc/3rdparty/site_config/standard/lostgarden.com.txt new file mode 100644 index 00000000..a823e649 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lostgarden.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | prune: no | ||
2 | convert_double_br_tags: yes | ||
3 | test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/lrb.co.uk.txt b/inc/3rdparty/site_config/standard/lrb.co.uk.txt new file mode 100644 index 00000000..ce5053d4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/lrb.co.uk.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: substring-before(//title, ' · LRB') | ||
2 | |||
3 | body: //div[@class="article-body indent"] | ||
4 | |||
5 | date: substring-after(//p[@class="meta-info"]/a, '· ') | ||
6 | |||
7 | prune: no | ||
8 | test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/luminous-landscape.com.txt b/inc/3rdparty/site_config/standard/luminous-landscape.com.txt new file mode 100644 index 00000000..92ccf3ba --- /dev/null +++ b/inc/3rdparty/site_config/standard/luminous-landscape.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2 | ||
2 | |||
3 | body: // div[@id='content'] | ||
4 | |||
5 | strip: //div[@class='sidebar_wrapper'] | ||
6 | test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt b/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt new file mode 100644 index 00000000..a8af5438 --- /dev/null +++ b/inc/3rdparty/site_config/standard/m.bbc.co.uk.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@class="story-body"]/div[@class="story-inner"]/h1 | ||
2 | body: //div[@class="story-body"] | ||
3 | date: //p[@class='date']/strong | ||
4 | author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By') | ||
5 | |||
6 | strip: //div[@class="story-inner"]/div[@class="byline"] | ||
7 | |||
8 | test_url: http://m.bbc.co.uk/news/science-environment-19144464 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/m.guardian.co.uk.txt b/inc/3rdparty/site_config/standard/m.guardian.co.uk.txt new file mode 100644 index 00000000..f5f0dfca --- /dev/null +++ b/inc/3rdparty/site_config/standard/m.guardian.co.uk.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //p[@class="txhead"] | ||
2 | author: //div[@class='txb'] | ||
3 | wrap_in(p): //div[@class='para'] | ||
4 | date: //div[@class='txb']/following-sibling::p/text()[substring(., 14)] | ||
5 | strip: //table[@class="tlogo"] | ||
6 | strip: //div[@class="cookieText"] | ||
7 | strip: //*[@class="sltb"] | ||
8 | strip: //*[@class="ijobs-x-link"] | ||
9 | strip: //*[@class="sponscolour"] | ||
10 | strip: //*[@class="sponsouter"] | ||
11 | strip: //div[@id="bottom-nav-block"]/following::* | ||
12 | test_url: http://m.guardian.co.uk/ms/p/gnm/op/s3OOwgO3yIhGuj41C1_S3Xg/view.m?id=15&gid=world/2012/jul/26/arctic-climate-change&cat=top-stories \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mac4ever.com.txt b/inc/3rdparty/site_config/standard/mac4ever.com.txt new file mode 100644 index 00000000..892b47f5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mac4ever.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: substring-after(//div[@class='author'],'Par ') | ||
2 | date: //div[@class='date'] | ||
3 | body: //div[@class='content'] | ||
4 | |||
5 | test_url: http://www.mac4ever.com/news/64182/icloud_les_prix_en_euros_et_en_chf/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macdrifter.com.txt b/inc/3rdparty/site_config/standard/macdrifter.com.txt new file mode 100644 index 00000000..fd1ede7d --- /dev/null +++ b/inc/3rdparty/site_config/standard/macdrifter.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: substring-before(//title,' « Macdrifter') | ||
2 | test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macformat.techradar.com.txt b/inc/3rdparty/site_config/standard/macformat.techradar.com.txt new file mode 100644 index 00000000..109eae45 --- /dev/null +++ b/inc/3rdparty/site_config/standard/macformat.techradar.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | # Remove news feed | ||
2 | strip: //div[@id='news_feed_front'] | ||
3 | |||
4 | # Remove pull quote | ||
5 | strip: //div[@class='field field-type-text field-field-pull-quote'] | ||
6 | |||
7 | # Remove login | ||
8 | strip: //div[@class='right_bar_login'] | ||
9 | test_url: http://macformat.techradar.com/blog/solid-state-storage-bringing-parity-back-mac-29-10-10&article=89189666 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macgeneration.com.txt b/inc/3rdparty/site_config/standard/macgeneration.com.txt new file mode 100644 index 00000000..e6bbe28e --- /dev/null +++ b/inc/3rdparty/site_config/standard/macgeneration.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le') | ||
2 | date: substring-after(//div[@class='dateNews'],' le ') | ||
3 | body: //div[@class='singleNews zoneApple'] | ||
4 | |||
5 | test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macmagazine.com.br.txt b/inc/3rdparty/site_config/standard/macmagazine.com.br.txt new file mode 100644 index 00000000..47ebfd79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/macmagazine.com.br.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | # Remove sliders | ||
2 | strip: //*[(@class="slides_container")] | ||
3 | strip: //div[(@id="slides_two")] | ||
4 | |||
5 | # Remove tag cloud | ||
6 | strip: //span[(@class="secao")] | ||
7 | |||
8 | # Fix article date | ||
9 | # TODO | ||
10 | |||
11 | # Remove other stuff | ||
12 | strip: //div[(@id="idc-container")] | ||
13 | strip: //div[(@id="idc-noscript")] | ||
14 | strip: //div[(@class="linkwithin_div")] | ||
15 | strip: //div[(@class="navPosts")] | ||
16 | strip: //div[(@id="lateral")] | ||
17 | strip: //div[(@id="autor")] | ||
18 | strip: //div[(@id="rodape")] | ||
19 | strip: //div[(@id="post")]/h1 | ||
20 | strip: //div[(@id="post")]/div[(@id="boxInformacoes")] | ||
21 | test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macrumors.com.txt b/inc/3rdparty/site_config/standard/macrumors.com.txt new file mode 100644 index 00000000..76f999d3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/macrumors.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | author: substring-after(//div[@class='byline'], " by ") | ||
2 | date: substring-before(//div[@class='byline'], " by ") | ||
3 | |||
4 | # set body | ||
5 | body: //div[@class='content'] | ||
6 | |||
7 | # set title | ||
8 | title: //h3 | ||
9 | #strip: //div[@class='content']/h3 | ||
10 | test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macstories.net.txt b/inc/3rdparty/site_config/standard/macstories.net.txt new file mode 100644 index 00000000..6e651ca0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/macstories.net.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | strip: //*[(@id = "featured")] | ||
2 | |||
3 | author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ') | ||
4 | |||
5 | date: concat(//div[@class='month'],' ',//div[@class='day']) | ||
6 | |||
7 | #macstories doesn't provide a year, but month/day is better than nothing | ||
8 | test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mactalk.com.au.txt b/inc/3rdparty/site_config/standard/mactalk.com.au.txt new file mode 100644 index 00000000..e8d60522 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mactalk.com.au.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author://div[@class="article_username_container_full"] | ||
2 | date://div[@class="article_username_container"] | ||
3 | body://div[@class="article cms_clear restore postcontainer"] | ||
4 | test_url: http://www.mactalk.com.au/content/chat-basil-shkara-developer-taptax-2452/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mactechnews.de.txt b/inc/3rdparty/site_config/standard/mactechnews.de.txt new file mode 100644 index 00000000..c3fc0e44 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mactechnews.de.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: substring-after(substring-after(//title, '>'), '>') | ||
2 | body: //div[@class='NewsArticleContent'] | ||
3 | test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/macworld.com.txt b/inc/3rdparty/site_config/standard/macworld.com.txt new file mode 100644 index 00000000..96175872 --- /dev/null +++ b/inc/3rdparty/site_config/standard/macworld.com.txt | |||
@@ -0,0 +1,24 @@ | |||
1 | title: //article//h1 | ||
2 | date: //meta[@name="date"]/@content | ||
3 | author: //div[@class="author-name" or @class="article-byline"]/a[1] | ||
4 | |||
5 | body: //section[@class="page"] | ||
6 | |||
7 | # remove 'From the Lab' and 'Recent posts' text | ||
8 | strip: //div[@class='blogLabel'] | ||
9 | |||
10 | # remove byline and meta info | ||
11 | strip: //div[@class="article-meta"] | ||
12 | strip: //div[@class="author-info"] | ||
13 | |||
14 | #strip tags and categories | ||
15 | strip: //div[@class="department"] | ||
16 | |||
17 | #strip product cap links | ||
18 | strip: //div[@class="cap-main"] | ||
19 | strip: //div[@id="compare-lede"] | ||
20 | |||
21 | prune: no | ||
22 | |||
23 | # copes less well with Review pages, seems fine for News | ||
24 | test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mainichi.jp.txt b/inc/3rdparty/site_config/standard/mainichi.jp.txt new file mode 100644 index 00000000..e701207f --- /dev/null +++ b/inc/3rdparty/site_config/standard/mainichi.jp.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='NewsArticle'] | ||
2 | |||
3 | test_url: http://mainichi.jp/select/weathernews/20110311/news/20110520k0000e040062000c.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mainpost.de.txt b/inc/3rdparty/site_config/standard/mainpost.de.txt new file mode 100644 index 00000000..a2d25d56 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mainpost.de.txt | |||
@@ -0,0 +1,28 @@ | |||
1 | title: substring-before(//title, '|') | ||
2 | body: //*[@id='content-left'] | ||
3 | |||
4 | # Why is this not working here? | ||
5 | # body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail'] | ||
6 | |||
7 | |||
8 | #Header | ||
9 | strip_id_or_class: 'subHead' | ||
10 | strip_id_or_class: 'fl_right' | ||
11 | strip_id_or_class: 'infolink' | ||
12 | strip_id_or_class: 'content-head' | ||
13 | strip_id_or_class: 'tab' | ||
14 | strip_id_or_class: 'tab-active' | ||
15 | strip: //*[contains(@class,'trenner')] | ||
16 | |||
17 | # Headline | ||
18 | strip: //h1/* | ||
19 | strip_id_or_class: 'font16' | ||
20 | |||
21 | #Images | ||
22 | strip_id_or_class: 'leftimage' | ||
23 | strip_id_or_class: 'rightimage' | ||
24 | |||
25 | #Comments | ||
26 | strip: //table | ||
27 | strip: //p/following-sibling::*[0] | ||
28 | test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/makeuseof.com.txt b/inc/3rdparty/site_config/standard/makeuseof.com.txt new file mode 100644 index 00000000..6809afed --- /dev/null +++ b/inc/3rdparty/site_config/standard/makeuseof.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | tidy: no | ||
2 | |||
3 | test_url: http://www.makeuseof.com/dir/kindle-it-web-pages-kindle-friendly/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/marco.org.txt b/inc/3rdparty/site_config/standard/marco.org.txt new file mode 100644 index 00000000..ef2e03d3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/marco.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | tidy: no | ||
2 | prune: no | ||
3 | date: //article//time[@pubdate] | ||
4 | title: //article/header/h2 | ||
5 | body: //article | ||
6 | strip: //header | ||
7 | test_url: http://www.marco.org/2012/09/08/businessweek-gruber | ||
8 | test_url: http://www.marco.org/2012/04/24/might-upgrade-someday \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/marksdailyapple.com.txt b/inc/3rdparty/site_config/standard/marksdailyapple.com.txt new file mode 100644 index 00000000..0077f560 --- /dev/null +++ b/inc/3rdparty/site_config/standard/marksdailyapple.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_id_or_class: wwsgd | ||
2 | test_url: http://www.marksdailyapple.com/are-detoxes-and-cleanses-safe-and-effective/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/martinfowler.com.txt b/inc/3rdparty/site_config/standard/martinfowler.com.txt new file mode 100644 index 00000000..8e0e349f --- /dev/null +++ b/inc/3rdparty/site_config/standard/martinfowler.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | date: //div[@id="main"]/p[@class="date"] | ||
2 | author: string("Martin Fowler") | ||
3 | body: //div[@id="main"] | ||
4 | strip_id_or_class: date | ||
5 | strip_id_or_class: tags | ||
6 | strip_id_or_class: tagLabel | ||
7 | strip: //div[@id="main"]/h1[1] | ||
8 | test_url: http://martinfowler.com/bliki/DatabaseThaw.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mashable.com.txt b/inc/3rdparty/site_config/standard/mashable.com.txt new file mode 100644 index 00000000..2c5a14a6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mashable.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //header[@class='entry-title']/h1 | ||
2 | body: //div[@class='description'] | ||
3 | strip: //div[@class='ytm-gallery-box'] | ||
4 | test_url: http://mashable.com/2011/12/05/india-wants-google-and-facebook-to-censor-user-content/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mattcutts.com.txt b/inc/3rdparty/site_config/standard/mattcutts.com.txt new file mode 100644 index 00000000..76b1eac6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mattcutts.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | date: //*[@class = 'published'] | ||
2 | test_url: http://www.mattcutts.com/blog/internet-censorship-sopa/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mbl.is.txt b/inc/3rdparty/site_config/standard/mbl.is.txt new file mode 100644 index 00000000..fd26f091 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mbl.is.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class="frett-main"] | ||
2 | test_url: http://mbl.is/frettir/innlent/2012/02/21/litill_munur_a_fargjaldaverdi/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/medialens.org.txt b/inc/3rdparty/site_config/standard/medialens.org.txt new file mode 100644 index 00000000..94f27b71 --- /dev/null +++ b/inc/3rdparty/site_config/standard/medialens.org.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip: //div[contains(@class, 'article-tools')] | ||
2 | test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/menshealth.com.txt b/inc/3rdparty/site_config/standard/menshealth.com.txt new file mode 100644 index 00000000..e7e1e269 --- /dev/null +++ b/inc/3rdparty/site_config/standard/menshealth.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | # need to find a way to eliminate <span> content for "related content" without eliminating important content | ||
2 | |||
3 | convert_double_br_tags: yes | ||
4 | #body: //div[@id='leftside'] | ||
5 | title: //h1 | ||
6 | title: //h2 | ||
7 | author: substring-after(//h4, 'By ') | ||
8 | author: substring-after(//h4, 'By: ') | ||
9 | #Strip: //span | ||
10 | strip_id_or_class: morefromcat | ||
11 | strip_id_or_class: mostpopular | ||
12 | strip_id_or_class: articlepagination | ||
13 | strip_id_or_class: toolbar | ||
14 | body: //div[@id='zmodcontent'] | ||
15 | single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')] | ||
16 | test_url: http://www.menshealth.com/mhlists/pursuit_of_happiness/index.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mikeash.com.txt b/inc/3rdparty/site_config/standard/mikeash.com.txt new file mode 100644 index 00000000..af8a7d30 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mikeash.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class="blogtitle"] | ||
2 | strip: //div[@class="blogtitle"] | ||
3 | |||
4 | author: substring-after(//span[@class="blogheader"], 'Author: ') | ||
5 | test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mikeindustries.com.txt b/inc/3rdparty/site_config/standard/mikeindustries.com.txt new file mode 100644 index 00000000..3d488e13 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mikeindustries.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@class='post_content']/h2 | ||
2 | date: //div[@class='dateline'] | ||
3 | body: //div[@class='entry'] | ||
4 | |||
5 | strip: //div[@class='closer'] | ||
6 | strip: //div[@class='navigation'] | ||
7 | strip: //div[@class='aux_pane'] | ||
8 | strip: //div[@class='aux_aux_pane'] | ||
9 | test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt b/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt new file mode 100644 index 00000000..7e43d63c --- /dev/null +++ b/inc/3rdparty/site_config/standard/minnesota.publicradio.org.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //*[@class="article"]/h1 | ||
2 | date: //*[@class="article"]/div[@class="date"] | ||
3 | |||
4 | # strip the title and date from the article text | ||
5 | strip: //*[@class="article"]/h1 | ||
6 | strip: //*[@class="article"]/div[@class="date"] | ||
7 | |||
8 | # strip annoying <br> between metadata and article | ||
9 | strip: //*[@class="article"]/div[@class="date"]/following-sibling::br | ||
10 | test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/minnpost.com.txt b/inc/3rdparty/site_config/standard/minnpost.com.txt new file mode 100644 index 00000000..51a0630b --- /dev/null +++ b/inc/3rdparty/site_config/standard/minnpost.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //*[@id="content-header"]/h1 | ||
2 | author: //*[contains(@class, 'byline')]/a/text() | ||
3 | date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|') | ||
4 | body: //*[contains(@class, 'node-body')] | ||
5 | test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt b/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt new file mode 100644 index 00000000..4215a051 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mirrorfootball.co.uk.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | # Remove extra links | ||
2 | strip: //*[@class='appended_html'] | ||
3 | test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mises.org.txt b/inc/3rdparty/site_config/standard/mises.org.txt new file mode 100644 index 00000000..ae542aa6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mises.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | strip_id_or_class: 'book-ad' | ||
2 | strip_id_or_class: 'bigger pullquote' | ||
3 | strip_id_or_class: 'subscribe' | ||
4 | strip_id_or_class: 'blog-link' | ||
5 | test_url: http://mises.org/daily/4804 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mlb.mlb.com.txt b/inc/3rdparty/site_config/standard/mlb.mlb.com.txt new file mode 100644 index 00000000..30e8aff2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mlb.mlb.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //h1[@class='article-headline'] | ||
2 | date: //span[@class='timeStamp'] | ||
3 | author: substring-before(//p[@class='article-byline'], '/') | ||
4 | body: //div[@id='article'] | ||
5 | #strip: //div[@class='inner'] | ||
6 | strip: //div[@id='article_head'] | ||
7 | strip: //p[@class='tagLine'] | ||
8 | strip: //div[@id='article_related_links'] | ||
9 | strip: //div[@id='article_related_mlb'] | ||
10 | strip: //span[@class='more'] | ||
11 | strip: //div[@class='article_component'] | ||
12 | strip: //span[@class='screen_reader'] | ||
13 | strip: //ul[@class='columnists_blurb'] | ||
14 | test_url: http://mlb.mlb.com/news/article.jsp?ymd=20120403&content_id=27880830 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt b/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt new file mode 100644 index 00000000..c4e3389e --- /dev/null +++ b/inc/3rdparty/site_config/standard/mlb.sbnation.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //h1[@id = 'stream_title'] | ||
2 | author: //p[@class = 'byline']/a | ||
3 | date: //span[@class = 'datetime'] | ||
4 | |||
5 | body: //div[@id = 'stream_container'] | ||
6 | strip: //p[@class = 'byline'] | ||
7 | strip_id_or_class: stream_summary | ||
8 | strip_id_or_class: social-spoken | ||
9 | strip_id_or_class: datetime | ||
10 | strip_id_or_class: author-mini-profile | ||
11 | strip_id_or_class: social-tools | ||
12 | strip_id_or_class: entry-tags | ||
13 | strip_id_or_class: fb-like-box | ||
14 | test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mlssoccer.com.txt b/inc/3rdparty/site_config/standard/mlssoccer.com.txt new file mode 100644 index 00000000..41e15136 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mlssoccer.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //*[@class="header_title"]/h1 | ||
2 | date: //*[@class="field-date"] | ||
3 | author: //*[@class="field-author"] | ||
4 | body: //div[contains(@class, 'content')] | ||
5 | |||
6 | test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mmo-champion.com.txt b/inc/3rdparty/site_config/standard/mmo-champion.com.txt new file mode 100644 index 00000000..918fae36 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mmo-champion.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id = 'article_content']/div[contains(@class,'article')] | ||
3 | author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')] | ||
4 | date: //div[@class = 'article_username_container'] | ||
5 | test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mnn.com.txt b/inc/3rdparty/site_config/standard/mnn.com.txt new file mode 100644 index 00000000..ddfe6fa2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mnn.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | tidy: no | ||
2 | author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text() | ||
3 | date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2] | ||
4 | body: //div[@class="node"] | ||
5 | |||
6 | strip_id_or_class: vertical-social-bar | ||
7 | strip_id_or_class: blogs_paginator | ||
8 | strip_id_or_class: horizontal-social-links | ||
9 | strip_id_or_class: servicelinksdiv | ||
10 | |||
11 | test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mno.hu.txt b/inc/3rdparty/site_config/standard/mno.hu.txt new file mode 100644 index 00000000..ba158953 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mno.hu.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //title | ||
2 | |||
3 | author: //div[@class="author"] | ||
4 | |||
5 | strip_id_or_class: 'header' | ||
6 | strip_id_or_class: 'cikk_ajanlo' | ||
7 | strip_id_or_class: 'buttons' | ||
8 | strip_id_or_class: 'related' | ||
9 | strip_id_or_class: 'adbox ad_cikk_kozepre' | ||
10 | strip_id_or_class: 'cikk-cimkek' | ||
11 | strip_id_or_class: 'cikk_ertekeles' | ||
12 | |||
13 | strip_comments: yes | ||
14 | test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mobile.slate.com.txt b/inc/3rdparty/site_config/standard/mobile.slate.com.txt new file mode 100644 index 00000000..d5d81034 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mobile.slate.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h2[@class="article_title"] | ||
2 | strip: //a[@class="houseAdLink"] | ||
3 | strip: //h1 | ||
4 | strip: //div[@class="more_articles"] | ||
5 | test_url: http://mobile.slate.com/rss.jsp?rssid=411&item=http%3a%2f%2fwww.slate.com%2fdefault.aspx%3fdisplaymode%3d201%26id%3d2293749%26device%3drss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt b/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt new file mode 100644 index 00000000..a1cc5317 --- /dev/null +++ b/inc/3rdparty/site_config/standard/mobileopportunity.blogspot.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class='post uncustomized-post-template'] | ||
2 | |||
3 | # remove duplicate of post title, which is a link | ||
4 | strip: //h3[@class='post-title'] | ||
5 | |||
6 | # remove permalink and timestamp, which isn't useful as it's a time with no date | ||
7 | strip: //span[@class='post-timestamp'] | ||
8 | |||
9 | # remove labels (tags) | ||
10 | strip: //span[@class='post-labels'] | ||
11 | test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/modernghana.com.txt b/inc/3rdparty/site_config/standard/modernghana.com.txt new file mode 100644 index 00000000..4c93d0cf --- /dev/null +++ b/inc/3rdparty/site_config/standard/modernghana.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | author: //meta[@name="author"]/@content | ||
3 | date: //span[@class='date1'] | ||
4 | body: //div[@id='newsimage'] | //div[@id='bodytext'] | ||
5 | tidy: no | ||
6 | prune: no | ||
7 | |||
8 | test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/money.cnn.com.txt b/inc/3rdparty/site_config/standard/money.cnn.com.txt new file mode 100644 index 00000000..a0d1628a --- /dev/null +++ b/inc/3rdparty/site_config/standard/money.cnn.com.txt | |||
@@ -0,0 +1,24 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | title: //h1[@class='storyheadline'] | ||
3 | author: //meta[@name="AUTHOR"]/@content | ||
4 | date: //span[@class='cnnDateStamp'] | ||
5 | date: //meta[@name="DATE"]/@content | ||
6 | body: //div[@id='storytext' or @class='storytext'] | ||
7 | |||
8 | strip_id_or_class: ie_column | ||
9 | strip_id_or_class: sharewidgets | ||
10 | strip_image_src: bug.gif | ||
11 | |||
12 | strip: //div[@class="hed_side"] | ||
13 | strip: //span[@class="byline"] | ||
14 | strip: //a[@class="soc-twtname"] | ||
15 | strip: //span[@class="cnnDateStamp"] | ||
16 | strip: //div[@class="storytimestamp"] | ||
17 | strip: //div[@class="cnnCol_side"] | ||
18 | |||
19 | prune: no | ||
20 | tidy: no | ||
21 | |||
22 | test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 | ||
23 | test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm | ||
24 | test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/monkeyzen.com.txt b/inc/3rdparty/site_config/standard/monkeyzen.com.txt new file mode 100644 index 00000000..f779c38e --- /dev/null +++ b/inc/3rdparty/site_config/standard/monkeyzen.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://monkeyzen.com/2011/09/siluetas-de-clasicos-a-modo-de-vinilos \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/moonsault.de.txt b/inc/3rdparty/site_config/standard/moonsault.de.txt new file mode 100644 index 00000000..061a8d5c --- /dev/null +++ b/inc/3rdparty/site_config/standard/moonsault.de.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | strip_image_src: menu | ||
2 | strip_image_src: templates | ||
3 | strip: //div/a | ||
4 | strip: //div/b | ||
5 | strip: //div/strong | ||
6 | strip: //td[@width='30%'] | ||
7 | strip: //br[1] | ||
8 | strip: //br[2] | ||
9 | strip: //br[3] | ||
10 | strip: //br[4] | ||
11 | strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home'] | ||
12 | strip_id_or_class: cse-branding-right | ||
13 | test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt b/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt new file mode 100644 index 00000000..a7e59c30 --- /dev/null +++ b/inc/3rdparty/site_config/standard/moreintelligentlife.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@class='print-title'] | ||
2 | body: //div[@class='print-submitted' or @class='print-created' or @class='print-content'] | ||
3 | prune: no | ||
4 | |||
5 | single_page_link: //li[@class='print']/a | ||
6 | |||
7 | test_url: http://moreintelligentlife.com/content/places/paul-markillie/they-trash-cars-dont-they \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/motherboard.vice.com.txt b/inc/3rdparty/site_config/standard/motherboard.vice.com.txt new file mode 100644 index 00000000..6faf1c9a --- /dev/null +++ b/inc/3rdparty/site_config/standard/motherboard.vice.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: //span[@class="author"]/a | ||
2 | date: //span[@class="date"] | ||
3 | body: //div[@class="story-content"] | ||
4 | strip: //aside | ||
5 | test_url: http://motherboard.vice.com/blog/you-can-carry-a-copy-of-the-pirate-bay-in-your-pocket \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/mothering.com.txt b/inc/3rdparty/site_config/standard/mothering.com.txt new file mode 100644 index 00000000..a9d9195f --- /dev/null +++ b/inc/3rdparty/site_config/standard/mothering.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h2[contains(@class,'post_headline')] | ||
2 | body: //div[@class='entry'] | ||
3 | convert_double_br_tags: yes | ||
4 | strip_image_src: _selected.gif | ||
5 | strip_id_or_class: addthis_ | ||
6 | strip: //a[contains(@href,'feedburner.com')] | ||
7 | test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/motherjones.com.txt b/inc/3rdparty/site_config/standard/motherjones.com.txt new file mode 100644 index 00000000..d58c7d2c --- /dev/null +++ b/inc/3rdparty/site_config/standard/motherjones.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id = 'content-area'] | ||
3 | next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')] | ||
4 | tidy: no | ||
5 | author: //p[contains(@class, 'byline')]/a | ||
6 | |||
7 | strip_id_or_class: node-header | ||
8 | strip_id_or_class: hdr-tools | ||
9 | strip_id_or_class: node-body-break | ||
10 | strip_id_or_class: pullquote | ||
11 | strip_id_or_class: node-pager | ||
12 | strip_id_or_class: author-bio | ||
13 | strip_id_or_class: node-footer | ||
14 | |||
15 | test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/motorfull.com.txt b/inc/3rdparty/site_config/standard/motorfull.com.txt new file mode 100644 index 00000000..c6bec7e9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/motorfull.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://motorfull.com/2011/09/aparca-valeo-park4u-remote \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt b/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt new file mode 100644 index 00000000..f4f20450 --- /dev/null +++ b/inc/3rdparty/site_config/standard/msdn.microsoft.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class="mainBody"] | ||
2 | footnotes: no | ||
3 | test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/msnbc.msn.com.txt b/inc/3rdparty/site_config/standard/msnbc.msn.com.txt new file mode 100644 index 00000000..ad89cda8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/msnbc.msn.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | title: //title | ||
2 | author: //div[@id='byline'] | ||
3 | |||
4 | date: //div[contains(@class,'timestamp')]/abbr/text() | ||
5 | |||
6 | body: //div[@id='intellitTXT'] | ||
7 | |||
8 | strip: //div[@id='byline'] | ||
9 | strip: //div[contains(@class,'timestamp')] | ||
10 | strip: //div[contains(@class, 'ad-label')] | ||
11 | strip: //div[contains(@class, 'ad-break')] | ||
12 | strip: //span[contains(@class, 'x-video')] | ||
13 | strip: //span[contains(@class, 'inline')] | ||
14 | strip: //div[contains(@class, 'video')] | ||
15 | strip: //div[contains(@class, 'discuss')] | ||
16 | strip: //div[@id='most-popular'] | ||
17 | strip: //div[contains(@class,'drawer')] | ||
18 | strip: //*[contains(@class, 'hide')] | ||
19 | |||
20 | footnotes: no | ||
21 | test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/myfoxboston.com.txt b/inc/3rdparty/site_config/standard/myfoxboston.com.txt new file mode 100644 index 00000000..1a35b4fc --- /dev/null +++ b/inc/3rdparty/site_config/standard/myfoxboston.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"] | ||
2 | tidy: no | ||
3 | |||
4 | test_url: http://www.myfoxboston.com/dpp/news/local/transit-police-say-woman-spat-on-mbta-bus-driver-2010611 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/myrecipes.com.txt b/inc/3rdparty/site_config/standard/myrecipes.com.txt new file mode 100644 index 00000000..8b99d22d --- /dev/null +++ b/inc/3rdparty/site_config/standard/myrecipes.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h2[contains(@class, 'name')] | ||
2 | body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')] | ||
3 | |||
4 | strip_id_or_class: photoBy | ||
5 | strip_id_or_class: link | ||
6 | |||
7 | single_page_link: //li[@class='print']/a[contains(@href, '/print/')] | ||
8 | |||
9 | prune: no | ||
10 | tidy: no | ||
11 | |||
12 | test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/narenji.ir.txt b/inc/3rdparty/site_config/standard/narenji.ir.txt new file mode 100644 index 00000000..6c3d0c24 --- /dev/null +++ b/inc/3rdparty/site_config/standard/narenji.ir.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='node'] | ||
2 | test_url: http://www.narenji.ir/2806 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nasa.gov.txt b/inc/3rdparty/site_config/standard/nasa.gov.txt new file mode 100644 index 00000000..d95530f3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nasa.gov.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@class='address']/span | ||
2 | author: substring-before(//span[@class='credits'],',') | ||
3 | date: //div[@class='promodatepress']/span | ||
4 | body: //div[@class='default_style_wrap'] | ||
5 | strip: //div[@class='text_adjust'] | ||
6 | strip: //div[@class='skiplink'] | ||
7 | strip: //h2 | ||
8 | test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nbweekly.com.txt b/inc/3rdparty/site_config/standard/nbweekly.com.txt new file mode 100644 index 00000000..0b722d33 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nbweekly.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | date://span[contains(@class,'date')] | ||
2 | |||
3 | body://div[contains(@class,'contWarp')] | ||
4 | |||
5 | strip://div[contains(@class,'keyWord')] | ||
6 | strip://div[contains(@class,'submitComt')] | ||
7 | strip://div[contains(@class,'cmts')] | ||
8 | strip://div[contains(@class,'notice')] | ||
9 | strip://div[contains(@class,'part pt-second')] | ||
10 | test_url: http://www.nbweekly.com/news/china/201203/29316.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/neh.gov.txt b/inc/3rdparty/site_config/standard/neh.gov.txt new file mode 100644 index 00000000..45136a2b --- /dev/null +++ b/inc/3rdparty/site_config/standard/neh.gov.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | #host configuration should be http://www.neh.gov/news/humanities/ | ||
2 | |||
3 | |||
4 | #meta data | ||
5 | title:substring-after(substring-after(//title,':'),':') | ||
6 | author:substring-after(//h2[@class = 'subHead'],'By') | ||
7 | date:substring-before(substring-after(//title,':'),':') | ||
8 | |||
9 | #img and caption handling | ||
10 | wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text() | ||
11 | wrap_in(fieldset)://div[@id = 'mainContent']/table | ||
12 | |||
13 | # clean up | ||
14 | strip: //table[@class = 'marginpaddingTop'] | ||
15 | strip: //h2[@class = 'subHead'] | ||
16 | |||
17 | test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/neomoney.co.txt b/inc/3rdparty/site_config/standard/neomoney.co.txt new file mode 100644 index 00000000..564d5492 --- /dev/null +++ b/inc/3rdparty/site_config/standard/neomoney.co.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //*[@class="header_title"]/h1 | ||
2 | body: //div[contains(@class, 'content')] | ||
3 | test_url: http://neomoney.co/personal/expatriate-and-migrant-loans/expatriate-loans/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/net-security.org.txt b/inc/3rdparty/site_config/standard/net-security.org.txt new file mode 100644 index 00000000..4e6d66d4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/net-security.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='content-title'] | ||
2 | #date: substring-after(//div[@class='dernek-text-under'],'Posted on') | ||
3 | body: //div[@class='content-item'] | ||
4 | next_page_link: //li[@class='next']/a | ||
5 | convert_double_br_tags: yes | ||
6 | |||
7 | test_url: http://www.net-security.org/article.php?id=1732 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/netmagazine.com.txt b/inc/3rdparty/site_config/standard/netmagazine.com.txt new file mode 100644 index 00000000..86885445 --- /dev/null +++ b/inc/3rdparty/site_config/standard/netmagazine.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //h1 | ||
2 | author: //div[@class="submitted"]/span | ||
3 | |||
4 | # seems like this should work, but nothing is returned. Issue with xpath parser? | ||
5 | date: //div[@class="submitted"]/time | ||
6 | |||
7 | body: //div[@id="main-content"] | ||
8 | |||
9 | strip_comments: no | ||
10 | |||
11 | strip: //h1 | ||
12 | strip: //div[@class="submitted"] | ||
13 | strip: //dd[@class="profile-avatar"] | ||
14 | strip: //div[@class="author-profile"]/dl/dt[1] | ||
15 | strip: //div[@id="right-col"] | ||
16 | test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/netzpolitik.org.txt b/inc/3rdparty/site_config/standard/netzpolitik.org.txt new file mode 100644 index 00000000..87dc3cdf --- /dev/null +++ b/inc/3rdparty/site_config/standard/netzpolitik.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | author: //a[@rel='author'] | ||
3 | date: //span[@class='entry-date'] | ||
4 | body: //div[@class='entry-content'] | ||
5 | |||
6 | test_url: http://netzpolitik.org/2011/buch-generation-facebook/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newmatilda.com.txt b/inc/3rdparty/site_config/standard/newmatilda.com.txt new file mode 100644 index 00000000..ab766847 --- /dev/null +++ b/inc/3rdparty/site_config/standard/newmatilda.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id="maincontent"]/h1 | ||
2 | body: //div[@id="maincontent"] | ||
3 | date: //div[@id="maincontent"]/p[2] | ||
4 | author: //ul[@id="contributors"]/li/p/b | ||
5 | |||
6 | strip: //p[@*] | ||
7 | strip: //h1 | ||
8 | strip: //div[@id="maincontent"]/div | ||
9 | test_url: http://newmatilda.com/2011/07/22/turnbull-makes-sense-climate \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news-gazette.com.txt b/inc/3rdparty/site_config/standard/news-gazette.com.txt new file mode 100644 index 00000000..1f1e5d3a --- /dev/null +++ b/inc/3rdparty/site_config/standard/news-gazette.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@id="main-content"]//h2 | ||
2 | |||
3 | author: //div[@id="main-content"]//span[@class="authors"] | ||
4 | |||
5 | date: //div[@id="main-content"]//span[@class="timestamp"] | ||
6 | |||
7 | body: //div[@id="main-content"]//div[@class="content"] | ||
8 | test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.cnet.com.txt b/inc/3rdparty/site_config/standard/news.cnet.com.txt new file mode 100644 index 00000000..b7ab224a --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.cnet.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | # This should apply to all of *.cnet.com, not just news.cnet.com. | ||
2 | title: //h1 | ||
3 | author: //img[@class="mugshot"]/@alt | ||
4 | strip: //h1 | ||
5 | strip_id_or_class: breadcrumb | ||
6 | strip: //p[@id="introP"] | ||
7 | strip: //div[@class="postByline"] | ||
8 | strip: //div[@class="editorBio"] | ||
9 | strip: //div[@class="inline-slideshow"] | ||
10 | strip: //div[@class="related"] | ||
11 | body: //div[@class="postBody txtWrap"] | ||
12 | test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.detik.com.txt b/inc/3rdparty/site_config/standard/news.detik.com.txt new file mode 100644 index 00000000..3ed1dc85 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.detik.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title://div[@class="content_detail"]/h1 | ||
2 | |||
3 | author://div[@class="author"]/strong | ||
4 | |||
5 | date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB') | ||
6 | |||
7 | body://div[@class="text_detail"] | ||
8 | test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt b/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt new file mode 100644 index 00000000..6fc86137 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.kanaloco.jp.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='main'] | ||
2 | strip: //div[@id='sbs'] | ||
3 | strip: //div[@id='fsizeSwitch'] | ||
4 | strip: //div[@id='googleAd'] | ||
5 | strip: //div[@id='detailFoot'] | ||
6 | strip_image_src: counter?key | ||
7 | convert_double_br_tags: yes | ||
8 | |||
9 | test_url: http://news.kanaloco.jp/localnews/article/1105200018/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.mynavi.jp.txt b/inc/3rdparty/site_config/standard/news.mynavi.jp.txt new file mode 100644 index 00000000..ded680f1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.mynavi.jp.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h2[@class="lyt-hdg-02-04"] | ||
2 | |||
3 | author: //div[@class="lyt-namearea"]/a | ||
4 | |||
5 | date: //div[@class="lyt-namearea"]/text() | ||
6 | |||
7 | body: //div[@class="articleContent"] | ||
8 | |||
9 | strip: //div[@id="tab-aside"] | ||
10 | |||
11 | test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.orf.at.txt b/inc/3rdparty/site_config/standard/news.orf.at.txt new file mode 100644 index 00000000..b60deea4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.orf.at.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | ||
2 | |||
3 | title: //div[@class='hidden offscreen']/h2 | ||
4 | body: //div[@id="storyText"] | ||
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | ||
6 | strip: //small[@class='credit'] | ||
7 | strip: //small[@class='caption'] | ||
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | ||
9 | strip: //p[@class='toplink'] | ||
10 | |||
11 | test_url: http://news.orf.at/stories/2084731/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.rambler.ru.txt b/inc/3rdparty/site_config/standard/news.rambler.ru.txt new file mode 100644 index 00000000..743245f8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.rambler.ru.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //article | ||
2 | title: //h1 | ||
3 | author: //span[@class='b-article-source-dropdown'] | ||
4 | strip: //span[@class='b-article-photo-incut__source'] | ||
5 | strip: //a[@class='b-read-more b-read-more_bottom'] | ||
6 | |||
7 | |||
8 | tidy:no | ||
9 | test_url: http://news.rambler.ru/12972208/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.techmeme.com.txt b/inc/3rdparty/site_config/standard/news.techmeme.com.txt new file mode 100644 index 00000000..c80c3327 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.techmeme.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class='main']/div[@class='item'] | ||
2 | strip: //div[@class='right'] | ||
3 | |||
4 | test_url: http://news.techmeme.com/110516/fh-rip \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.yahoo.com.txt b/inc/3rdparty/site_config/standard/news.yahoo.com.txt new file mode 100644 index 00000000..5ee04049 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.yahoo.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | title: //h1[@class='headline'] | ||
3 | author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn'] | ||
4 | date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title | ||
5 | body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')] | ||
6 | #strip: //cite/abbr | ||
7 | strip_id_or_class: action | ||
8 | strip_id_or_class: prefetch | ||
9 | tidy: no | ||
10 | prune: no | ||
11 | |||
12 | test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/news.ycombinator.com.txt b/inc/3rdparty/site_config/standard/news.ycombinator.com.txt new file mode 100644 index 00000000..0b01f8a1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/news.ycombinator.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | strip_comments: no | ||
2 | strip: //a[. = 'reply'] | ||
3 | test_url: http://news.ycombinator.com/item?id=1516461 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newsbomb.gr.txt b/inc/3rdparty/site_config/standard/newsbomb.gr.txt new file mode 100644 index 00000000..0500890f --- /dev/null +++ b/inc/3rdparty/site_config/standard/newsbomb.gr.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | date: //meta[@name='og:article:published_time']/@value | ||
2 | |||
3 | body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText'] | ||
4 | |||
5 | strip_id_or_class: itemImageGallery | ||
6 | |||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newsle.com.txt b/inc/3rdparty/site_config/standard/newsle.com.txt new file mode 100644 index 00000000..e500ddcc --- /dev/null +++ b/inc/3rdparty/site_config/standard/newsle.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | single_page_link: //iframe/@src | ||
2 | test_url: http://newsle.com/article/0/15831103/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newsmill.se.txt b/inc/3rdparty/site_config/standard/newsmill.se.txt new file mode 100644 index 00000000..eb7d3350 --- /dev/null +++ b/inc/3rdparty/site_config/standard/newsmill.se.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1 | ||
2 | body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent'] | ||
3 | author: //div[@class='byline']//a[contains(@href, '/user/')] | ||
4 | |||
5 | strip_id_or_class: facts | ||
6 | strip_id_or_class: articleBlogsHolder | ||
7 | strip_id_or_class: byline | ||
8 | |||
9 | prune: no | ||
10 | tidy: no | ||
11 | |||
12 | test_url: http://www.newsmill.se/artikel/2012/05/06/medielogiken-v-ger-tyngre-n-reportrarnas-sikter \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newsunspun.org.txt b/inc/3rdparty/site_config/standard/newsunspun.org.txt new file mode 100644 index 00000000..860ad66b --- /dev/null +++ b/inc/3rdparty/site_config/standard/newsunspun.org.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@class='right']//div[@class='articles'] | ||
2 | author: //div[@id='artinfo']//a[contains(@href, '/author/')] | ||
3 | strip: //div[@id='artinfo'] | ||
4 | strip: //table[//a[contains(@href, 'twitter.com')]] | ||
5 | strip_id_or_class: twitter | ||
6 | |||
7 | prune: no | ||
8 | tidy: no | ||
9 | |||
10 | test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/newyorker.com.txt b/inc/3rdparty/site_config/standard/newyorker.com.txt new file mode 100644 index 00000000..5624aa8c --- /dev/null +++ b/inc/3rdparty/site_config/standard/newyorker.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1[@id='articlehed'] | //h2[@id="articleintro"] | ||
2 | body: //div[@id='articletext'] | ||
3 | |||
4 | strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"] | ||
5 | |||
6 | date: //h4[@id='articleauthor']/span[@class='dd dds'] | ||
7 | date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published'] | ||
8 | |||
9 | single_page_link: //div[@class='paginationViewSinglePage']/a | ||
10 | test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/next-gen.biz.txt b/inc/3rdparty/site_config/standard/next-gen.biz.txt new file mode 100644 index 00000000..806a3dfd --- /dev/null +++ b/inc/3rdparty/site_config/standard/next-gen.biz.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | # 2011-08-22 [carlo@...] initial version | ||
2 | # 2011-08-22 [carlo@...] removed comments & social links | ||
3 | |||
4 | tidy: no | ||
5 | |||
6 | single_page_link: //a[@class="single active"] | ||
7 | |||
8 | body: //div[@id="main"]//div[@class="content-region"]/article | ||
9 | author: //span[@class="author-name"] | ||
10 | date: //time/text() | ||
11 | |||
12 | strip: //aside[@id="related"] | |
13 | strip: //footer | ||
14 | |||
15 | title: //h1 | ||
16 | test_url: http://www.next-gen.biz/reviews/deus-ex-human-revolution-review \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nfl.com.txt b/inc/3rdparty/site_config/standard/nfl.com.txt new file mode 100644 index 00000000..70f92473 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nfl.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | # doesn't look like selecting an attribute value works? | ||
2 | # author: //meta[@id="authorName"]@value | ||
3 | |||
4 | author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ") | ||
5 | date: //abbr[@id="article-time"] | ||
6 | title: //div[@id="article-hdr"]/h1 | ||
7 | body: //div[@class="articleText"] | ||
8 | |||
9 | # strip miscellaneous teasers & etc | ||
10 | strip: //div[@class="removeformobile"] | ||
11 | test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt b/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt new file mode 100644 index 00000000..60834862 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ngm.nationalgeographic.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | next_page_link: //div[@class='nextpage_continue']/a | ||
2 | strip: //div[@class='nextpage_continue'] | ||
3 | strip_id_or_class: nextpage | ||
4 | title: //div[@class='article_title']//h1 | ||
5 | body: //div[@class='article_title']/.. | ||
6 | body: //div[@class='content'] | ||
7 | test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nhk.or.jp.txt b/inc/3rdparty/site_config/standard/nhk.or.jp.txt new file mode 100644 index 00000000..0a3bb913 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nhk.or.jp.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@id = 'news_right'] | ||
2 | test_url: http://www.nhk.or.jp/news/html/20110309/t10014559982000.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt b/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt new file mode 100644 index 00000000..409a8977 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nintendoworldreport.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | body: //div[@id="main"] | ||
2 | title: //div[@id="main"]/h3 | ||
3 | |||
4 | # Remove ‘Review’ and ‘Wii’. | ||
5 | strip: //div[@class="badge"] | ||
6 | |||
7 | # Remove duplicate title and country flag. | ||
8 | strip: //h3 | ||
9 | |||
10 | # Commented out below are attempts to extract the author and date, which did not work. | ||
11 | # author: //p[@class="extra "]/a | ||
12 | # date: //p[@class="extra "]/span[@class="when"] | ||
13 | test_url: http://www.nintendoworldreport.com/review/28400 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nojesguiden.se.txt b/inc/3rdparty/site_config/standard/nojesguiden.se.txt new file mode 100644 index 00000000..ae2d7e41 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nojesguiden.se.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: //span[@class='meta']/span[@class='username'] | ||
2 | body: //div[@class='article-content'] | ||
3 | |||
4 | strip_id_or_class: 'article-actions' | ||
5 | test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/northumberlandview.ca.txt b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt new file mode 100644 index 00000000..04a0a34d --- /dev/null +++ b/inc/3rdparty/site_config/standard/northumberlandview.ca.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='pn-maincontent'] | ||
3 | strip_id_or_class: z-menu | ||
4 | strip_id_or_class: news_category | ||
5 | strip_id_or_class: news_title | ||
6 | strip_id_or_class: news_modify | ||
7 | strip_id_or_class: news_morearticlesincat | ||
8 | strip_id_or_class: ezc_comments | ||
9 | strip_comments: yes | ||
10 | |||
11 | test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nplusonemag.com.txt b/inc/3rdparty/site_config/standard/nplusonemag.com.txt new file mode 100644 index 00000000..205b1af4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nplusonemag.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: /html/body/div[3]/div/div/h1 | ||
2 | |||
3 | body: //*[@id="article-body"] | ||
4 | |||
5 | |||
6 | test_url: http://nplusonemag.com/the-outskirts-of-progress \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/npr.org.txt b/inc/3rdparty/site_config/standard/npr.org.txt new file mode 100644 index 00000000..afab0eb3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/npr.org.txt | |||
@@ -0,0 +1,32 @@ | |||
1 | title: //div[contains(@class, 'storytitle')]//h1 | ||
2 | author: //p[@class="byline"]/span | ||
3 | body: //div[@id='storyspan02']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext'] | //div[@class='transcript'] | ||
4 | date: //meta[@name="date"]/@content | ||
5 | |||
6 | strip: //div[@class='enlarge_measure'] | ||
7 | strip: //div[@class='enlarge_html'] | ||
8 | strip: //a[@class='enlargeicon'] | ||
9 | strip: //div[contains(@class, 'bookedition')] | ||
10 | strip: //div[@class='textsize'] | ||
11 | strip: //ul[@class='genres'] | ||
12 | strip: //span[@class='bull'] | ||
13 | strip_id_or_class: secondary | ||
14 | strip_id_or_class: con1col | ||
15 | strip: //h3[@class='conheader'] | ||
16 | |||
17 | replace_string(<a name="more"> </a>): <!-- no more --> | ||
18 | replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2> | ||
19 | |||
20 | prune: no | ||
21 | strip://div[@class="ecommercepop"] | ||
22 | strip://span[@class="bull"] | ||
23 | strip://span[@class="purchaseLink"] | ||
24 | strip://div[@class="enlarge_html"] | ||
25 | strip://div[@class="enlarge_measure"] | ||
26 | strip://div[@class="container con1col small"] | ||
27 | strip://a[contains(@class, "enlargebtn")] | ||
28 | strip://div[contains(@class, "bucketwrap internallink")] | ||
29 | |||
30 | test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates | ||
31 | test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right | ||
32 | test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nybooks.com.txt b/inc/3rdparty/site_config/standard/nybooks.com.txt new file mode 100644 index 00000000..8ecb8961 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nybooks.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | strip_id_or_class: sIFR-alternate | ||
2 | title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2 | ||
3 | single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))] | ||
4 | |||
5 | body: //div[@id = 'article-body'] | ||
6 | strip_id_or_class:article-tools | ||
7 | strip_id_or_class:js_target | ||
8 | strip_id_or_class:marker | ||
9 | author://div[@id = 'page-title']/h3 | ||
10 | date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')] | ||
11 | |||
12 | |||
13 | test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nymag.com.txt b/inc/3rdparty/site_config/standard/nymag.com.txt new file mode 100644 index 00000000..f664c93d --- /dev/null +++ b/inc/3rdparty/site_config/standard/nymag.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h2[contains(@class, 'primary')] | ||
2 | body: //div[@id='story'] | ||
3 | author: //*[@class='by']/a | ||
4 | date: substring-after(//*[@class='date'], 'Published') | ||
5 | |||
6 | next_page_link: //div[@class='page-navigation']//li[@class='next']/a | ||
7 | |||
8 | test_url: http://nymag.com/news/features/wall-street-2012-2/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nyteknik.se.txt b/inc/3rdparty/site_config/standard/nyteknik.se.txt new file mode 100644 index 00000000..8c9e37f4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/nyteknik.se.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@class="article default-article"]/h1 | ||
2 | author: //p[@class="author"]/a[2] | ||
3 | |||
4 | # Article introduction: | ||
5 | #move_into(//div[@class="article-bread"]): //p[@class="lead"] | ||
6 | |||
7 | body: //div[@class="article-bread"] | ||
8 | test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nytimes.com.txt b/inc/3rdparty/site_config/standard/nytimes.com.txt new file mode 100644 index 00000000..8d9a794a --- /dev/null +++ b/inc/3rdparty/site_config/standard/nytimes.com.txt | |||
@@ -0,0 +1,36 @@ | |||
1 | title://h1[@class="articleHeadline"] | ||
2 | body://div[@id="article"] | ||
3 | strip_id_or_class:articleTools | ||
4 | strip_id_or_class:readerscomment | ||
5 | #strip://div[contains(@class, "articleInline runaroundLeft")] | ||
6 | strip: //div[contains(@class, "doubleRule")] | ||
7 | # strip image credit - appears as a bold heading | ||
8 | strip: //div[contains(@class, "articleInline")]//h6 | ||
9 | strip_id_or_class:enlargeThis | ||
10 | strip_id_or_class:pageLinks | ||
11 | strip_id_or_class:memberTools | ||
12 | strip_id_or_class:articleExtras | ||
13 | strip_id_or_class:singleAd | ||
14 | strip_id_or_class:byline | ||
15 | strip_id_or_class:dateline | ||
16 | strip_id_or_class:articleheadline | ||
17 | strip_id_or_class:articleBottomExtra | ||
18 | strip://a[contains(@href, 'nytimes.com/adx/')] | ||
19 | strip: //nyt_byline | ||
20 | strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')] | ||
21 | strip: //p[@class='caption']//a[contains(., 'More Photos')] | ||
22 | |||
23 | prune: no | ||
24 | tidy: no | ||
25 | |||
26 | date: substring-after(//*[contains(@class, 'dateline')], 'Published:') | ||
27 | |||
28 | single_page_link: //link[contains(@href, 'pagewanted=all')] | ||
29 | #single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))] | ||
30 | |||
31 | strip://ul[@id = 'toolsList'] | ||
32 | strip://h6[@class = 'kicker'] | ||
33 | author:substring-after(//h6[@class='byline'],'By ') | ||
34 | |||
35 | test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html | ||
36 | test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/nzz.ch.txt b/inc/3rdparty/site_config/standard/nzz.ch.txt new file mode 100644 index 00000000..81faabae --- /dev/null +++ b/inc/3rdparty/site_config/standard/nzz.ch.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //*[@class='article-full'] | ||
2 | title: //h3 | ||
3 | strip: //header[@class='group'] | ||
4 | #body: //p[@class='lead'] | ||
5 | #move_into(//p[@class='lead']): //*[@class='article-full']/figure | ||
6 | #move_into(//p[@class='lead']): //div[@id='articleBodyText'] | ||
7 | strip: //div[@id='social-media-floater'] | ||
8 | strip: //div[@class='advertisement'] | ||
9 | strip: //div[@class='infobox'] | ||
10 | strip: //div[@id='articleComments'] | ||
11 | |||
12 | test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/observer.com.txt b/inc/3rdparty/site_config/standard/observer.com.txt new file mode 100644 index 00000000..e409ca2e --- /dev/null +++ b/inc/3rdparty/site_config/standard/observer.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //article[contains(@class, 'instapaper_body')] | ||
2 | |||
3 | prune: no | ||
4 | |||
5 | single_page_link: //a[@id='print-button'] | ||
6 | |||
7 | test_url: http://www.observer.com/2008/would-you-take-tumblr-man \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/off.net.mk.txt b/inc/3rdparty/site_config/standard/off.net.mk.txt new file mode 100644 index 00000000..a2fb5f21 --- /dev/null +++ b/inc/3rdparty/site_config/standard/off.net.mk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[(@id = "content")] | ||
2 | strip: //div[(@class = "links-bar")] | ||
3 | strip: //div[(@class = "povrzani")] | ||
4 | strip: //div[(@class = "povrzani-dolu")] | ||
5 | strip: //div[(@class = "tags")] | ||
6 | strip: //h1[(@id = "page-title")] | ||
7 | test_url: http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/omaha.com.txt b/inc/3rdparty/site_config/standard/omaha.com.txt new file mode 100644 index 00000000..53db061d --- /dev/null +++ b/inc/3rdparty/site_config/standard/omaha.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='story'] | ||
2 | test_url: http://www.omaha.com/article/20111031/BIGRED/111039984#pelini-tremendous-challenge-ahead-for-huskers \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/omiliya.org.txt b/inc/3rdparty/site_config/standard/omiliya.org.txt new file mode 100644 index 00000000..1b39b625 --- /dev/null +++ b/inc/3rdparty/site_config/standard/omiliya.org.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='squeeze']/h1 | ||
2 | strip: //div[@id='squeeze']/h1 | ||
3 | author: //div[@class='submitted']/a | ||
4 | strip: //div[@class='submitted']/a | ||
5 | convert_double_br_tags: yes | ||
6 | |||
7 | |||
8 | |||
9 | test_url: http://omiliya.org/content/predchuvstvie.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/on.net.mk.txt b/inc/3rdparty/site_config/standard/on.net.mk.txt new file mode 100644 index 00000000..be7a17ef --- /dev/null +++ b/inc/3rdparty/site_config/standard/on.net.mk.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[(@class = "statija")] | ||
2 | strip: //div[(@class = "relatedBlock")] | ||
3 | strip: //div[(@class = "swftools")] | ||
4 | strip: //table[(@class = "links")] | ||
5 | test_url: http://on.net.mk/video/na-trkala/lamborghini-aventador-avionot-shto-ne-leta \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/online.wsj.com.txt b/inc/3rdparty/site_config/standard/online.wsj.com.txt new file mode 100644 index 00000000..edb52855 --- /dev/null +++ b/inc/3rdparty/site_config/standard/online.wsj.com.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | body: //div[@id='article_story_body'] | ||
3 | |||
4 | author: //h3[@class='byline']/a | ||
5 | # for slide show content | |
6 | body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1] | ||
7 | date: //li[@class='dateStamp']/small | ||
8 | |||
9 | strip_id_or_class: insetFullBracket | ||
10 | strip_id_or_class: insettipBox | ||
11 | #strip_id_or_class: legacyInset | ||
12 | strip_id_or_class: recipeACShopAndBuyText | ||
13 | |||
14 | strip: //div[contains(@class, 'insetContent')]//cite | ||
15 | strip: //*[contains(@style, 'visibility: hidden;')] | ||
16 | strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))] | ||
17 | |||
18 | prune: no | ||
19 | tidy: no | ||
20 | |||
21 | test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html | ||
22 | # slide show | ||
23 | test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/onlinewelten.com.txt b/inc/3rdparty/site_config/standard/onlinewelten.com.txt new file mode 100644 index 00000000..1609fa83 --- /dev/null +++ b/inc/3rdparty/site_config/standard/onlinewelten.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@id='news_detail']//div[@class='contents clearfix'] | ||
2 | test_url: http://www.onlinewelten.com/games/aliens-colonial-marines/news/offizielle-spiel-ankuendigung-nintendos-wii-u-103690/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/onstartups.com.txt b/inc/3rdparty/site_config/standard/onstartups.com.txt new file mode 100644 index 00000000..cccce8cd --- /dev/null +++ b/inc/3rdparty/site_config/standard/onstartups.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip: //div[@id="dnn_LeftPane"] | //div[@id="dnn_ContentPane"]//h1 | //div[@id="dnn_ContentPane"]//p[@class="Normal"] | //div[@class="Submissions"] | //div[@id="listing"]//h3 | //div[@id="listing"][2] | //div[@id="emart-fail"] | //div[@id="emart-success"] | //div[@id="emart-form"] | ||
2 | test_url: http://onstartups.com/tabid/3339/bid/37737/Secrets-Of-Freemium-Pricing-Make-The-Cheapskates-Pay.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/opensource.org.txt b/inc/3rdparty/site_config/standard/opensource.org.txt new file mode 100644 index 00000000..2bd3ccdb --- /dev/null +++ b/inc/3rdparty/site_config/standard/opensource.org.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='content clear-block'] | ||
2 | test_url: http://opensource.org/node/537 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/openthemagazine.com.txt b/inc/3rdparty/site_config/standard/openthemagazine.com.txt new file mode 100644 index 00000000..510eb252 --- /dev/null +++ b/inc/3rdparty/site_config/standard/openthemagazine.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id = 'content-inner'] | ||
2 | strip: //div[@id = 'content-bottom'] | ||
3 | strip_id_or_class: print_sharebutton | ||
4 | test_url: http://openthemagazine.com/article/nation/sania-vs-saina \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/openwebx.org.txt b/inc/3rdparty/site_config/standard/openwebx.org.txt new file mode 100644 index 00000000..b7663540 --- /dev/null +++ b/inc/3rdparty/site_config/standard/openwebx.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class="chapter"] | ||
2 | prune: no | ||
3 | tidy: no | ||
4 | test_url: http://openwebx.org/docs/springext.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/orf.at.txt b/inc/3rdparty/site_config/standard/orf.at.txt new file mode 100644 index 00000000..ff16ca79 --- /dev/null +++ b/inc/3rdparty/site_config/standard/orf.at.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | ||
2 | |||
3 | title: //div[@class='hidden offscreen']/h2 | ||
4 | body: //div[@id="storyText"] | ||
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | ||
6 | strip: //small[@class='credit'] | ||
7 | strip: //small[@class='caption'] | ||
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | ||
9 | strip: //p[@class='toplink'] | ||
10 | |||
11 | test_url: http://orf.at/stories/2084731/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/origo.hu.txt b/inc/3rdparty/site_config/standard/origo.hu.txt new file mode 100644 index 00000000..0dedac3d --- /dev/null +++ b/inc/3rdparty/site_config/standard/origo.hu.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: /html/body/div[5]/div[2]/h1 | ||
2 | body: /html/body/div[5]/div[2]/div[6]/div/div | ||
3 | body: //*[@id="cikk"] | ||
4 | strip: /html/body/div[5]/div[2]/h1 | ||
5 | strip: /html/body/div[5]/div[2]/div[4] | ||
6 | strip: //*[@id="multidoboz"] | ||
7 | strip: /html/body/div[5]/div[2]/div[6]/div[2] | ||
8 | strip: //*[@id="comments"] | ||
9 | strip: //*[@id="rating-doboz"] | ||
10 | strip: /html/body/div[5]/div[2]/div[10] | ||
11 | strip: /html/body/div[5]/div[2]/a | ||
12 | strip: /html/body/div[5]/div[2]/span | ||
13 | strip: /html/body/div[5]/div[2]/span[2] | ||
14 | strip: /html/body/div[5]/div[2]/span[3] | ||
15 | strip: /html/body/div[5]/div[2]/span[4] | ||
16 | strip: /html/body/div[5]/div[2]/span[5] | ||
17 | strip: //*[@id="kommentszam"] | ||
18 | test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt b/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt new file mode 100644 index 00000000..f03c9551 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pakistantvdekho.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | ||
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | ||
3 | body: (//div[starts-with(@id, 'post_message')])[1] | ||
4 | |||
5 | prune: no | ||
6 | tidy: no | ||
7 | |||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | ||
9 | #replace_string(</iframe>): </iframe> </div> | ||
10 | |||
11 | test_url: http://pakistantvdekho.com/showthread.php?647741-Sitam-Gar-by-HUM-TV-Episode-07&p=659080#post659080 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pandagon.net.txt b/inc/3rdparty/site_config/standard/pandagon.net.txt new file mode 100644 index 00000000..d0d2a5d0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pandagon.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title://h2 | ||
2 | author://div[@class="posted"]/a | ||
3 | date://div[@class="date"] | ||
4 | body://div[@class="entry"] | ||
5 | test_url: http://pandagon.net/index.php/site/its-okay-to-admit-that-mass-hysteria-is-real \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pandodaily.com.txt b/inc/3rdparty/site_config/standard/pandodaily.com.txt new file mode 100644 index 00000000..7d1c2183 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pandodaily.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | tidy: no | ||
2 | body: //article | ||
3 | date: //time/@datetime | ||
4 | strip_id_or_class: sharedaddy | ||
5 | test_url: http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/panic.com.txt b/inc/3rdparty/site_config/standard/panic.com.txt new file mode 100644 index 00000000..0361f06d --- /dev/null +++ b/inc/3rdparty/site_config/standard/panic.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='entry'] | ||
2 | date: //h3[@class='postDate'] | ||
3 | test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/parislemon.com.txt b/inc/3rdparty/site_config/standard/parislemon.com.txt new file mode 100644 index 00000000..a3bd4b0f --- /dev/null +++ b/inc/3rdparty/site_config/standard/parislemon.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2[@class="post-title"] | ||
2 | author: substring-after(//div[@class="description"],'Words by ') | ||
3 | date: //li[@class="date"] | ||
4 | strip: //h2[@class="post-title"] | ||
5 | body: //div[@class="copy"] | ||
6 | test_url: http://parislemon.com/post/13462682469/the-15-inch-air \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/parliament.uk.txt b/inc/3rdparty/site_config/standard/parliament.uk.txt new file mode 100644 index 00000000..478a669f --- /dev/null +++ b/inc/3rdparty/site_config/standard/parliament.uk.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='news-article'] | ||
3 | test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pastebin.com.txt b/inc/3rdparty/site_config/standard/pastebin.com.txt new file mode 100644 index 00000000..89d13b2a --- /dev/null +++ b/inc/3rdparty/site_config/standard/pastebin.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title://div[@class="paste_box_line1"]/h1 | ||
2 | author://div[@class="paste_box_line2"]/a | ||
3 | body://div[@class="text"] | ||
4 | date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|') | ||
5 | dissolve://li | ||
6 | test_url: http://pastebin.com/LAykd1es \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt b/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt new file mode 100644 index 00000000..40a049e0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pastepad.fivefilters.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='ff-pastepad-content'] | ||
3 | prune: no | ||
4 | # todo: add test file | ||
5 | test_url: http://pastepad.fivefilters.org/test.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pathawks.com.txt b/inc/3rdparty/site_config/standard/pathawks.com.txt new file mode 100644 index 00000000..1a4cd25b --- /dev/null +++ b/inc/3rdparty/site_config/standard/pathawks.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title://*[contains(@class,'post-title')] | ||
2 | body://div[contains(@class,'post-body')] | ||
3 | body://div[contains(@class,'entry-content')] | ||
4 | strip_comments:no | ||
5 | prune:no | ||
6 | convert_double_br_tags:yes | ||
7 | tidy:yes | ||
8 | test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pcast.me.txt b/inc/3rdparty/site_config/standard/pcast.me.txt new file mode 100644 index 00000000..ae38e8e1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pcast.me.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | prune: no | ||
2 | test_url: http://pcast.me/shownotes/get/16t \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pcmag.com.txt b/inc/3rdparty/site_config/standard/pcmag.com.txt new file mode 100644 index 00000000..cebea4d7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pcmag.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | prune:yes | ||
2 | |||
3 | date://*[contains(@class,'date')] | ||
4 | |||
5 | body://div[contains(@id,'content')] | ||
6 | |||
7 | next_page_link://a[contains(.,'Next >')] | ||
8 | |||
9 | strip_id_or_class:sponsors | ||
10 | test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pcworld.com.txt b/inc/3rdparty/site_config/standard/pcworld.com.txt new file mode 100644 index 00000000..30ccbb5f --- /dev/null +++ b/inc/3rdparty/site_config/standard/pcworld.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //div[@class='articleHead']//h1 | ||
2 | author: //div[@class="author-name"]/a[1] | ||
3 | body: //div[@class="main"] | ||
4 | |||
5 | # remove 'From the Lab' and 'Recent posts' text | ||
6 | strip: //div[@class='blogLabel'] | ||
7 | |||
8 | # remove byline and meta info | ||
9 | strip: //h1 | ||
10 | strip: //div[@class="article-meta"] | ||
11 | strip: //div[@class="author-info"] | ||
12 | |||
13 | #strip tags and categories | ||
14 | strip: //div[@class="department"] | ||
15 | |||
16 | #strip product cap links | ||
17 | strip: //div[@class="cap-main"] | ||
18 | strip: //div[@id="compare-lede"] | ||
19 | test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/penny-arcade.com.txt b/inc/3rdparty/site_config/standard/penny-arcade.com.txt new file mode 100644 index 00000000..f97615f1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/penny-arcade.com.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | # 2012-01-14 carlo@... - fixed title, body; added author, date | ||
2 | |||
3 | title: //div[@class="title"]/h2/a | ||
4 | # body: //div[@class="post"] | ||
5 | # author: //p[@class="iconEmail"]/a | ||
6 | # date: //p[@class="iconDate"] | ||
7 | |||
8 | # 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report | ||
9 | |||
10 | # Penny Arcade | ||
11 | |||
12 | author: //li[@class="iconEmail"]/a | ||
13 | date: //li[@class="iconDate"] | ||
14 | body: //div[@class="body"] | ||
15 | |||
16 | # PA Report | ||
17 | |||
18 | author: //div[@class="meta"]/p/a | ||
19 | date: substring-after(//div[@class="meta"]/p, '/ ') | ||
20 | title: substring-after(//title, '- ') | ||
21 | |||
22 | test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news | ||
23 | test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pentaxforums.com.txt b/inc/3rdparty/site_config/standard/pentaxforums.com.txt new file mode 100644 index 00000000..00f61a48 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pentaxforums.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | next_page_link: //a[contains(., 'Next:')] | ||
2 | test_url: http://www.pentaxforums.com/reviews/long-exposure-handhelds/introduction.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt b/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt new file mode 100644 index 00000000..a369fd65 --- /dev/null +++ b/inc/3rdparty/site_config/standard/philadelphiaeagles.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | prune: no | ||
2 | tidy: no | ||
3 | body: //div[@class='article-content'] | ||
4 | dissolve: //nobr/a | ||
5 | dissolve: //nobr | ||
6 | test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/philly.com.txt b/inc/3rdparty/site_config/standard/philly.com.txt new file mode 100644 index 00000000..41318f63 --- /dev/null +++ b/inc/3rdparty/site_config/standard/philly.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | author: //p[@class='byline']/span | ||
3 | body: //*[@id='body-content'] | |
4 | date: //div[@class='article_timestamp']/span | ||
5 | |||
6 | strip: //*[@class='b-group'] | |
7 | strip: //*[contains(@style, 'none')] | ||
8 | strip: //a[contains(@href, 'comments')] | ||
9 | strip: //*[contains(@class, 'comment')] | ||
10 | test_url: http://www.philly.com/philly/sports/eagles/20120127_Ohio_State_s_Posey_didn_t_waste_time_lost_to_suspension.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt b/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt new file mode 100644 index 00000000..4e2ccb01 --- /dev/null +++ b/inc/3rdparty/site_config/standard/photo.tutsplus.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | author: substring-before(//div[@class='post_meta'],' on') | ||
2 | date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on') | ||
3 | title: //h1[@class='post_title'] | ||
4 | body: //div[@class='article'] | ||
5 | |||
6 | test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/php.net.txt b/inc/3rdparty/site_config/standard/php.net.txt new file mode 100644 index 00000000..7c57a84d --- /dev/null +++ b/inc/3rdparty/site_config/standard/php.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='content'] | ||
2 | strip_id_or_class: manualnavbar | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.php.net/manual/en/migration5.incompatible.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/physicstoday.org.txt b/inc/3rdparty/site_config/standard/physicstoday.org.txt new file mode 100644 index 00000000..a8163995 --- /dev/null +++ b/inc/3rdparty/site_config/standard/physicstoday.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //div[@class='abstitle']//h1 | ||
2 | author: //div[@class='authorList'] | ||
3 | body: //div[@id='fulltext_body'] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.physicstoday.org/resource/1/phtoad/v64/i10/p48_s1?bypassSSO=1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pitchfork.com.txt b/inc/3rdparty/site_config/standard/pitchfork.com.txt new file mode 100644 index 00000000..3decc538 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pitchfork.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title:concat(//h1,' - ',//h2,' - ',//h3) | ||
2 | author://address | ||
3 | date://span[@class='pub-date'] | ||
4 | body://div[@id='main'] | ||
5 | single_page_link://link[@rel='canonical'] | ||
6 | strip://div[@class='info'] | ||
7 | strip_id_or_class:'object-grid related-content' | ||
8 | strip_id_or_class:'object-prevnext' | ||
9 | strip_id_or_class:'object-header' | ||
10 | strip_id_or_class:'source' | ||
11 | strip_id_or_class:'label' | ||
12 | strip_id_or_class:'title' | ||
13 | dissolve://ul | ||
14 | strip://li[@class='next'] | ||
15 | strip://li[@class='prev'] | ||
16 | test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittnews.com.txt b/inc/3rdparty/site_config/standard/pittnews.com.txt new file mode 100644 index 00000000..92777073 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittnews.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h2[@class='post-title'] | ||
2 | author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/') | ||
3 | date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in') | ||
4 | strip: //h2[@class='post-title'] | ||
5 | strip: //p[@class='post-details'] | ||
6 | strip: //h3[@class='post-byline'] | ||
7 | body: //div[@id='content'] | ||
8 | test_url: http://pittnews.com/newsstory/mens-basketball-pitt-recruit-robinson-to-bring-leadership/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt b/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt new file mode 100644 index 00000000..824cb064 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittsburgh.pirates.mlb.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: substring-before(//title,'pirates.com') | ||
2 | date: //span[@class='timeStamp'] | ||
3 | author: substring-before(substring-after(//div[@class='byLine'],'By'),'/') | ||
4 | body: //div[@id='article'] | ||
5 | #strip: //div[@class='inner'] | ||
6 | strip: //div[@id='article_head'] | ||
7 | strip: //p[@class='tagLine'] | ||
8 | strip: //div[@id='article_related_links'] | ||
9 | strip: //div[@id='article_related_mlb'] | ||
10 | strip: //div[@id='article_related_club'] | ||
11 | strip: //span[@class='more'] | ||
12 | strip: //div[@class='article_component'] | ||
13 | strip: //span[@class='screen_reader'] | ||
14 | strip: //ul[@class='columnists_blurb'] | ||
15 | test_url: http://pittsburgh.pirates.mlb.com/news/article.jsp?ymd=20120330&content_id=27759040&vkey=news_pit&c_id=pit \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittsburghlive.com.txt b/inc/3rdparty/site_config/standard/pittsburghlive.com.txt new file mode 100644 index 00000000..b3e66166 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittsburghlive.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: substring-before(//title,'- Pittsburgh Tribune') | ||
2 | author: substring-before(substring-after(//div[@class='byline'],'By '),',') | ||
3 | date: substring-after(substring-after(//div[@class='byline'],','),',') | ||
4 | body: //div[@id='storyBody'] | ||
5 | strip: //div[@class='morestories'] | ||
6 | dissolve: //p[@class='subheader'] | ||
7 | test_url: http://www.pittsburghlive.com/x/pittsburghtrib/sports/columnists/s_785654.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt b/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt new file mode 100644 index 00000000..dd715d8f --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittsburghmagazine.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //title | ||
2 | author: substring-after(//div[@class='by-line'],'BY') | ||
3 | |||
4 | body: //div[@id='article-body'] | ||
5 | |||
6 | strip: //div[@class='by-line'] | ||
7 | strip: //div[@id='article-body']/h1 | ||
8 | test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt b/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt new file mode 100644 index 00000000..6113b96e --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittsburghpanthers.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //span[@class='StoryHeadline'] | ||
2 | strip: //div[@class='fivevert'] | ||
3 | body: //div[@id='Content'] | ||
4 | test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pittscriptblog.com.txt b/inc/3rdparty/site_config/standard/pittscriptblog.com.txt new file mode 100644 index 00000000..3936310d --- /dev/null +++ b/inc/3rdparty/site_config/standard/pittscriptblog.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class='articletitle'] | ||
2 | author: substring-after(//span[@class='author'],'by') | ||
3 | date: //span[@class='created'] | ||
4 | body: //div[@class='article'] | ||
5 | strip: //div[@class='headline'] | ||
6 | strip: //p[@class='articleinfo'] | ||
7 | #dissolve: //p[@class='subheader'] | ||
8 | test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/playboy.com.txt b/inc/3rdparty/site_config/standard/playboy.com.txt new file mode 100644 index 00000000..07b347a0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/playboy.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | author: //article//*[@class="author"] | ||
2 | date: //article//*[@class="publication-date"] | ||
3 | body: //article | ||
4 | strip: //article/header | ||
5 | strip: //article/section | ||
6 | test_url: http://www.playboy.com/playground/view/playboy-interview-jon-hamm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/plus.google.com.txt b/inc/3rdparty/site_config/standard/plus.google.com.txt new file mode 100644 index 00000000..50a5dbf5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/plus.google.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | body: //div[@id='contentPane']//div[@class='vg'] | ||
2 | body: //div[@id='contentPane'] | ||
3 | |||
4 | # Grab the author by finding the first profile pic, then backing up a node and getting the title of <a> tag which will be the author hopefully. Sorry can't test this due to parser errors, thanks google :( | ||
5 | |||
6 | author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title | ||
7 | |||
8 | |||
9 | strip: //*[@title="People who +1'd this"]/../.. | ||
10 | strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')] | ||
11 | strip: //*[@role='menu'] | ||
12 | strip: //img[contains(@alt, 'profile photo')] | ||
13 | strip: //*[@class='a-f-i-Ad'] | ||
14 | |||
15 | tidy: no | ||
16 | |||
17 | test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/plzkthxbai.com.txt b/inc/3rdparty/site_config/standard/plzkthxbai.com.txt new file mode 100644 index 00000000..bb9be0a9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/plzkthxbai.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h2[@class='jcw-pagetitle'] | ||
2 | date: //p[@class='postinfo'] | ||
3 | body: //div[@class='contenttext'] | ||
4 | test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt b/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt new file mode 100644 index 00000000..880311d3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/pogue.blogs.nytimes.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id="content"]/div[1] | ||
2 | |||
3 | title: //h1[@class="entry-title"] | ||
4 | test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt new file mode 100644 index 00000000..121fd5b9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/politico.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title://div[contains(@class, "article")]/h1 | ||
2 | body://div[contains(@class,"story-text")] | ||
3 | |||
4 | # Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] | ||
5 | |||
6 | next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a | ||
7 | date://meta[@name="publish_date"]/@content | ||
8 | |||
9 | strip://div[contains(@class, "breadcrumbs")] | ||
10 | strip://a[contains(@class, "hidden")] | ||
11 | strip://div[contains(@class, "story-embed")] | ||
12 | strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. | ||
13 | test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/politifact.com.txt b/inc/3rdparty/site_config/standard/politifact.com.txt new file mode 100644 index 00000000..fd247b5b --- /dev/null +++ b/inc/3rdparty/site_config/standard/politifact.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id="content"] | ||
2 | |||
3 | strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"] | ||
4 | test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/politiken.dk.txt b/inc/3rdparty/site_config/standard/politiken.dk.txt new file mode 100644 index 00000000..8deecbca --- /dev/null +++ b/inc/3rdparty/site_config/standard/politiken.dk.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | # 21/10-2011: | ||
2 | # Added Author+Date | ||
3 | # Remove fakta-boks if found | ||
4 | # Deleted 'Læs også...' filter | ||
5 | # - Change in markup caused it to strip too much. | ||
6 | |||
7 | author://span[@class='autor-name'] | ||
8 | date:substring-after(//div[@class='art-created'], ' ') | ||
9 | title: //h1[contains(@class, 'stor-type')] | ||
10 | body: //div[@id='art-body'] | ||
11 | strip: //div[@class='art-fakta article-box'] | ||
12 | |||
13 | test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/popularmechanics.com.txt b/inc/3rdparty/site_config/standard/popularmechanics.com.txt new file mode 100644 index 00000000..85b7656b --- /dev/null +++ b/inc/3rdparty/site_config/standard/popularmechanics.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | next_page_link: //div[@id='longPagination']/a[@class='next'] | ||
2 | |||
3 | title: //div[@id='contentHeader']//h1 | ||
4 | |||
5 | body: //div[@id='articleBody'] | ||
6 | # this is so sad | ||
7 | body: //div[@id='intelliTXT'] | ||
8 | test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/positioningmag.com.txt b/inc/3rdparty/site_config/standard/positioningmag.com.txt new file mode 100644 index 00000000..21cd833c --- /dev/null +++ b/inc/3rdparty/site_config/standard/positioningmag.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //div[@id="newsDetailTitle"] | ||
2 | author: //span[@id="showAuthor"] | ||
3 | date: //span[@id="showRefDate"] | ||
4 | |||
5 | strip: //div[@id="breadcrumbs"] | ||
6 | strip: //span[@id="PageTitle"] | ||
7 | strip: //div[@id="newsDetailAuthorPublish"] | ||
8 | |||
9 | strip: //div[@class="leadPix"] | ||
10 | |||
11 | strip: //span[@id="ctl00_PageTitle"] | ||
12 | strip: //div[@id="newsDetailTitle"] | ||
13 | convert_double_br_tags:yes | ||
14 | |||
15 | strip: //div[@id="newsDetailCredential"] | ||
16 | strip: //div[@id="sidebar2"] | ||
17 | strip: //div[@id="footer"] | ||
18 | |||
19 | test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/post-gazette.com.txt b/inc/3rdparty/site_config/standard/post-gazette.com.txt new file mode 100644 index 00000000..1ea945a0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/post-gazette.com.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | title: //div[@class='story_headline'] | ||
2 | author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/') | ||
3 | date: //div[@class='story_lastupdate'] | ||
4 | body: //div[@id='story'] | ||
5 | strip: //div[@class='story_byline'] | ||
6 | strip: //div[@class='story_lastupdate'] | ||
7 | strip: //div[@class='story_headline'] | ||
8 | strip: //div[@id='abuse'] | ||
9 | strip: //h2 | ||
10 | strip: //div[@class='pagenumbers_wrap'] | ||
11 | strip: //ul[@class='pagenumbers'] | ||
12 | strip: //div[starts-with(., 'To report inappropriate comments')] | ||
13 | |||
14 | strip_id_or_class: story_share | ||
15 | strip_id_or_class: OUTBRAIN | ||
16 | strip_id_or_class: story_box_right | ||
17 | strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']] | ||
18 | strip: //ul[@id='pikame']/li[position()>1] | ||
19 | |||
20 | prune: no | ||
21 | tidy: no | ||
22 | |||
23 | single_page_link: //a[contains(@href, '?p=0')] | ||
24 | |||
25 | test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/ | ||
26 | test_url: http://www.post-gazette.com/stories/sports/pirates/pirates-fork-over-changes-for-fans-at-pnc-park-629789 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/posta.com.tr.txt b/inc/3rdparty/site_config/standard/posta.com.tr.txt new file mode 100644 index 00000000..86cb5d0b --- /dev/null +++ b/inc/3rdparty/site_config/standard/posta.com.tr.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //div[@id='divAdnetKeyword']/h1 | ||
2 | body: //div[@id='_middle_content_bottom'] | ||
3 | |||
4 | wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img | ||
5 | |||
6 | strip: //div[@id='_middle_content_bottom_child1'] | ||
7 | strip: //div[@id='_middle_content_bottom_child4'] | ||
8 | strip: //div[@class='cls'] | ||
9 | strip: //div[@class='iphoneBox'] | ||
10 | strip: //ul[@class='ilgiliHaber'] | ||
11 | strip: //div[@class='yorumlar'] | ||
12 | strip: //div[@class='kategoriler'] | ||
13 | strip: //div[@class='textSize'] | ||
14 | strip: //span[@class='tarih'] | ||
15 | test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/prb.org.txt b/inc/3rdparty/site_config/standard/prb.org.txt new file mode 100644 index 00000000..7f7a5031 --- /dev/null +++ b/inc/3rdparty/site_config/standard/prb.org.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | date: /html/head/meta[@name="date"]/@content | ||
3 | body: //div[@id="featuredlinksbox"] | ||
4 | strip: //div[@class="relatedbox"] | ||
5 | strip: //h1 | ||
6 | strip: //br | ||
7 | strip_image_src: "/images" | ||
8 | test_url: http://www.prb.org/Journalists/Webcasts/2011/military-families.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt b/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt new file mode 100644 index 00000000..906c27a0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/prog21.dadgum.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='left'] | ||
3 | strip: //h1 | ||
4 | convert_double_br_tags: yes | ||
5 | strip_id_or_class: entry-footer | ||
6 | strip: //h1[. = 'Previously']/following::* | ||
7 | author: string('James Hague') | ||
8 | date: //div[@class = 'entry-footer']/text() | ||
9 | test_url: http://prog21.dadgum.com/105.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/prolost.com.txt b/inc/3rdparty/site_config/standard/prolost.com.txt new file mode 100644 index 00000000..cef811d4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/prolost.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class='body'] | ||
2 | title: //h2[@class='title'] | ||
3 | date: //span[@class='posted-on'] | ||
4 | test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/propublica.org.txt b/inc/3rdparty/site_config/standard/propublica.org.txt new file mode 100644 index 00000000..11e63bd0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/propublica.org.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[@class="article-title"] | ||
2 | author: //meta[@name="author"]/@content | ||
3 | body: //div[@class="article-full"] | ||
4 | strip_id_or_class: sidebar_inject | ||
5 | strip_id_or_class: callout | ||
6 | strip_id_or_class: content-inset | ||
7 | strip_id_or_class: byline-block | ||
8 | strip_id_or_class: photo-caption | ||
9 | strip_id_or_class: foot-tools | ||
10 | |||
11 | test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/prosa.dk.txt b/inc/3rdparty/site_config/standard/prosa.dk.txt new file mode 100644 index 00000000..dedd33d3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/prosa.dk.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //p[@class='name'] | ||
2 | date: substring-before(//p[@class='date'], ' | ') | ||
3 | body: //div[@class='news_single_item'] | ||
4 | test_url: http://www.prosa.dk/aktuelt/nyhed/artikel/internetaktivisten-uden-maske/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt b/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt new file mode 100644 index 00000000..19059c4a --- /dev/null +++ b/inc/3rdparty/site_config/standard/prospectmagazine.co.uk.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | #basics | ||
2 | author: (//div[contains(@class,'author')])[1] | ||
3 | date: substring-before(//a[@class='issue'], '—') | ||
4 | #body://div[@class = 'entry'] | ||
5 | # use this until move_into support is ready | ||
6 | body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image'] | ||
7 | |||
8 | #moves header image and tagline into body | ||
9 | move_into(//div[@class='entry']/div)://div[@class = 'lead_image'] | ||
10 | move_into(//div[@class='entry']/div)://div[@class = 'standfirst'] | ||
11 | |||
12 | |||
13 | # moves author info to end of text | ||
14 | move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em | ||
15 | |||
16 | prune: no | ||
17 | |||
18 | # strips social links | ||
19 | strip_id_or_class:login-status | ||
20 | strip_id_or_class:shareinpost | ||
21 | strip_id_or_class:content_subscribe | ||
22 | strip_id_or_class:postinfo | ||
23 | strip_id_or_class:postutils | ||
24 | strip_id_or_class:comments | ||
25 | strip://strong[string(.) = 'Follow Prospect on Twitter'] | ||
26 | test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/psychologytoday.com.txt b/inc/3rdparty/site_config/standard/psychologytoday.com.txt new file mode 100644 index 00000000..3da3cea3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/psychologytoday.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@class="page-title"]/h1 | ||
2 | author: //a[@title="View Bio"] | ||
3 | date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by') | ||
4 | strip://div[@class="page-title"]/h1 | ||
5 | strip://div[@class="article-abstract"] | ||
6 | strip://div[@class="article-meta"] | ||
7 | strip://div[@id="rightColumn"] | ||
8 | strip://div[@id="inline-content-bottom-left"] | ||
9 | test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/publications.parliament.uk.txt b/inc/3rdparty/site_config/standard/publications.parliament.uk.txt new file mode 100644 index 00000000..fa099473 --- /dev/null +++ b/inc/3rdparty/site_config/standard/publications.parliament.uk.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //meta[@name="Author"]/@content | ||
2 | date: //meta[@name="Date"]/@content | ||
3 | strip: //h5 | ||
4 | test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt b/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt new file mode 100644 index 00000000..126f9e27 --- /dev/null +++ b/inc/3rdparty/site_config/standard/purpleplanetmedia.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[@class='title'] | ||
2 | body: //div[@class='body'] | ||
3 | next_page_link: //div[@class='source']/text()[contains(., 'page')]/following-sibling::a | ||
4 | test_url: http://purpleplanetmedia.com/eye/inte/ngaiman.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/quantumdiaries.org.txt b/inc/3rdparty/site_config/standard/quantumdiaries.org.txt new file mode 100644 index 00000000..a366c1b3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/quantumdiaries.org.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //div[contains(@class, "hentry")]/h3 | ||
2 | |||
3 | author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")] | ||
4 | |||
5 | date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under") | ||
6 | |||
7 | body: //div[contains(@class, "entry")] | ||
8 | |||
9 | strip_id_or_class: addtoany_share_save_container | ||
10 | strip_id_or_class: postmetadata | ||
11 | strip_id_or_class: author_bio | ||
12 | strip_id_or_class: author_bio_2 | ||
13 | strip: //div[contains(@class, "hentry")]/h3 | ||
14 | test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/queerty.com.txt b/inc/3rdparty/site_config/standard/queerty.com.txt new file mode 100644 index 00000000..655f8b80 --- /dev/null +++ b/inc/3rdparty/site_config/standard/queerty.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='copy'] | ||
2 | title: //h1[@class='hed'] | ||
3 | test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/quepasa.cl.txt b/inc/3rdparty/site_config/standard/quepasa.cl.txt new file mode 100644 index 00000000..fae4e6a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/quepasa.cl.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | body: //div[@class="cuerpoArticulo"] | ||
4 | |||
5 | |||
6 | test_url: http://www.quepasa.cl/magazine/articulo/print.html?id=5299 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/quora.com.txt b/inc/3rdparty/site_config/standard/quora.com.txt new file mode 100644 index 00000000..3d34f2f8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/quora.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | tidy: no | ||
2 | prune: no | ||
3 | body: //div[contains(@class, 'main_col')] | ||
4 | title: //h1 | ||
5 | |||
6 | strip_id_or_class: hidden | ||
7 | strip_id_or_class: item_action_bar | ||
8 | strip_id_or_class: answer_voters | ||
9 | strip_id_or_class: question_topics | ||
10 | strip_id_or_class: answer_header_text | ||
11 | strip_id_or_class: editor_link | ||
12 | strip_id_or_class: view_tag | ||
13 | strip_id_or_class: include_details | ||
14 | strip_id_or_class: sig_edit | ||
15 | strip_id_or_class: profile_photo_img | ||
16 | |||
17 | test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/radar.oreilly.com.txt b/inc/3rdparty/site_config/standard/radar.oreilly.com.txt new file mode 100644 index 00000000..99ab4bb1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/radar.oreilly.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | date://span[@class='date'] | ||
2 | body://div[@class='entry-body'] | ||
3 | test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/radionz.co.nz.txt b/inc/3rdparty/site_config/standard/radionz.co.nz.txt new file mode 100644 index 00000000..e2617dc5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/radionz.co.nz.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='body'] | ||
2 | title: //div[@class='newsstory']/h2 | ||
3 | test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/randsinrepose.com.txt b/inc/3rdparty/site_config/standard/randsinrepose.com.txt new file mode 100644 index 00000000..f0c91c51 --- /dev/null +++ b/inc/3rdparty/site_config/standard/randsinrepose.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //div[@id='center-col']/h4 | ||
2 | author: substring-before(//title,'In') | ||
3 | date: substring-after(//div[@class='commenttext']/span,'#') | ||
4 | body: //div[@id='center-col'] | ||
5 | strip: //div[@id='center-col']/h4 | ||
6 | strip: //div[@class='graytext'] | ||
7 | |||
8 | # Anthony Perez-Sanz 2012.3.14 | ||
9 | # Removed long gif from the end | ||
10 | strip: //img[@src='http://www.randsinrepose.com/spreader.gif'] | ||
11 | test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/readability.com.txt b/inc/3rdparty/site_config/standard/readability.com.txt new file mode 100644 index 00000000..80337291 --- /dev/null +++ b/inc/3rdparty/site_config/standard/readability.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //link[@rel='canonical']/@href | ||
2 | |||
3 | test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/readwriteweb.com.txt b/inc/3rdparty/site_config/standard/readwriteweb.com.txt new file mode 100644 index 00000000..ff799aa0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/readwriteweb.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1[@class="titlelink"] | ||
2 | date: //span[@class="timestamp"]/@data-published | ||
3 | body: //div[@class="asset-content"] | ||
4 | strip_id_or_class: related-entries | ||
5 | strip_id_or_class: like-and-retweet | ||
6 | |||
7 | author: //div[@id="submeta"]/a[1] | ||
8 | test_url: http://www.readwriteweb.com/archives/why_facebook_terrifies_google.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/real.gr.txt b/inc/3rdparty/site_config/standard/real.gr.txt new file mode 100644 index 00000000..fe5ab672 --- /dev/null +++ b/inc/3rdparty/site_config/standard/real.gr.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='_ctl12__ctl0_Article'] | ||
2 | prune: no | ||
3 | autodetect_on_failure: no \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/recipe.com.txt b/inc/3rdparty/site_config/standard/recipe.com.txt new file mode 100644 index 00000000..8c8f0e0c --- /dev/null +++ b/inc/3rdparty/site_config/standard/recipe.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients'] | ||
2 | |||
3 | strip_id_or_class: location | ||
4 | strip_id_or_class: savings | ||
5 | strip_id_or_class: recipeDetailDescButton | ||
6 | |||
7 | prune: no | ||
8 | tidy: no | ||
9 | |||
10 | test_url: http://www.recipe.com/avocado-basil-pasta/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/red-hot-girls.com.txt b/inc/3rdparty/site_config/standard/red-hot-girls.com.txt new file mode 100644 index 00000000..3ae959b1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/red-hot-girls.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='short-text' or starts-with(@id, 'news-id-')] | ||
2 | prune: no | ||
3 | tidy: no | ||
4 | |||
5 | test_url: http://red-hot-girls.com/2011/06/10/the_red_hot_natalia_maria_53_pics.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/reddit.com.txt b/inc/3rdparty/site_config/standard/reddit.com.txt new file mode 100644 index 00000000..58ca9ece --- /dev/null +++ b/inc/3rdparty/site_config/standard/reddit.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | # This setup grabs the text from a Reddit self post. It ignores all comments etc. | ||
2 | |||
3 | title: //p[@class="title"]/a/text() | ||
4 | |||
5 | author: //p[@class="tagline"]/a | ||
6 | |||
7 | # this doesn't work for some reason...? | ||
8 | date: //p[@class="tagline"]//@datetime | ||
9 | |||
10 | body: //div[@class="expando"]//div[@class="usertext-body"] | ||
11 | |||
12 | strip_id_or_class: tagline | ||
13 | strip_id_or_class: unvotable-message | ||
14 | strip_id_or_class: buttons | ||
15 | |||
16 | test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/redmondpie.com.txt b/inc/3rdparty/site_config/standard/redmondpie.com.txt new file mode 100644 index 00000000..12a96187 --- /dev/null +++ b/inc/3rdparty/site_config/standard/redmondpie.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //div[@class='posthead']//h2 | ||
2 | body: //div[contains(@class, 'postcontent') or @class='posthead'] | ||
3 | author: //div[@class='posthead']//a[@rel='author'] | ||
4 | |||
5 | strip: //div[@class='posthead']//h2 | ||
6 | replace_string(>Advertisements</div>): ></div> | ||
7 | replace_string(<p>You can follow us on): <p style="display:none;"> | ||
8 | strip_id_or_class: likeThisPost | ||
9 | |||
10 | prune: no | ||
11 | tidy: no | ||
12 | |||
13 | test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt b/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt new file mode 100644 index 00000000..4f195a06 --- /dev/null +++ b/inc/3rdparty/site_config/standard/redtape.msnbc.msn.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | # Think there might be something up with your parser that it strips out 'print' from the title :) | ||
2 | |||
3 | title: //meta[@name='title']/@content | ||
4 | author: //meta[@name='author']/@content | ||
5 | date: //meta[@name='date']/@content | ||
6 | |||
7 | body: //div[@class='articleText'] | ||
8 | |||
9 | strip: //div[contains(@class, 'day')] | ||
10 | strip: //div[contains(@class, 'month')] | ||
11 | strip: //div[contains(@class, 'year')] | ||
12 | strip: //div[contains(@class, 'time')] | ||
13 | strip: //h1[@class='gl_headline'] | ||
14 | strip: //div[@class='byline'] | ||
15 | strip: //div[@id='left_ear'] | ||
16 | strip: //div[@id='right_ear'] | ||
17 | strip: //div[contains(@class, 'PopularPosts')] | ||
18 | strip: //div[@class='discuss_page_break'] | ||
19 | strip: //div[contains(@class, 'p-content_TagList')] | ||
20 | test_url: http://redtape.msnbc.msn.com/_news/2011/09/28/8020661-sprint-raises-fee-but-wont-free-users-from-two-year-contracts?preview=true \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/reflets.info.txt b/inc/3rdparty/site_config/standard/reflets.info.txt new file mode 100644 index 00000000..4a9fab67 --- /dev/null +++ b/inc/3rdparty/site_config/standard/reflets.info.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body://div[@class='storycontent'] | ||
2 | date://div[@class='date'] | ||
3 | strip://li[@class='sharing_label'] | ||
4 | strip://a[@class='FlattrButton'] | ||
5 | test_url: http://reflets.info/orange-nokia-siemens-deep-packet-inspection/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/renenekuda.cz.txt b/inc/3rdparty/site_config/standard/renenekuda.cz.txt new file mode 100644 index 00000000..0b3dee1d --- /dev/null +++ b/inc/3rdparty/site_config/standard/renenekuda.cz.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //*[@class='entry-title'] | ||
2 | body: //div[@class='entry-content'] | ||
3 | test_url: http://www.renenekuda.cz/recept-na-produktivitu/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/retrieverweekly.com.txt b/inc/3rdparty/site_config/standard/retrieverweekly.com.txt new file mode 100644 index 00000000..1264ee3f --- /dev/null +++ b/inc/3rdparty/site_config/standard/retrieverweekly.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | single_page_link://a[contains(@href, 'print')] | ||
2 | |||
3 | # Grab metadata from the "printer-friendly" page, after specifying single_page_link | ||
4 | title://h2 | ||
5 | date://cite | ||
6 | test_url: http://www.retrieverweekly.com/?cmd=displaystory&story_id=7548&format=html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/reuters.com.txt b/inc/3rdparty/site_config/standard/reuters.com.txt new file mode 100644 index 00000000..c5c94a4f --- /dev/null +++ b/inc/3rdparty/site_config/standard/reuters.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1[@class='headline3'] | ||
2 | author: substring-after(//p[@class="byline"], 'By ') | ||
3 | date: //meta[@name="REVISION_DATE"]/@content | ||
4 | body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation'] | ||
5 | strip: //li[@class='next'] | ||
6 | strip: //span[@class='articleLocation'] | ||
7 | prune: no | ||
8 | tidy: no | ||
9 | |||
10 | test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt b/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt new file mode 100644 index 00000000..dbe42932 --- /dev/null +++ b/inc/3rdparty/site_config/standard/revistapiaui.estadao.com.br.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@class="article_header"]/h3 | ||
2 | author: //div[@class="autor"]/p/* | ||
3 | date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ") | ||
4 | |||
5 | move_into(//div[@class="new_article"]): //div[@class="img_article"]/img | ||
6 | |||
7 | body: //div[@class="article_content"] | ||
8 | convert_double_br_tags: yes | ||
9 | |||
10 | test_url: http://revistapiaui.estadao.com.br/edicao-68/questoes-latino-americanas/filhos-da-guerra-suja \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt b/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt new file mode 100644 index 00000000..904a11dd --- /dev/null +++ b/inc/3rdparty/site_config/standard/richardmuscat.wordpress.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id="post"] | ||
2 | strip: //div[@id="author-description"] | ||
3 | date: //span[@class="entry-date"] | ||
4 | author: //span[@class="author vcard"] | ||
5 | test_url: http://richardmuscat.wordpress.com/2011/06/20/the-price-of-free/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+TheBrooksReview+%28The+Brooks+Review%29 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt b/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt new file mode 100644 index 00000000..82cfaf27 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ritemail.blogspot.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='post-body entry-content'] | ||
2 | strip: //div[@id='lws_0'] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://ritemail.blogspot.com/2011/06/hayden-panettiere-candids-in-los.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt b/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt new file mode 100644 index 00000000..3035527c --- /dev/null +++ b/inc/3rdparty/site_config/standard/rockpapershotgun.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h2 | ||
2 | |||
3 | strip: //div[ contains(@class, 'respond') ] | //h2 | //h1 | ||
4 | |||
5 | date: substring-after(//p[@class='info'], ' on ') | ||
6 | |||
7 | author: //p[@class='info']//a | ||
8 | test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt b/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt new file mode 100644 index 00000000..abe70351 --- /dev/null +++ b/inc/3rdparty/site_config/standard/rodrigo.sharpcube.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | author: //article/header/span[@class='author'] | ||
2 | title://article/header/h1 | ||
3 | body: //article | ||
4 | strip: //article/header | ||
5 | strip: //article/p[@class='metadata'] | ||
6 | footnotes: yes | ||
7 | test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rogerebert.com.txt b/inc/3rdparty/site_config/standard/rogerebert.com.txt new file mode 100644 index 00000000..26792330 --- /dev/null +++ b/inc/3rdparty/site_config/standard/rogerebert.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: substring-before(//title,':') | ||
2 | author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY') | ||
3 | |||
4 | body: //div[@class='text'] | ||
5 | |||
6 | strip: //a[contains(@href,'printart')] | ||
7 | strip_id_or_class: enlarge_photo | ||
8 | test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt b/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt new file mode 100644 index 00000000..d618c23f --- /dev/null +++ b/inc/3rdparty/site_config/standard/rolfinjapan.nl.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[contains(@class, 'inhoud')] | ||
2 | date: //span[@class ='published'] | ||
3 | author: //span[@class ='author'] | ||
4 | strip: //div[@class = 'grid_2'] | ||
5 | strip: //div[@class = 'block-citation-text'] | ||
6 | test_url: http://www.rolfinjapan.nl/2011/06/duizend-kraanvogels/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rollingstone.com.txt b/inc/3rdparty/site_config/standard/rollingstone.com.txt new file mode 100644 index 00000000..9a10a69e --- /dev/null +++ b/inc/3rdparty/site_config/standard/rollingstone.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1 | ||
2 | author: //h3[@class="byline"]/strong | ||
3 | |||
4 | body: //div[@id='main']/h2 | //div[@id='main']//div[@class='body'] | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | single_page_link: //a[@class='print-page'] | ||
9 | |||
10 | test_url: http://www.rollingstone.com/politics/news/the-plastic-bag-wars-20110725 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rottentomatoes.com.txt b/inc/3rdparty/site_config/standard/rottentomatoes.com.txt new file mode 100644 index 00000000..b5b29fe4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/rottentomatoes.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class='movie_content_area'] | ||
2 | strip_id_or_class: tomatometer_bar_help | ||
3 | strip_id_or_class: critic-links | ||
4 | strip_id_or_class: top-critics-numbers | ||
5 | strip_id_or_class: fan_side | ||
6 | strip_id_or_class: fblike | ||
7 | strip_id_or_class: rating_widget | ||
8 | strip_id_or_class: friend_reviews | ||
9 | prune: no | ||
10 | |||
11 | test_url: http://www.rottentomatoes.com/m/thor/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/roughtype.com.txt b/inc/3rdparty/site_config/standard/roughtype.com.txt new file mode 100644 index 00000000..f2f00392 --- /dev/null +++ b/inc/3rdparty/site_config/standard/roughtype.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='content'] | ||
2 | strip: //p[@class='postmeta']/following::* | ||
3 | strip: //p[@class='postmeta'] | ||
4 | strip: //p[@align='left'] | ||
5 | test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/roy.gbiv.com.txt b/inc/3rdparty/site_config/standard/roy.gbiv.com.txt new file mode 100644 index 00000000..6ff03de8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/roy.gbiv.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_comments: no | ||
2 | test_url: http://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rpgsite.net.txt b/inc/3rdparty/site_config/standard/rpgsite.net.txt new file mode 100644 index 00000000..e7f29bbe --- /dev/null +++ b/inc/3rdparty/site_config/standard/rpgsite.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id='news-text'] | ||
2 | prune: no | ||
3 | test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy | ||
4 | test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/rubysfera.pl.txt b/inc/3rdparty/site_config/standard/rubysfera.pl.txt new file mode 100644 index 00000000..d9df7684 --- /dev/null +++ b/inc/3rdparty/site_config/standard/rubysfera.pl.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | author: //div[contains(@class, 'author_text')]/h4/text() | ||
2 | date: //li[@class='date'] | ||
3 | |||
4 | # stripping excessive tags | ||
5 | strip: //div[contains(@class, 'entry_meta')] | ||
6 | strip: //div[contains(@class, 'single_meta')] | ||
7 | strip: //br[contains(@class, 'clear')] | ||
8 | strip: //h3[contains(., 'Komentarz')] | ||
9 | test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ruhlman.com.txt b/inc/3rdparty/site_config/standard/ruhlman.com.txt new file mode 100644 index 00000000..7a21c4af --- /dev/null +++ b/inc/3rdparty/site_config/standard/ruhlman.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='entry-title'] | ||
2 | author: //span[@class='author vcard'] | ||
3 | date: //abbr[@class='published'] | ||
4 | body: //div[@class='entry-content'] | ||
5 | |||
6 | test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ruttloff.org.txt b/inc/3rdparty/site_config/standard/ruttloff.org.txt new file mode 100644 index 00000000..c036dcf8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ruttloff.org.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: //a[@class='author'] | ||
2 | tidy: no | ||
3 | test_url: http://ruttloff.org/2012/06/13/intervention \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/salon.com.txt b/inc/3rdparty/site_config/standard/salon.com.txt new file mode 100644 index 00000000..04f8afd5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/salon.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | author: (//span[@class="byline"]/a)[1] | ||
3 | date: //span[contains(@class, "toLocalTime")] | ||
4 | body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")] | ||
5 | |||
6 | prune: no | ||
7 | |||
8 | # deal with singleton links | ||
9 | single_page_link: (//h1/a[contains(@href, '/singleton')])[1] | ||
10 | |||
11 | test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/singleton/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/salzburg.com.txt b/inc/3rdparty/site_config/standard/salzburg.com.txt new file mode 100644 index 00000000..31067481 --- /dev/null +++ b/inc/3rdparty/site_config/standard/salzburg.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //p[@class='teaser1 darkgrey myriad'] | ||
2 | move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear'] | ||
3 | strip: //div[@class='hidden'] | ||
4 | strip: //div[@id='article_related_source'] | ||
5 | |||
6 | test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/saveyourself.ca.txt b/inc/3rdparty/site_config/standard/saveyourself.ca.txt new file mode 100644 index 00000000..354f5911 --- /dev/null +++ b/inc/3rdparty/site_config/standard/saveyourself.ca.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | title://h1 | ||
2 | |||
3 | # my section divs seem to interfere with the Instapaper parser, so I ditch 'em | ||
4 | dissolve://div[contains(@class, 'section')] | ||
5 | |||
6 | #these don't seem to be necessary, but just in case | ||
7 | strip_id_or_class:'masthead' | ||
8 | strip_id_or_class:'footer' | ||
9 | |||
10 | #again, Instapaper seems to understand where my content is, but just in case | ||
11 | body://div[@id='content'] | ||
12 | |||
13 | # in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing | ||
14 | strip_id_or_class:'screen-only' | ||
15 | strip_id_or_class:'no-print' | ||
16 | |||
17 | #other misc removals and simplifications | ||
18 | strip_id_or_class:'popup' | ||
19 | strip_id_or_class:'ZoomSpin' | ||
20 | |||
21 | #I have a lot of content in sidebars and "meta" asides that works inline just fine, but it has to be distinguished somehow with some minimal formatting, so I put it in blockquotes | ||
22 | wrap_in(blockquote)://div[contains(@class, 'sidebar')] | ||
23 | wrap_in(blockquote)://div[contains(@class, 'meta')] | ||
24 | wrap_in(blockquote)://p[contains(@class, 'meta')] | ||
25 | test_url: http://saveyourself.ca/tutorials/low-back-pain.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sbnation.com.txt b/inc/3rdparty/site_config/standard/sbnation.com.txt new file mode 100644 index 00000000..c213843c --- /dev/null +++ b/inc/3rdparty/site_config/standard/sbnation.com.txt | |||
@@ -0,0 +1,28 @@ | |||
1 | title: //h1[@id='stream_title'] | ||
2 | |||
3 | # Author and date don't work | ||
4 | author: //div[@class='byline'] | ||
5 | date: //div[@class='date-stamp'] | ||
6 | |||
7 | body: //div[@class='node-article'] | ||
8 | |||
9 | strip_id_or_class: fb-like-box | ||
10 | strip_id_or_class: stream-fb-like | ||
11 | strip_id_or_class: social-meta | ||
12 | strip_id_or_class: social-spoken | ||
13 | strip_id_or_class: twitter-share-button | ||
14 | strip_id_or_class: twitter-follow-button | ||
15 | strip_id_or_class: spinner_node_list | ||
16 | strip_id_or_class: node-sort-link | ||
17 | strip_id_or_class: stream_title | ||
18 | strip_id_or_class: stream_summary | ||
19 | strip_id_or_class: update-count-container | ||
20 | strip_id_or_class: major-updates | ||
21 | strip_id_or_class: newsletter-slide | ||
22 | strip_id_or_class: author-mini-profile | ||
23 | strip_id_or_class: byline | ||
24 | strip_id_or_class: header | ||
25 | strip_id_or_class: footer | ||
26 | |||
27 | # Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns | ||
28 | test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/schneier.com.txt b/inc/3rdparty/site_config/standard/schneier.com.txt new file mode 100644 index 00000000..67181b65 --- /dev/null +++ b/inc/3rdparty/site_config/standard/schneier.com.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | author: //p[@class='mastname'] | ||
2 | |||
3 | body: //div[@class='indivbody'] | ||
4 | date: //div[@class='indivbody']/h2[1] | ||
5 | |||
6 | # Remove blog title. Specify first occurrence in case h1 is used in article | ||
7 | strip: //div[@class='indivbody']/h1[1] | ||
8 | |||
9 | # Remove blog description (the first p element) | ||
10 | strip: //div[@class='indivbody']/p[1] | ||
11 | |||
12 | # Remove navigation (second p element) | ||
13 | strip: //div[@class='indivbody']/p[2] | ||
14 | |||
15 | # Remove duplicate of article title. Specify first occurrence in case h3 is used in article | ||
16 | strip: //div[@class='indivbody']/h3[1] | ||
17 | |||
18 | # Remove publishing date, it's extracted by rule above | ||
19 | strip: //div[@class='indivbody']/h2[1] | ||
20 | |||
21 | # Remove duplicate of date at end, and newsletter signup | ||
22 | strip: //p[@class='posted'] | ||
23 | |||
24 | # Leave date at top | ||
25 | test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/science.orf.at.txt b/inc/3rdparty/site_config/standard/science.orf.at.txt new file mode 100644 index 00000000..89ebfe08 --- /dev/null +++ b/inc/3rdparty/site_config/standard/science.orf.at.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class="storybox"] | ||
2 | title: //div[@class="storybox"]//h1 | ||
3 | strip: //p[@class='metaline'] | ||
4 | date: substring-after(//*[@class='time'],'Erstellt am') | ||
5 | strip: //div[@class='fact'] | ||
6 | strip: //p[@class='backlink'] | ||
7 | strip: //div[@class='mailto'] | ||
8 | strip: //div[@id='forumDisclaimer'] | ||
9 | strip: //div[@class='forum'] | ||
10 | |||
11 | test_url: http://science.orf.at/stories/1700900/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scienceblogs.de.txt b/inc/3rdparty/site_config/standard/scienceblogs.de.txt new file mode 100644 index 00000000..08c16842 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scienceblogs.de.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a | ||
2 | |||
3 | author: //div[@class='details clear']//a[@class='hi'] | ||
4 | body: //div[@class='title'] | ||
5 | strip: //p[@class='entrypagination'] | ||
6 | strip: //p[@class='details_top'] | ||
7 | date: //p[@class='details_top'] | ||
8 | title: //div[@class='title']/h1 | ||
9 | strip: //p[@class='details'] | ||
10 | strip: //p[@class='details_bottom'] | ||
11 | |||
12 | test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scienceticker.info.txt b/inc/3rdparty/site_config/standard/scienceticker.info.txt new file mode 100644 index 00000000..75a52824 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scienceticker.info.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body: //div[@class='post'] | ||
2 | title: //h1[@id='singlePageTitle'] | ||
3 | date: substring-before(//small,'• Rubrik') | ||
4 | |||
5 | strip: //div[@class='post-ratings'] | ||
6 | strip: //div[@class='post-ratings-loading'] | ||
7 | strip: //a[@title='Empfehlen Sie den Text weiter!'] | ||
8 | strip: //a[@title='Drucken'] | ||
9 | strip: //div[@class='share'] | ||
10 | |||
11 | test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scientificamerican.com.txt b/inc/3rdparty/site_config/standard/scientificamerican.com.txt new file mode 100644 index 00000000..d510407d --- /dev/null +++ b/inc/3rdparty/site_config/standard/scientificamerican.com.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | # | ||
2 | # After site revisions at SciAm, this configuration does | ||
3 | # not work, especially for multi-page articles. For | ||
4 | # every article there is now a "Print" link which | ||
5 | # is far more reliable. So this configuration should be | ||
6 | # removed or disabled. | ||
7 | # 2/3/13 | ||
8 | # | ||
9 | |||
10 | # meta data | ||
11 | title://h1[@class = 'articleTitle'] | ||
12 | author:substring-after(//span[@class = 'byline'],'By ') | ||
13 | date:substring-before(//span[@class = 'datestamp'],'|') | ||
14 | |||
15 | #body content | ||
16 | body://div[@id = 'articleContent'] | ||
17 | #next_page_link://li[@id = 'flairPagination']/a[last()] | ||
18 | |||
19 | single_page_link: //a[contains(@href, 'print=true')] | ||
20 | |||
21 | #cleanup | ||
22 | strip://div[@class = 'fsgBooks'] | ||
23 | |||
24 | test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state | ||
25 | test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scotusblog.com.txt b/inc/3rdparty/site_config/standard/scotusblog.com.txt new file mode 100644 index 00000000..f29e37f9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scotusblog.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //title | ||
2 | author: //p[@id='author-name-role']/a | ||
3 | date: substring-after(//p[@class='time'],'Posted') | ||
4 | body: //div[@id='main'] | ||
5 | strip: //div[@id='author-info'] | ||
6 | strip: //div[@id='author-links'] | ||
7 | strip: //h1 | ||
8 | test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scraplab.net.txt b/inc/3rdparty/site_config/standard/scraplab.net.txt new file mode 100644 index 00000000..84be27f9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scraplab.net.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h2 | ||
2 | body: //div[@class='body'] | ||
3 | test_url: http://scraplab.net/2010/10/26/please-keep-your-belongings-with-you-at-all-times/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/scripting.com.txt b/inc/3rdparty/site_config/standard/scripting.com.txt new file mode 100644 index 00000000..d8b969b1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/scripting.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | strip: //a[starts-with(@href, '#')] | ||
2 | strip: //*[@class='storyByline'] | ||
3 | body: //*[@class='storyPageText']/.. | ||
4 | author: string('Dave Winer') | ||
5 | date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at') | ||
6 | title: //h1 | ||
7 | footnotes: no | ||
8 | test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sct.temple.edu.txt b/inc/3rdparty/site_config/standard/sct.temple.edu.txt new file mode 100644 index 00000000..9927675b --- /dev/null +++ b/inc/3rdparty/site_config/standard/sct.temple.edu.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //*[@class="entry-content"] | ||
2 | title: //h1[@class="entry-title"] | ||
3 | date: //*[@class="entry-date"] | ||
4 | author: //*[@class="author vcard"] | ||
5 | test_url: http://sct.temple.edu/blogs/news-events/2011/05/congratulations-sct-class-of-2011/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/searchengineland.com.txt b/inc/3rdparty/site_config/standard/searchengineland.com.txt new file mode 100644 index 00000000..f176d7c7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/searchengineland.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | body: //div[@class="storyBox"] | ||
2 | title: //div[@class="storyBox"]/h1 | ||
3 | author: //a[@rel="author"] | ||
4 | date: substring-before(//span[@class="dateline"], 'by') | ||
5 | |||
6 | #Removes related content but cleans up article text | ||
7 | strip: //h1 | ||
8 | strip: //p[@class="homeStory tdmSideInfo"] | ||
9 | strip: //div[@id="bylineShare"] | ||
10 | strip: //script | ||
11 | strip: //hr | ||
12 | |||
13 | strip_id_or_class: homeStory | ||
14 | strip_id_or_class: authorpic | ||
15 | strip_id_or_class: insideComments | ||
16 | strip_id_or_class: authorbio | ||
17 | strip_id_or_class: gpt-ad-sel-cube | ||
18 | strip_id_or_class: smxTextAd | ||
19 | |||
20 | test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/seattletransitblog.com.txt b/inc/3rdparty/site_config/standard/seattletransitblog.com.txt new file mode 100644 index 00000000..5129c069 --- /dev/null +++ b/inc/3rdparty/site_config/standard/seattletransitblog.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h3[@class="storytitle"] | ||
2 | date: //div[@class='meta'] | ||
3 | body: //div[@class='storycontent'] | ||
4 | |||
5 | test_url: http://seattletransitblog.com/2012/06/19/times-st-louis-interested-in-buying-waterfront-streetcars/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sebbo.net.txt b/inc/3rdparty/site_config/standard/sebbo.net.txt new file mode 100644 index 00000000..3e800a16 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sebbo.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: substring-before(//title, '«') | ||
2 | body: //div[@class = 'entry'] | ||
3 | strip_id_or_class: 'postmetabox' | ||
4 | test_url: http://sebbo.net/2010/12/akkus/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/seriouseats.com.txt b/inc/3rdparty/site_config/standard/seriouseats.com.txt new file mode 100644 index 00000000..d7b4788c --- /dev/null +++ b/inc/3rdparty/site_config/standard/seriouseats.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body: //div[@id='content'] | ||
2 | |||
3 | # clean up recipe pages | ||
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | ||
5 | |||
6 | #recipe pages | ||
7 | strip_id_or_class: "recipe-feedback" | ||
8 | strip_id_or_class: "comments" | ||
9 | strip_id_or_class: "procedure-number" | ||
10 | strip_id_or_class: "more-with-author" | ||
11 | |||
12 | #slice | ||
13 | strip_id_or_class: "inner" | ||
14 | |||
15 | test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sf.curbed.com.txt b/inc/3rdparty/site_config/standard/sf.curbed.com.txt new file mode 100644 index 00000000..9f443d5c --- /dev/null +++ b/inc/3rdparty/site_config/standard/sf.curbed.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@class='post-title'] | ||
2 | author: //div[@class='post-byline']/a | ||
3 | date: substring-before(//div[@class='post-byline'], ', by') | ||
4 | |||
5 | body: //div[@class='post-body'] | ||
6 | dissolve: //noscript | ||
7 | test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sf.eater.com.txt b/inc/3rdparty/site_config/standard/sf.eater.com.txt new file mode 100644 index 00000000..fca656d2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sf.eater.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1[@class="post-title"] | ||
2 | author: //div[@class="post-byline"]/a | ||
3 | date: substring-before(//div[@class='post-byline'], ', by') | ||
4 | |||
5 | body: //div[@class='post-body'] | ||
6 | strip_id_or_class: post-kicker | ||
7 | test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sfgate.com.txt b/inc/3rdparty/site_config/standard/sfgate.com.txt new file mode 100644 index 00000000..5f73fbcb --- /dev/null +++ b/inc/3rdparty/site_config/standard/sfgate.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: /html/head/title | ||
2 | |||
3 | body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')] | ||
4 | author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn'] | ||
5 | date: //div[@class = 'articleheadings']/span[@class = 'updated'] | ||
6 | strip: //div[div[contains(@class, 'imgbox')]] | ||
7 | |||
8 | body: //div[@class = 'blogitem'] | ||
9 | author: //p[@class="credit"]/span[@class="author"]/a[position() = 1] | ||
10 | date: //span[@class = 'pubdate'] | ||
11 | |||
12 | test_url: http://www.sfgate.com/columnists/garchik/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sfweekly.com.txt b/inc/3rdparty/site_config/standard/sfweekly.com.txt new file mode 100644 index 00000000..a11fe4cb --- /dev/null +++ b/inc/3rdparty/site_config/standard/sfweekly.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[contains(@class, 'content_body')] | ||
2 | strip_id_or_class: det_rel | ||
3 | test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/shabayek.com.txt b/inc/3rdparty/site_config/standard/shabayek.com.txt new file mode 100644 index 00000000..b175720e --- /dev/null +++ b/inc/3rdparty/site_config/standard/shabayek.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | date: //span[@class='date'] | ||
2 | body: //div[@class='post_content'] | ||
3 | test_url: http://www.shabayek.com/blog/2011/10/16/%D8%AF%D8%B1%D9%88%D8%B3-%D9%85%D9%86-%D9%82%D8%B5%D8%A9-%D8%AA%D8%A3%D8%B3%D9%8A%D8%B3-%D8%AA%D9%88%D9%8A%D8%AA%D8%B1-%E2%80%93%D8%AC3/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/shawnblanc.net.txt b/inc/3rdparty/site_config/standard/shawnblanc.net.txt new file mode 100644 index 00000000..b536fc3a --- /dev/null +++ b/inc/3rdparty/site_config/standard/shawnblanc.net.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title://*[@class='primary']/h1 | ||
2 | date: //*[@class='articledate'] | ||
3 | author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.') | ||
4 | body: //div[@class='primary'] | ||
5 | footnotes: yes | ||
6 | strip: //*[@class='primary']/h1 | ||
7 | strip: //*[@class='articledate'] | ||
8 | strip: //*[@class='detailsarticle'] | ||
9 | strip: //*[@class='endnav'] | ||
10 | strip: //*[@class='endmeta'] | ||
11 | test_url: http://shawnblanc.net/2011/11/kindle-touch-review/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/shifteleven.com.txt b/inc/3rdparty/site_config/standard/shifteleven.com.txt new file mode 100644 index 00000000..68059ae1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/shifteleven.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[ @class='entry-content' ] | ||
2 | |||
3 | strip: //div[ contains(@class, 'sharing') ] | ||
4 | |||
5 | date: //div[ @class='entry-meta' ]/a | ||
6 | test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/siasat.pk.txt b/inc/3rdparty/site_config/standard/siasat.pk.txt new file mode 100644 index 00000000..a82ce69c --- /dev/null +++ b/inc/3rdparty/site_config/standard/siasat.pk.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | ||
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | ||
3 | body: (//div[starts-with(@id, 'post_message')])[1] | ||
4 | |||
5 | prune: no | ||
6 | tidy: no | ||
7 | |||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | ||
9 | #replace_string(</iframe>): </iframe> </div> | ||
10 | |||
11 | test_url: http://www.siasat.pk/forum/showthread.php?107668-Policy-Matters-17th-March-2012-Dr-Shahid-Masood-Gen-Hameed-gul-amp-Fawad-Chudhary-Pak-US-Relationship&p=787733 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/simonwillison.net.txt b/inc/3rdparty/site_config/standard/simonwillison.net.txt new file mode 100644 index 00000000..e3ad6e41 --- /dev/null +++ b/inc/3rdparty/site_config/standard/simonwillison.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[contains(@class, "entry")] | ||
2 | |||
3 | date: //div[contains(@class, "entryFooter")]/a | ||
4 | |||
5 | test_url: http://simonwillison.net/2009/Oct/22/redis/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt b/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt new file mode 100644 index 00000000..a1b6b673 --- /dev/null +++ b/inc/3rdparty/site_config/standard/singaporeanstocksinvestor.blogspot.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@class='post-body'] | ||
2 | strip: //div[@id='lws_0'] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/singularityhub.com.txt b/inc/3rdparty/site_config/standard/singularityhub.com.txt new file mode 100644 index 00000000..3999d4d4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/singularityhub.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body://div[contains(@class,"entry-content")] | ||
2 | test_url: http://singularityhub.com/2011/05/21/google-invades-your-home-android-phones-control-your-appliances-and-accessories-video/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sintagoulis.gr.txt b/inc/3rdparty/site_config/standard/sintagoulis.gr.txt new file mode 100644 index 00000000..822bbeb0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sintagoulis.gr.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@class='headline']//h2 | ||
2 | body: //div[contains(@class, 'storycontent')] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | test_url: http://sintagoulis.gr/sokolatenia/sokolatenia-mpompa-me-amaretti- \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/slashfilm.com.txt b/inc/3rdparty/site_config/standard/slashfilm.com.txt new file mode 100644 index 00000000..78d38ecf --- /dev/null +++ b/inc/3rdparty/site_config/standard/slashfilm.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: substring-before(//title,'| /Film') | ||
2 | date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by') | ||
3 | strip: //div[@class='pm-left'] | ||
4 | strip: //div[@class='pm-right'] | ||
5 | strip: //h2/span | ||
6 | next_page_link: //h2/strong/a | ||
7 | strip: //h2/strong/a | ||
8 | strip: //p[contains(text(),'we have to split this post over')] | ||
9 | strip: //p[@class='post-info'] | ||
10 | strip: //h1/a | ||
11 | strip: //img[contains(@src,'siteimages/authors')] | ||
12 | strip: //div[@id='header'] | ||
13 | strip: //div[@class='topad-right'] | ||
14 | strip: //strong[contains(text(),'Cool Posts From Around the Web:')] | ||
15 | test_url: http://www.slashfilm.com/superhero-bits-206/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/slate.com.txt b/inc/3rdparty/site_config/standard/slate.com.txt new file mode 100644 index 00000000..e92f6a06 --- /dev/null +++ b/inc/3rdparty/site_config/standard/slate.com.txt | |||
@@ -0,0 +1,19 @@ | |||
1 | title: //h1[@class="sl-art-head-dek"] | ||
2 | body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')] | ||
3 | strip: //div[@class="department_kicker"] | ||
4 | strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"] | ||
5 | strip: //div[@id="bottom_sponsored_links"] | ||
6 | strip: //div[@class="sl-art-ad-midflex"] | ||
7 | #strip: //dl | ||
8 | #strip: //p[em/a[contains(@href, 'facebook.com')]] | ||
9 | prune: no | ||
10 | |||
11 | author: //div[@id='author_bio']//a[contains(@href, '/author/')] | ||
12 | author: //a[contains(@href, '/authors.')] | ||
13 | |||
14 | date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ') | ||
15 | |||
16 | single_page_link: //a[@class='sl-art-sinpage'] | ||
17 | |||
18 | test_url: http://www.slate.com/id/2274583/pagenum/all/ | ||
19 | test_url: http://www.slate.com/id/2293116/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt b/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt new file mode 100644 index 00000000..1a902b96 --- /dev/null +++ b/inc/3rdparty/site_config/standard/slice.seriouseats.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body: //div[@id='content'] | ||
2 | |||
3 | # clean up recipe pages | ||
4 | strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3'] | ||
5 | |||
6 | #recipe pages | ||
7 | strip_id_or_class: "recipe-feedback" | ||
8 | strip_id_or_class: "comments" | ||
9 | strip_id_or_class: "procedure-number" | ||
10 | strip_id_or_class: "more-with-author" | ||
11 | |||
12 | #slice | ||
13 | strip_id_or_class: "inner" | ||
14 | |||
15 | test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/slog.thestranger.com.txt b/inc/3rdparty/site_config/standard/slog.thestranger.com.txt new file mode 100644 index 00000000..daa5e31b --- /dev/null +++ b/inc/3rdparty/site_config/standard/slog.thestranger.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | strip_id_or_class: postCategory | ||
2 | title: //h3[@class='postTitle'] | ||
3 | body: //div[@class='postBody'] | ||
4 | test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/smartinvestor.de.txt b/inc/3rdparty/site_config/standard/smartinvestor.de.txt new file mode 100644 index 00000000..ec6c55c8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/smartinvestor.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //td[@class='hweissblau2'] | ||
2 | body: //p[@class='copy'] | //div[@class='Section1'] | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.smartinvestor.de/news/smartinvestor/detail.hbs?itemid=item949496655&recnr=14593 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sme.sk.txt b/inc/3rdparty/site_config/standard/sme.sk.txt new file mode 100644 index 00000000..c3d01ffb --- /dev/null +++ b/inc/3rdparty/site_config/standard/sme.sk.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //meta[@property='og:title']/@content | ||
2 | date: //p[@class='autor_line']/b/text() | ||
3 | test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/smithsonianmag.com.txt b/inc/3rdparty/site_config/standard/smithsonianmag.com.txt new file mode 100644 index 00000000..10a3f717 --- /dev/null +++ b/inc/3rdparty/site_config/standard/smithsonianmag.com.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | # meta data | ||
2 | title://h1[@id = 'articleTitle'] | ||
3 | author:substring-after(//ul[@id = 'byLine']/li[1],'By ') | ||
4 | date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',') | ||
5 | body://div[@id = 'article-body'] | ||
6 | |||
7 | # full content | ||
8 | single_page_link://td/li[@class = 'article-singlepage']/a | ||
9 | |||
10 | # caption clean up | ||
11 | wrap_in(i)://span[@class='articleImageCaptionwide'] | ||
12 | move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p | ||
13 | |||
14 | |||
15 | # clean up | ||
16 | strip://p[@id = 'articlePaginationWrapper'] | ||
17 | strip://ul[contains(@class, 'cat-breadcrumb')] | ||
18 | strip://div [@class= 'viewMorePhotos'] | ||
19 | |||
20 | test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/smokingapples.com.txt b/inc/3rdparty/site_config/standard/smokingapples.com.txt new file mode 100644 index 00000000..e22af7a9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/smokingapples.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h2[@class='custom-entry-title'] | ||
2 | author: substring-after(//span[@class='author vcard'],'by ') | ||
3 | date: substring-after(//span[@class='publ'],'Published on ') | ||
4 | body: //div[@class='postentry-content'] | ||
5 | test_url: http://smokingapples.com/software/popclip-for-mac/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sourcebooks.com.txt b/inc/3rdparty/site_config/standard/sourcebooks.com.txt new file mode 100644 index 00000000..668fc44a --- /dev/null +++ b/inc/3rdparty/site_config/standard/sourcebooks.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | #grab the actual content div | ||
2 | body: //div[@class='rt-article'] | ||
3 | |||
4 | test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/spectator.co.uk.txt b/inc/3rdparty/site_config/standard/spectator.co.uk.txt new file mode 100644 index 00000000..a05c8395 --- /dev/null +++ b/inc/3rdparty/site_config/standard/spectator.co.uk.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text() | ||
2 | |||
3 | body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body'] | ||
4 | |||
5 | # Not very helpful: the title and author are contained by the same element that contains the body | ||
6 | strip: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/h2 | /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link'] | ||
7 | test_url: http://www.spectator.co.uk/arts-and-culture/night-and-day/7449683/spotify-sunday-my-personal-soundtrack.thtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt b/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt new file mode 100644 index 00000000..4b0704a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/spectrum.ieee.org.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body://div[@class="articleBody"] | ||
2 | author://p[@class="articleBodyTtl"] | ||
3 | test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/speirs.org.txt b/inc/3rdparty/site_config/standard/speirs.org.txt new file mode 100644 index 00000000..3bf859e3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/speirs.org.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body://div[@class="body"] | ||
2 | test_url: http://speirs.org/blog/2011/5/5/ipad-trials-at-oklahoma-state.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/spiegel.de.txt b/inc/3rdparty/site_config/standard/spiegel.de.txt new file mode 100644 index 00000000..390c075c --- /dev/null +++ b/inc/3rdparty/site_config/standard/spiegel.de.txt | |||
@@ -0,0 +1,75 @@ | |||
1 | # A. Niepel, narya.de@... | ||
2 | # - added single_page_link | ||
3 | # - added author for default and single page view | ||
4 | # - added date for single page view | ||
5 | # fforst@... | ||
6 | # - Fixed it | ||
7 | # bode2104@... | ||
8 | # - Fixed single_page_link | ||
9 | # - Included intro text in single page view | ||
10 | # - Added body in default view | ||
11 | |||
12 | # set body | ||
13 | tidy: no | ||
14 | # body in single page view | ||
15 | body: //div[@id="spArticleContent"] | ||
16 | # body in default view | ||
17 | body: //div[@id="spArticleSection"] | ||
18 | # body in "Fotostrecke" | ||
19 | body: //div[@id="spBigaContent"] | ||
20 | |||
21 | # set date in single page view | ||
22 | date: //div[@id="spArticleContent"]/h3 | ||
23 | # strip date | ||
24 | strip: //div[@id="spArticleContent"]/h3 | ||
25 | # set date in "Fotostrecke" | ||
26 | date: //div[@id="spBigaDatum"] | ||
27 | |||
28 | #set title in single page view | ||
29 | title: //div[@id='spArticleContent']/h2 | ||
30 | # strip title | ||
31 | strip: //div[@id='spArticleContent']/h1 | ||
32 | strip: //div[@id='spArticleContent']/h2 | ||
33 | #set title in "Fotostrecke" | ||
34 | title: //div[@class='spBigaHeadline'] | ||
35 | |||
36 | # set author | ||
37 | author: //p[@class="spAuthor"]/a | ||
38 | author: substring-after(//p[@class="spAuthor"], 'Von ') | ||
39 | # strip author | ||
40 | strip: //p[@class='spAuthor'] | ||
41 | |||
42 | # remove captions | ||
43 | strip: //*/span[@class='spPicLayerText'] | ||
44 | strip: //*/div[@class='spPanoPlayerPaneControl'] | ||
45 | strip: //*/div[@class='spCredit'] | ||
46 | strip: //*/div[@class='spCredit']/following-sibling::p | ||
47 | |||
48 | # remove ads | ||
49 | strip: //div[@class='spMInline'] | ||
50 | |||
51 | # remove photogalleries and extras | ||
52 | strip: //div[@class='spPhotoGallery'] | ||
53 | strip: //div[@class='spPhotoGallery']/following-sibling::br | ||
54 | strip: //div[@class='spAssetAlignleft'] | ||
55 | strip: //div[contains(@class,'spAsset')] | ||
56 | strip: //br[@clear='all'] | ||
57 | |||
58 | # remove community functions | ||
59 | strip: //div[@id='spSocialBookmark'] | ||
60 | strip: //div[contains(@class, 'spCommunityBox')] | ||
61 | strip: //div[contains(@class, 'spArticleNewsfeedBox')] | ||
62 | strip: //div[@class='spArticleCredit'] | ||
63 | |||
64 | # remove clutter in "Fotostrecke" | ||
65 | strip: //div[@id='spBreadcrumb'] | ||
66 | strip: //div[@id='spBigaLatestEntries'] | ||
67 | strip: //div[contains(@class, 'spBigaNavi')] | ||
68 | strip: //div[@class='spDottedLine'] | ||
69 | |||
70 | # Use link to print article for single page view | ||
71 | single_page_link: //a[contains(@href, '-druck')] | ||
72 | |||
73 | # use next link in "Fotostrecke" | ||
74 | next_page_link: //a[@class='spBigaControlForw'] | ||
75 | test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/spin.com.txt b/inc/3rdparty/site_config/standard/spin.com.txt new file mode 100644 index 00000000..66f6192b --- /dev/null +++ b/inc/3rdparty/site_config/standard/spin.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | tidy: no | ||
2 | body: //section[contains(@class, 'main')] | ||
3 | strip: //footer | ||
4 | strip: //a[@class='paginated'] | ||
5 | test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/splatf.com.txt b/inc/3rdparty/site_config/standard/splatf.com.txt new file mode 100644 index 00000000..d5671652 --- /dev/null +++ b/inc/3rdparty/site_config/standard/splatf.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author:string('Dan Frommer/SplatF') | ||
2 | date://div[@class='postdate'] | ||
3 | body://div[@class='entry'] | ||
4 | title://div[@class='post']/h1 | ||
5 | test_url: http://www.splatf.com/2012/02/month-six/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/splitsider.com.txt b/inc/3rdparty/site_config/standard/splitsider.com.txt new file mode 100644 index 00000000..d1d392e7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/splitsider.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //div[@class='byline']/a | ||
2 | date: //div[@id='date'] | ||
3 | body: //div[@class='entry'] | ||
4 | test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sport.detik.com.txt b/inc/3rdparty/site_config/standard/sport.detik.com.txt new file mode 100644 index 00000000..b404b829 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sport.detik.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title://div[@class="content_detail"]/h1 | ||
2 | |||
3 | author://div[@class="author"]/strong | ||
4 | |||
5 | date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB') | ||
6 | |||
7 | body://div[@class='text_detail'] | ||
8 | test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sport.orf.at.txt b/inc/3rdparty/site_config/standard/sport.orf.at.txt new file mode 100644 index 00000000..a794ded9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sport.orf.at.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | single_page_link: //div[@id='content']//p[@class='readMore']/a | ||
2 | |||
3 | title: //div[@class='hidden offscreen']/h2 | ||
4 | body: //div[@id="storyText"] | ||
5 | move_into(//div[@id='storyText']): //div[@class='fact'] | ||
6 | strip: //small[@class='credit'] | ||
7 | strip: //small[@class='caption'] | ||
8 | date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am') | ||
9 | strip: //p[@class='toplink'] | ||
10 | |||
11 | test_url: http://sport.orf.at/stories/2084851/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sports.espn.go.com.txt b/inc/3rdparty/site_config/standard/sports.espn.go.com.txt new file mode 100644 index 00000000..e0f8223c --- /dev/null +++ b/inc/3rdparty/site_config/standard/sports.espn.go.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //div[@class='headline'] | //div[@class='mod-header']/h3 | ||
2 | body: //div[contains(@class, 'article')] | ||
3 | strip: //div[contains(@class, 'mod-inline')] | ||
4 | strip: //*/span[@class='page-actions']/a | ||
5 | strip: //*/span[@class='page-actions']/a | ||
6 | strip: //div[@class='page-actions']/* | ||
7 | strip: //div[@class='headline'] | //div[@class='mod-header']/h3 | ||
8 | strip: //div[@class='mod-blog-navigation'] | ||
9 | strip: //div[@class='monthday'] | ||
10 | strip: //div[@class='time'] | ||
11 | strip: //div[@class='timeofday'] | ||
12 | test_url: http://sports.espn.go.com/espn/page2/story?page=simmonsnfl2010/lebron_james_return_clevelend&sportCat=nba \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sports.yahoo.com.txt b/inc/3rdparty/site_config/standard/sports.yahoo.com.txt new file mode 100644 index 00000000..96a3bb71 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sports.yahoo.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //div[@id='article']/div[@class='hd']/h1 | ||
2 | body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0'] | ||
3 | strip: //div[@class='foot'] | ||
4 | strip: //div[@id='sidebar']//div[@class='ft'] | ||
5 | strip: //p[@id='byline']//em | ||
6 | tidy: no | ||
7 | prune: no | ||
8 | |||
9 | test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sportschau.de.txt b/inc/3rdparty/site_config/standard/sportschau.de.txt new file mode 100644 index 00000000..6500e75c --- /dev/null +++ b/inc/3rdparty/site_config/standard/sportschau.de.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title://div[@id='ardContent']/h1 | ||
2 | |||
3 | author://p[@id='ardAutor'] | ||
4 | author://span[@id='ardQuelle'] | ||
5 | author:string('sportschau.de') | ||
6 | |||
7 | date:substring-after(//span[@id='ardStand'], 'Stand: ') | ||
8 | |||
9 | body://div[@id='ardContent'] | ||
10 | |||
11 | strip://div[@id='ardContent']/h1 | ||
12 | strip://p[@id='ardAutor'] | ||
13 | strip: //div[@class='embeddedPlayer_clipinfo'] | ||
14 | strip: //div[@class='ardMehrZumThemaRechts'] | ||
15 | strip: //*[contains(@class, 'inv')] | ||
16 | |||
17 | strip: //p[@id='ardAbbinder'] | ||
18 | strip: //div[@class='socialBookmarks'] | ||
19 | strip: //div[@id='ardContentEnd'] | ||
20 | strip: //div[@id='ardDisclaimer'] | ||
21 | strip: //div[@id='ardRechteSpalte'] | ||
22 | test_url: http://www.sportschau.de/sp/fussball/news201203/17/analyse_leverkusen_gladbach.jsp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt b/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt new file mode 100644 index 00000000..afc5879f --- /dev/null +++ b/inc/3rdparty/site_config/standard/sportsillustrated.cnn.com.txt | |||
@@ -0,0 +1,26 @@ | |||
1 | # main sportsillustrated.com articles | ||
2 | # | ||
3 | body: //div[@id="cnnStoryContent"] | ||
4 | title: //div[@id="cnnStoryHeadline"]//h1 | ||
5 | author: //div[@id="cnnSubBanner"]//strong | ||
6 | date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ") | ||
7 | date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ") | ||
8 | |||
9 | # kill ugly font buttons | ||
10 | strip: //div[@id="cnnSCFontButtons"] | ||
11 | |||
12 | # kill misc filler videos & etc | ||
13 | strip: //div[@class="cnnDivideContent"] | ||
14 | strip: //*[@class="cnnTMbox"] | ||
15 | |||
16 | # si vault articles | ||
17 | # ------------- | ||
18 | body: //div[@class="siv_artPara"] | ||
19 | title: //div[@class="siv_artHeader"]//h1 | ||
20 | author: //div[@class="byline"] | ||
21 | date: //div[@class="date"] | ||
22 | |||
23 | next_page_link: //div[@id='cnnStoryContinue']/a | ||
24 | strip_id_or_class: cnnstorypagination | ||
25 | |||
26 | test_url: http://sportsillustrated.cnn.com/2012/writers/peter_king/02/27/combine/index.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sprengsatz.de.txt b/inc/3rdparty/site_config/standard/sprengsatz.de.txt new file mode 100644 index 00000000..16636bc5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sprengsatz.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //h2 | ||
2 | author: string('Michael Spreng') | ||
3 | date: //div[@class='date'] | ||
4 | body: //div[@class='entry'] | ||
5 | test_url: http://www.sprengsatz.de/?p=3691 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sqlite.org.txt b/inc/3rdparty/site_config/standard/sqlite.org.txt new file mode 100644 index 00000000..4872519a --- /dev/null +++ b/inc/3rdparty/site_config/standard/sqlite.org.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@id='ff-body'] | ||
2 | |||
3 | replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center> | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | test_url: http://www.sqlite.org/fileformat2.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt b/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt new file mode 100644 index 00000000..388209a9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/squashed.tumblr.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class='content'] | ||
2 | date: substring-before( //div[@class='unit dateAndNotes'], 'with') | ||
3 | title: //h3 | ||
4 | test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stackoverflow.com.txt b/inc/3rdparty/site_config/standard/stackoverflow.com.txt new file mode 100644 index 00000000..e5317bac --- /dev/null +++ b/inc/3rdparty/site_config/standard/stackoverflow.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2 | ||
2 | |||
3 | replace_string(<div class="user-details"><br></div>): <!-- nothing --> | ||
4 | replace_string(<div class="vote">): <div class="vote"><h3>Vote count: | ||
5 | |||
6 | strip_id_or_class: vote-up | ||
7 | strip_id_or_class: vote-down | ||
8 | strip_id_or_class: star-off | ||
9 | strip_id_or_class: favoritecount | ||
10 | strip_id_or_class: -share | ||
11 | strip_id_or_class: badgecount | ||
12 | |||
13 | |||
14 | test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt b/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt new file mode 100644 index 00000000..bde14217 --- /dev/null +++ b/inc/3rdparty/site_config/standard/stalbansreview.co.uk.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //div[@class='articleLeft']/h3 | ||
2 | |||
3 | author: substring-after(//span[@class='articleAuthor']/a,'By ') | ||
4 | |||
5 | date: substring-before(//span[@class='articleDateTime'],'in ') | ||
6 | |||
7 | body: //div[@class='articleLeft'] | ||
8 | strip: //div[@class='articleMoreNews'] | ||
9 | strip: //div[@class='articleLeft']/h3 | ||
10 | strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix'] | ||
11 | |||
12 | # Remove duplicate title from text | ||
13 | strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3 | ||
14 | test_url: http://www.stalbansreview.co.uk/news/9581446.New_roundabout_in_King_Harry_Lane/r/?ref=rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/standard.co.uk.txt b/inc/3rdparty/site_config/standard/standard.co.uk.txt new file mode 100644 index 00000000..22a33484 --- /dev/null +++ b/inc/3rdparty/site_config/standard/standard.co.uk.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | autodetect_next_page: no | ||
2 | footnotes: no | ||
3 | dissolve: //div[@class="column-2"]//div[@class="widget"] | ||
4 | dissolve: //div[@class="column-2"]//div | ||
5 | |||
6 | author: //div[@class="innerbyline"]/a | ||
7 | strip: //div[@class="innerbyline"]/a | ||
8 | |||
9 | strip: //p[@class="dateline"] | ||
10 | date: //p[@class="dateline"] | ||
11 | |||
12 | title: //h1[@class="title"] | ||
13 | author: //div[@class="innerbyline"]/a | ||
14 | date: //p[@class="dateline"] | ||
15 | body: //div[@class="column-2"] | ||
16 | test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/staradvertiser.com.txt b/inc/3rdparty/site_config/standard/staradvertiser.com.txt new file mode 100644 index 00000000..0579455f --- /dev/null +++ b/inc/3rdparty/site_config/standard/staradvertiser.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[@id='storyTitle'] | ||
2 | author: substring-after(//span[@class='hsa_postCredit'], 'By ') | ||
3 | date://span[@class='hsa_dateStamp'] | ||
4 | body: //div[@class='storytext'] | ||
5 | strip_id_or_class: insideStoryAd | ||
6 | strip_id_or_class: printDesc | ||
7 | strip_id_or_class: sb_2010_story_tools | ||
8 | strip_id_or_class: FBConnectButton_Text | ||
9 | strip_id_or_class: breadcrumbs | ||
10 | prune: no | ||
11 | test_url: http://www.staradvertiser.com/news/20111112_World_leaders_step_onto_isle_stage.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stephenfry.com.txt b/inc/3rdparty/site_config/standard/stephenfry.com.txt new file mode 100644 index 00000000..1169984f --- /dev/null +++ b/inc/3rdparty/site_config/standard/stephenfry.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: /html/head/meta[@name='title']/@content | ||
2 | author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a | ||
3 | date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')] | ||
4 | |||
5 | body: //div[@class='entry-content'] | ||
6 | |||
7 | single_page_link: //p[@class='pagination']/a | ||
8 | test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stlbeacon.org.txt b/inc/3rdparty/site_config/standard/stlbeacon.org.txt new file mode 100644 index 00000000..d66fee9f --- /dev/null +++ b/inc/3rdparty/site_config/standard/stlbeacon.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: article/h1 | ||
2 | author: //p[@class='byline'] | ||
3 | date: //p[@class='date'] | ||
4 | body: //div[@class='body'] | ||
5 | test_url: https://www.stlbeacon.org/#!/content/23404/mogop_caucus_031712 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stockholm.etc.se.txt b/inc/3rdparty/site_config/standard/stockholm.etc.se.txt new file mode 100644 index 00000000..073043d5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/stockholm.etc.se.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | strip_id_or_class: 'left' | ||
2 | strip_id_or_class: 'right' | ||
3 | strip_id_or_class: 'block-belowcontent' | ||
4 | |||
5 | test_url: http://stockholm.etc.se/reportage/bakom-stangda-dorrar-pa-fas-3-massa \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/streetsblog.net.txt b/inc/3rdparty/site_config/standard/streetsblog.net.txt new file mode 100644 index 00000000..0b62a3d6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/streetsblog.net.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h2[@class="post-title"] | ||
2 | date: //span[@class="post-date"] | ||
3 | body: //div[@class="post-entry"] | ||
4 | |||
5 | #This is also good for *.streetsblog.org, for example: | ||
6 | #http://dc.streetsblog.org/2011/10/21/friday-job-market/ | ||
7 | test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stuff.co.nz.txt b/inc/3rdparty/site_config/standard/stuff.co.nz.txt new file mode 100644 index 00000000..12fd0939 --- /dev/null +++ b/inc/3rdparty/site_config/standard/stuff.co.nz.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title://div[@id='left_col']/h1 | ||
2 | author:substring-after(//span[contains(@class,'storycredit')],'BY ') | ||
3 | author://span[contains(@class,'storycredit')] | ||
4 | date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ') | ||
5 | date://div[contains(@class,'toolbox_date')] | ||
6 | body://div[@id='left_col'] | ||
7 | |||
8 | strip_id_or_class: toolbox | ||
9 | strip_id_or_class: story_features | ||
10 | strip_id_or_class: sharebox_new | ||
11 | strip_id_or_class: related_box | ||
12 | strip_id_or_class: sponsored_links | ||
13 | strip_id_or_class: hidden_ad | ||
14 | strip_id_or_class: story_content_top | ||
15 | strip_id_or_class: total_number | ||
16 | strip_id_or_class: sort_order | ||
17 | strip_id_or_class: subscribe_order | ||
18 | |||
19 | strip://div[contains(@class,'ad_story')] | ||
20 | |||
21 | test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge | ||
22 | test_url: http://www.stuff.co.nz/entertainment/7045944/International-praise-for-Ladyhawke \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/stumbleupon.com.txt b/inc/3rdparty/site_config/standard/stumbleupon.com.txt new file mode 100644 index 00000000..85682166 --- /dev/null +++ b/inc/3rdparty/site_config/standard/stumbleupon.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link: //iframe[@id='stumbleFrame']/@src | ||
2 | |||
3 | test_url: http://www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/ \ No newline at end of file | |
diff --git a/inc/3rdparty/site_config/standard/subtraction.com.txt b/inc/3rdparty/site_config/standard/subtraction.com.txt new file mode 100644 index 00000000..454e37b1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/subtraction.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //*[@id='posts']/div[1]/h2 | ||
2 | author: //*[@id='posts']/div[1]/div[2]/span[2]/a | ||
3 | date: //*[@class='date'] | ||
4 | body: //div[@class='body-lead'] | ||
5 | |||
6 | # take out the bit saying 'body' | ||
7 | strip: //div[@class='body-lead']/div[@class='info-label'] | ||
8 | |||
9 | |||
10 | |||
11 | |||
12 | |||
13 | |||
14 | |||
15 | |||
16 | |||
17 | test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/sueddeutsche.de.txt new file mode 100644 index 00000000..4aa9410c --- /dev/null +++ b/inc/3rdparty/site_config/standard/sueddeutsche.de.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | # 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@... | ||
2 | |||
3 | single_page_link: //a[ contains( @href, "/2.220/" ) ] | ||
4 | |||
5 | body: //article[@id="sitecontent"]/section[@class="body"] | ||
6 | author: //address[@class="author"] | ||
7 | date: //div[@class="header"]//h1//span[@class="updated"] | ||
8 | wrap_in(small): //div[@class="footer"] | ||
9 | wrap_in(i): //figcaption/h3 | ||
10 | dissolve: //figcaption//h3 | ||
11 | dissolve: //figure/div[@class="body"] | ||
12 | dissolve: //figure/a | ||
13 | |||
14 | strip: //figure[ not( contains(@class, "zoomimage" ) ) ] | ||
15 | strip: //div[@data-onlineonly="true"] | ||
16 | strip: //address[@class="author"] | ||
17 | |||
18 | test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/summify.com.txt b/inc/3rdparty/site_config/standard/summify.com.txt new file mode 100644 index 00000000..1128e1bb --- /dev/null +++ b/inc/3rdparty/site_config/standard/summify.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_id_or_class: toolbar | ||
2 | test_url: http://summify.com/story/Tmt1YQ0JBgKTAHwK/www.nybooks.com/articles/archives/2003/jan/16/fixed-opinions-or-the-hinge-of-history/?pagination=false \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/suntimes.com.txt b/inc/3rdparty/site_config/standard/suntimes.com.txt new file mode 100644 index 00000000..13390e4f --- /dev/null +++ b/inc/3rdparty/site_config/standard/suntimes.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //div[@class='story-details']/h1 | ||
2 | date: //span[@class='date-time'] | ||
3 | author: substring-after(//p[@class='by-line'], 'By ') | |
4 | |||
5 | strip: //div[@class='videoThumbnails'] | ||
6 | strip: //div[@class='ad-square2-container'] | ||
7 | strip: //div[@class='homeDeliveryContainer5'] | ||
8 | |||
9 | strip: //div[@class='image-description'] | ||
10 | strip: //div[@id='internal-side-bar'] | ||
11 | |||
12 | strip: //span[@class='hide'] | ||
13 | strip: //div[@class='date'] | ||
14 | test_url: http://www.suntimes.com/technology/ihnatko/8816567-452/review-kindle-fire-is-no-ipad-killer-but-it-is-a-killer-device.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/svd.se.txt b/inc/3rdparty/site_config/standard/svd.se.txt new file mode 100644 index 00000000..02b5b8ca --- /dev/null +++ b/inc/3rdparty/site_config/standard/svd.se.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | # Ads | ||
2 | strip_id_or_class: articlead | ||
3 | |||
4 | test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sydsvenskan.se.txt b/inc/3rdparty/site_config/standard/sydsvenskan.se.txt new file mode 100644 index 00000000..da6772aa --- /dev/null +++ b/inc/3rdparty/site_config/standard/sydsvenskan.se.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | author: //a[contains(@href, '/sok/?')]/text() | ||
4 | |||
5 | date: substring-after(//span[@class='date'], 'Publicerad ') | ||
6 | |||
7 | body: //div[@class='two_column_left'] | ||
8 | strip_id_or_class: story | ||
9 | strip: //div[@class='leadText saplo:lead']/h5 | ||
10 | |||
11 | test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna-- \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt b/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt new file mode 100644 index 00000000..3109c0e7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/symmetrymagazine.org.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //div[contains(@class, "post")]/h2 | ||
2 | |||
3 | author: //div[contains(@class, "post")]/p[position()=last()]/text()[1] | ||
4 | |||
5 | date: //div[contains(@class, "post")]/p[1] | ||
6 | |||
7 | body: //div[contains(@class, "post")] | ||
8 | |||
9 | strip: //div[contains(@class, "post")]/h2[1] | ||
10 | strip: //div[contains(@class, "post")]/p[1] | ||
11 | strip: //div[contains(@class, "post")]/p[position()=last()] | ||
12 | test_url: http://www.symmetrymagazine.org/breaking/?p=12784 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt b/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt new file mode 100644 index 00000000..c3e34977 --- /dev/null +++ b/inc/3rdparty/site_config/standard/sz-magazin.sueddeutsche.de.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //h1 | ||
2 | body://div[@class='drucken'] | ||
3 | author: substring-after(//span[@class='autor'], 'Von ') | ||
4 | author: //span[@class='autor'] | ||
5 | |||
6 | single_page_link://a[contains(@href, '/drucken/')] | ||
7 | convert_double_br_tags:yes | ||
8 | |||
9 | dissolve://div[@class='vorspann'] | ||
10 | |||
11 | strip://h1 | ||
12 | strip_id_or_class: klassifizierung | ||
13 | strip_id_or_class: source | ||
14 | strip_id_or_class: autor | ||
15 | test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tagesschau.de.txt b/inc/3rdparty/site_config/standard/tagesschau.de.txt new file mode 100644 index 00000000..8ce8a90e --- /dev/null +++ b/inc/3rdparty/site_config/standard/tagesschau.de.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | title://h1[1] | ||
2 | |||
3 | author: substring-after(//em, 'Von ') | ||
4 | author:string('tagesschau.de') | ||
5 | |||
6 | date:substring-after(//div[@class='standDatum'], 'Stand: ') | ||
7 | |||
8 | body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')] | ||
9 | |||
10 | strip://h1[1] | ||
11 | strip: //div[contains(@class, 'directLinks')] | ||
12 | strip: //div[contains(@class, 'zitatBox')] | ||
13 | strip: //div[contains(@class, 'teaserBox metaBlock')] | ||
14 | strip: //*[contains(@class, 'inv')] | ||
15 | strip: //span[@class='imgSubline'] | ||
16 | strip: //*[contains(@class, 'topline')][1] | ||
17 | strip: //div[@id='rightCol'][1] | ||
18 | strip: //div[@id="footer"][1] | ||
19 | strip: //div[@class="fPlayer"] | ||
20 | strip: //div[@id='seitenanfang'] | ||
21 | strip: //div[@class='standDatum'] | ||
22 | strip: //em | ||
23 | test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tampabay.com.txt b/inc/3rdparty/site_config/standard/tampabay.com.txt new file mode 100644 index 00000000..bfe841c6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tampabay.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //span[@class="entry-title"] | ||
2 | author: //*[contains(@class, 'item')]/p/a/text() | ||
3 | date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:') | ||
4 | body: //div[@class="entry-content"] | ||
5 | test_url: http://www.tampabay.com/news/salvador-dali-leaders-want-st-petersburg-city-council-to-put-brakes-on/1236349 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/taptaptap.com.txt b/inc/3rdparty/site_config/standard/taptaptap.com.txt new file mode 100644 index 00000000..13de70e9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/taptaptap.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h3[@class="storytitle"] | ||
2 | body: //div[@class="post"] | ||
3 | strip: //div[@class="blurbBox"] | ||
4 | test_url: http://taptaptap.com/blog/apples-precedents-vs-apples-guidelines/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tasteofhome.com.txt b/inc/3rdparty/site_config/standard/tasteofhome.com.txt new file mode 100644 index 00000000..77773363 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tasteofhome.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //span[@id='ctl00_ctl00_MainContent_MainContent_RecipeImage1_lblRecipeTitle'] | ||
2 | body: //div[@id='RDNEW']//*[@class='Recipe-imgCon' or @class='Recipe-Intro' or @class='recipeDetails'] | ||
3 | strip_id_or_class: rec-ExRightPanel | ||
4 | strip_id_or_class: divCarousel | ||
5 | strip_id_or_class: preptimeOuter | ||
6 | strip_id_or_class: cooktimeOuter | ||
7 | strip_id_or_class: durationOuter | ||
8 | strip_id_or_class: divImageFooter | ||
9 | strip_id_or_class: microFormatFnIngred | ||
10 | strip: //span[@class='Recipe-Intro']//*[@class='link' or @class='rating'] | ||
11 | |||
12 | prune: no | ||
13 | tidy: no | ||
14 | |||
15 | test_url: http://www.tasteofhome.com/recipes/Grinch-Punch \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/taz.de.txt b/inc/3rdparty/site_config/standard/taz.de.txt new file mode 100644 index 00000000..6e84527b --- /dev/null +++ b/inc/3rdparty/site_config/standard/taz.de.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | date: //div[@class='secthead'] | ||
2 | body: //div[@class='sectbody'] | ||
3 | title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1) | ||
4 | author: //span[@class='author'] | ||
5 | strip: //p[@class='caption'] | ||
6 | strip_id_or_class: rack | ||
7 | |||
8 | test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tbray.org.txt b/inc/3rdparty/site_config/standard/tbray.org.txt new file mode 100644 index 00000000..fbe94fa4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tbray.org.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | body: //div[@id='centercontent'] | ||
2 | strip: //div[@id='rightcontent'] | ||
3 | date: substring-before( //div[@id='cats'], '·') | ||
4 | title: //h1 | ||
5 | test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tcng.org.txt b/inc/3rdparty/site_config/standard/tcng.org.txt new file mode 100644 index 00000000..765224e4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tcng.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //div[@id='main-content']/h1 | ||
2 | body: //div[@id='main-content'] | ||
3 | strip: //div[@id='main-content']/h1 | ||
4 | test_url: http://www.tcng.org/index.php/blog/view/teaching-basic-health-cutting-down-costs \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt b/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt new file mode 100644 index 00000000..b6d17da4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tech.fortune.cnn.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h1[@class='storyheadline'] | ||
2 | body: //div[@class='storytext'] | ||
3 | strip: //strong | ||
4 | test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt b/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt new file mode 100644 index 00000000..f7228ebf --- /dev/null +++ b/inc/3rdparty/site_config/standard/tech.sina.com.cn.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title://h1[contains(@id,'artibodyTitle')] | ||
2 | |||
3 | date://span[contains(@id,'pub_date')] | ||
4 | |||
5 | body://div[contains(@id,'artibody')] | ||
6 | |||
7 | strip://div[contains(@class,'otherContent')] | ||
8 | |||
9 | next_page_link://p[@class='page']/a[contains(.,'下一页')] | ||
10 | |||
11 | test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techcrunch.com.txt b/inc/3rdparty/site_config/standard/techcrunch.com.txt new file mode 100644 index 00000000..f436acb5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/techcrunch.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')] | ||
2 | |||
3 | author: //a[@class="name"] | ||
4 | |||
5 | date: //div[@class="post-time"] | ||
6 | |||
7 | title: //h1[@class="headline"] | ||
8 | strip_id_or_class: module-crunchbase | ||
9 | |||
10 | # The following is for the mobile site | ||
11 | body: //div[@id="singlentry"] | ||
12 | author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ') | ||
13 | date: substring-before(//div[@class="single-post-meta-top"],' @') | ||
14 | title: //a[@class="sh2"] | ||
15 | |||
16 | prune: no | ||
17 | |||
18 | test_url: http://techcrunch.com/2011/10/18/apples-insanely-great-q1-2012/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techdirt.com.txt b/inc/3rdparty/site_config/standard/techdirt.com.txt new file mode 100644 index 00000000..727f3701 --- /dev/null +++ b/inc/3rdparty/site_config/standard/techdirt.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@class='story'] | ||
2 | title: //div[@class='story']/h1 | ||
3 | strip: //div[@class='story']/h1 | ||
4 | |||
5 | author: //div[@class='details']/p[contains(., 'by ')]/a | ||
6 | date: //p[@class='storydate'] | ||
7 | |||
8 | strip: //p[a[contains(., 'Leave a Comment')]] | ||
9 | strip_id_or_class: share | ||
10 | strip_id_or_class: maincolumn_head | ||
11 | strip_id_or_class: maincolmod | ||
12 | test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techmeme.com.txt b/inc/3rdparty/site_config/standard/techmeme.com.txt new file mode 100644 index 00000000..8644e00f --- /dev/null +++ b/inc/3rdparty/site_config/standard/techmeme.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | single_page_link_in_feed: //b/a | ||
2 | |||
3 | test_url_feed: http://www.techmeme.com/feed.xml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt b/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt new file mode 100644 index 00000000..cc26ee4c --- /dev/null +++ b/inc/3rdparty/site_config/standard/technicallyjordan.tumblr.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h2 | ||
2 | author: //meta[@name="author"]/@content | ||
3 | date: //h3 | ||
4 | body: //div[@class="postBody"] | ||
5 | strip: //h1 | ||
6 | strip: //h2 | ||
7 | strip: //h3 | ||
8 | test_url: http://technicallyjordan.tumblr.com/post/22914659822/facebook-to-launch-app-store-knock-off \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/technologyreview.com.txt b/inc/3rdparty/site_config/standard/technologyreview.com.txt new file mode 100644 index 00000000..41f21d46 --- /dev/null +++ b/inc/3rdparty/site_config/standard/technologyreview.com.txt | |||
@@ -0,0 +1,16 @@ | |||
1 | title: //header[@class='article-meta']/h1 | ||
2 | title: substring-before(//title, '|') | ||
3 | |||
4 | body: //section[contains(@class, 'body')] | ||
5 | |||
6 | # Author & Date for News and Featured Stories | ||
7 | author: //ul[@class='byline']/li/a | ||
8 | author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on') | ||
9 | date: substring-after(//ul[@class='byline']/li, 'on ') | ||
10 | |||
11 | # Author & Date for "Views" | ||
12 | author: //div[@class='view-byline']/div[@class='meta']/h2[1] | ||
13 | date: //div[@class='view-byline']/div[@class='meta']/h2[2] | ||
14 | |||
15 | next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')] | ||
16 | test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techpinions.com.txt b/inc/3rdparty/site_config/standard/techpinions.com.txt new file mode 100644 index 00000000..89ed8349 --- /dev/null +++ b/inc/3rdparty/site_config/standard/techpinions.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: //div[@class="post"] | ||
2 | |||
3 | strip: //div[@class="post-meta"] | ||
4 | strip: //div[@id="socialicons"] | ||
5 | strip: //div[@id="authorbox"] | ||
6 | |||
7 | test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/techradar.com.txt b/inc/3rdparty/site_config/standard/techradar.com.txt new file mode 100644 index 00000000..ed92a974 --- /dev/null +++ b/inc/3rdparty/site_config/standard/techradar.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | # Title without news/reviews etc. appended | ||
2 | title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1 | ||
3 | |||
4 | # Remove home link | ||
5 | strip: //div[@id='page_logo']/a | ||
6 | |||
7 | # Remove utilities | ||
8 | strip: //*[(@id = "utilities")] | ||
9 | |||
10 | # Remove comments link | ||
11 | strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny'] | ||
12 | test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/telegraaf.nl.txt b/inc/3rdparty/site_config/standard/telegraaf.nl.txt new file mode 100644 index 00000000..ff3cd06e --- /dev/null +++ b/inc/3rdparty/site_config/standard/telegraaf.nl.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='artikelKolom'] | ||
2 | strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper'] | ||
3 | strip: //div[@id='artikeltoolbar'] | ||
4 | strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer'] | ||
5 | strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget'] | ||
6 | tidy: no | ||
7 | prune: no | ||
8 | |||
9 | test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/telegraph.co.uk.txt b/inc/3rdparty/site_config/standard/telegraph.co.uk.txt new file mode 100644 index 00000000..e1faf23b --- /dev/null +++ b/inc/3rdparty/site_config/standard/telegraph.co.uk.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea'] | ||
2 | strip: //p[@class='comments'] | ||
3 | strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")] | ||
4 | strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links'] | ||
5 | strip: //p[@class='bbpTweet']/span[@class='timestamp'] | ||
6 | strip: //p[@class='bbpTweet']/span[@class='metadata']//img | ||
7 | tidy: no | ||
8 | prune: no | ||
9 | |||
10 | test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theappleblog.com.txt b/inc/3rdparty/site_config/standard/theappleblog.com.txt new file mode 100644 index 00000000..3bd555f1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theappleblog.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | # Remove home link | ||
2 | strip: //div[@id='blog-title']/a | ||
3 | test_url: http://theappleblog.com/2010/10/21/the-new-macbook-air-is-underwhelming/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theatlantic.com.txt b/inc/3rdparty/site_config/standard/theatlantic.com.txt new file mode 100644 index 00000000..267fd39c --- /dev/null +++ b/inc/3rdparty/site_config/standard/theatlantic.com.txt | |||
@@ -0,0 +1,18 @@ | |||
1 | title: //div[@id='article']/h1 | ||
2 | title: //h1 | ||
3 | |||
4 | body: //div[@class='articleText'] | ||
5 | body: //div[@class='articleContent'] | ||
6 | body: //div[@id='article'] | ||
7 | date: //*[contains(@class, 'date')] | ||
8 | author: //div[@id='profile']//*[@class='authors']//a[1] | ||
9 | author: //*[@class='author']/span | ||
10 | prune: no | ||
11 | |||
12 | strip: //div[@class='moreOnBoxWithImages'] | ||
13 | |||
14 | single_page_link: //a[@class='print'] | ||
15 | |||
16 | test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/ | ||
17 | test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/ | ||
18 | test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thebostonchannel.com.txt b/inc/3rdparty/site_config/standard/thebostonchannel.com.txt new file mode 100644 index 00000000..64df90c1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thebostonchannel.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //meta[@name='og:title']/@content | ||
2 | date: //meta[@name='created']/@content | ||
3 | body: //div[@class="StoryBody" or @class="storyTeaser"] | ||
4 | |||
5 | replace_string(<p></p>): <br /><br /> | ||
6 | |||
7 | test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thebrowser.com.txt b/inc/3rdparty/site_config/standard/thebrowser.com.txt new file mode 100644 index 00000000..c3c20504 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thebrowser.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h2[contains(@class, 'page-title')] | ||
2 | body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content'] | ||
3 | |||
4 | prune: no | ||
5 | |||
6 | strip: //div[contains(@class, 'node-book')]//a[@class='button'] | ||
7 | |||
8 | single_page_link: //a[@class='tool-print'] | ||
9 | |||
10 | test_url: http://thebrowser.com/interviews/yotam-ottolenghi-on-his-favourite-cookery-books \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thecarton.net.txt b/inc/3rdparty/site_config/standard/thecarton.net.txt new file mode 100644 index 00000000..9ef4ed8b --- /dev/null +++ b/inc/3rdparty/site_config/standard/thecarton.net.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: substring-before(//title, ' – ') | ||
2 | author:string('Shawn') | ||
3 | date: //*/time/@pubdate | ||
4 | |||
5 | |||
6 | strip: //header | ||
7 | strip: //div[@id='prev_next'] | ||
8 | strip: //div[@id='masthead'] | ||
9 | |||
10 | test_url: http://thecarton.net/2012/12/20/imdb \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thedaily.com.txt b/inc/3rdparty/site_config/standard/thedaily.com.txt new file mode 100644 index 00000000..24ebbbac --- /dev/null +++ b/inc/3rdparty/site_config/standard/thedaily.com.txt | |||
@@ -0,0 +1,24 @@ | |||
1 | #keep all body text | ||
2 | prune: no | ||
3 | |||
4 | #title, body, metadata | ||
5 | title: //div[@class='story_header']/h1 | ||
6 | body: //div[@id='content'] | ||
7 | author: substring-after(//span[@class='byline'], "by ") | ||
8 | author: substring-after(//span[@class='byline'], "By ") | ||
9 | author: //span[@class='byline'] | ||
10 | date: //span[@class='date'] | ||
11 | |||
12 | #formatting | ||
13 | convert_double_br_tags: yes | ||
14 | dissolve: //div[@class='slides_full']/ul/li | ||
15 | |||
16 | # cleanup | ||
17 | strip: //a[@id='story_note'] | ||
18 | strip: //br | ||
19 | strip: //div[@class='intro'] | ||
20 | strip: //div[@class='share-block'] | ||
21 | strip: //div[@class='sidebar-social'] | ||
22 | strip: //div[@class='top-stories'] | ||
23 | strip: //div[@class='prevnext'] | ||
24 | test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thedailybeast.com.txt b/inc/3rdparty/site_config/standard/thedailybeast.com.txt new file mode 100644 index 00000000..4781c65a --- /dev/null +++ b/inc/3rdparty/site_config/standard/thedailybeast.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //h1 | ||
2 | body: //article/div[contains(@class, 'article-body')] | ||
3 | #strip: //header/hgroup/h1 | ||
4 | strip: //footer[@class='storyFooter'] | ||
5 | single_page_link: //li[@class='print']/a | ||
6 | prune: no | ||
7 | test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt b/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt new file mode 100644 index 00000000..0f15558d --- /dev/null +++ b/inc/3rdparty/site_config/standard/thedailymash.co.uk.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | # Remove duplicated title | ||
2 | strip: //div[@id='content']/div[1][@class='full_intro']/h2 | ||
3 | |||
4 | # Remove links, ads etc. | ||
5 | strip: //*[(@class= "aside")] | ||
6 | |||
7 | # Remove the date and add it to the date published field in Instapaper | ||
8 | strip: //div[@class="date"] | ||
9 | date: //div[@class="date"] | ||
10 | |||
11 | # There is no byline on The Daily Mash. | ||
12 | |||
13 | convert_double_br_tags: yes | ||
14 | test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thefilmexperience.net.txt b/inc/3rdparty/site_config/standard/thefilmexperience.net.txt new file mode 100644 index 00000000..e6b5115a --- /dev/null +++ b/inc/3rdparty/site_config/standard/thefilmexperience.net.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='body'] | ||
2 | test_url: http://thefilmexperience.net/blog/2011/12/30/distant-relatives-2001-a-space-odyssey-and-the-tree-of-life.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theglobalmail.org.txt b/inc/3rdparty/site_config/standard/theglobalmail.org.txt new file mode 100644 index 00000000..fae0fb29 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theglobalmail.org.txt | |||
@@ -0,0 +1,41 @@ | |||
1 | title: //h1[@id="headline"] | ||
2 | author: //div[contains(@class, "editorial-byline-author")]/a | ||
3 | date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ") | ||
4 | |||
5 | # The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed | |
6 | body: //div[@id="template"] | ||
7 | strip_id_or_class: editorial-byline-pic | ||
8 | strip_id_or_class: editorial-byline | ||
9 | strip_id_or_class: headline | ||
10 | |||
11 | # Include the leadin paragraph in the body text, but remove quotes because they're out of context | ||
12 | dissolve: //div[contains(@id, "leadin")] | ||
13 | strip_id_or_class: pullquote | ||
14 | |||
15 | # Image captions removed because they're confusing in body text | ||
16 | strip_id_or_class: image-caption-content | ||
17 | |||
18 | # Remove header and footer | ||
19 | strip_id_or_class: header | ||
20 | strip_id_or_class: footer | ||
21 | |||
22 | # Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image | ||
23 | strip: /html/body/span[contains(@style, "display: none")] | ||
24 | |||
25 | # Remove search box | ||
26 | strip_id_or_class: searchContainer | ||
27 | strip: //div[contains(@class, "searchInstruction")] | ||
28 | strip: //div[contains(@class, "searchResults")]/h4 | ||
29 | |||
30 | # Remove the 'Letters to the Editor' section | ||
31 | strip_id_or_class: letter-text | ||
32 | strip_id_or_class: letter-from | ||
33 | strip_id_or_class: letter-date | ||
34 | |||
35 | # Remove Like/Tweet links | ||
36 | strip_id_or_class: social-tab | ||
37 | |||
38 | # Remove 'divider' which causes an inexplicable slash to appear in the article body | ||
39 | strip_id_or_class: divider | ||
40 | |||
41 | test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theglobeandmail.com.txt b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt new file mode 100644 index 00000000..90634a08 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theglobeandmail.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')] | ||
2 | tidy: no | ||
3 | prune: no | ||
4 | |||
5 | test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theindychannel.com.txt b/inc/3rdparty/site_config/standard/theindychannel.com.txt new file mode 100644 index 00000000..3544f247 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theindychannel.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //h1[@class="Headline"] | ||
2 | date: substring-after(//div[@class="posted"], 'EDT ') | ||
3 | body: //div[@class="storyBody"] | ||
4 | |||
5 | strip: //td[@class="AssocContentTD"] | ||
6 | strip: //div[@id="pageTitle"] | ||
7 | strip: //div[@class="posted"] | ||
8 | strip: //div[@class="updated"] | ||
9 | strip: //div[@class="js-kit-disclaimer"] | ||
10 | strip: //table[@class="row3table"] | ||
11 | strip: //div[@class="container2"] | ||
12 | strip: //div[@id="delta"] | ||
13 | test_url: http://www.theindychannel.com/news/31050840/detail.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/themillions.com.txt b/inc/3rdparty/site_config/standard/themillions.com.txt new file mode 100644 index 00000000..e3e57fea --- /dev/null +++ b/inc/3rdparty/site_config/standard/themillions.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: /html/body/div/div[2]/div/div/div/h3 | ||
2 | |||
3 | body: /html/body/div/div[2]/div/div/div/div[2] | ||
4 | |||
5 | strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div | ||
6 | |||
7 | tidy: no | ||
8 | |||
9 | # any way to get rid of this word character garbage? | ||
10 | test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt b/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt new file mode 100644 index 00000000..518bff93 --- /dev/null +++ b/inc/3rdparty/site_config/standard/themuseumofinnocence.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | body: single-review | ||
2 | strip_id_or_class: featured-review | ||
3 | strip_id_or_class: resources | ||
4 | strip_id_or_class: rate-the-book | ||
5 | strip_id_or_class: write-review | ||
6 | |||
7 | test_url: http://themuseumofinnocence.com/review.php?id=1179 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thenation.com.txt b/inc/3rdparty/site_config/standard/thenation.com.txt new file mode 100644 index 00000000..d88bcdd6 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thenation.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[@class='print-title'] | ||
2 | body: //div[@class='print-content'] | ||
3 | author: //a[contains(@href, '/authors')] | ||
4 | author: substring-before(//div[@class='print-created'], '|') | ||
5 | date: //span[@class='article-date'] | ||
6 | date: substring-after(//div[@class='print-created'], '|') | ||
7 | prune: no | ||
8 | |||
9 | single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')] | ||
10 | |||
11 | test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt b/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt new file mode 100644 index 00000000..846b8a8a --- /dev/null +++ b/inc/3rdparty/site_config/standard/thenetworkgarden.blogs.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@id="beta-inner"] | ||
2 | title: //h3[@class="entry-header"] | ||
3 | |||
4 | test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thenextweb.com.txt b/inc/3rdparty/site_config/standard/thenextweb.com.txt new file mode 100644 index 00000000..fdc70005 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thenextweb.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body: //div[@class= 'article-body'] | ||
2 | author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')] | ||
3 | |||
4 | strip: //div[@class = 'bargo'] | ||
5 | strip: //div[@class = 'tf'] | ||
6 | strip: //div[@class = 'article']/div[@class = 'blue-box'] | ||
7 | strip_id_or_class: respond | ||
8 | |||
9 | tidy: no | ||
10 | next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href | ||
11 | |||
12 | test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theoaklandpress.com.txt b/inc/3rdparty/site_config/standard/theoaklandpress.com.txt new file mode 100644 index 00000000..c7132321 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theoaklandpress.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='fullstory'] | ||
2 | strip: //div[@id='page_leftbar'] | ||
3 | test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theonion.com.txt b/inc/3rdparty/site_config/standard/theonion.com.txt new file mode 100644 index 00000000..12918b88 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theonion.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h2[@class='title'] | ||
2 | date: substring-before(//p[@class='meta'], '|') | ||
3 | body: //div[@class='story'] | ||
4 | #body: //div[@class='article_body'] | ||
5 | |||
6 | strip: //h2[@class='title'] | ||
7 | strip: //p[@class='meta'] | ||
8 | strip: //div[@class='ga_section'] | ||
9 | strip: //div[@id='recent_slider'] | ||
10 | |||
11 | test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt b/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt new file mode 100644 index 00000000..f89f3a87 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thepioneerwoman.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1[@class='post-title'] | ||
2 | body: //div[@class='post'] | ||
3 | author: //p[@class='posted-by'] | ||
4 | date: //div[@class='sprite post-date'] | ||
5 | |||
6 | # The body of the post doesn't have its own div so we have to strip out the metadata | ||
7 | strip: //div[@class='author_avatar'] | ||
8 | strip: //div[@class='sprite post-date'] | ||
9 | strip: //h1[@class='post-title'] | ||
10 | strip: //p[@class='posted-by'] | ||
11 | test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theregister.co.uk.txt b/inc/3rdparty/site_config/standard/theregister.co.uk.txt new file mode 100644 index 00000000..ebcc55d5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theregister.co.uk.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@id="article"]/h2 | ||
2 | author: //div[@id="article"]/p[@class="byline"]/a[1] | ||
3 | date: //div[@id="article"]/p[@class="dateline"]/a[2] | ||
4 | body: //div[@id="article"]/div[@id="body"] | ||
5 | test_url: http://www.theregister.co.uk/2011/10/06/gas_bill_shocker/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theroot.com.txt b/inc/3rdparty/site_config/standard/theroot.com.txt new file mode 100644 index 00000000..ebff662d --- /dev/null +++ b/inc/3rdparty/site_config/standard/theroot.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='node-content'] | ||
2 | strip_id_or_class: pager | ||
3 | test_url: http://www.theroot.com/views/why-i-am-male-feminist \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/therumpus.net.txt b/inc/3rdparty/site_config/standard/therumpus.net.txt new file mode 100644 index 00000000..d01a89bb --- /dev/null +++ b/inc/3rdparty/site_config/standard/therumpus.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: /html/body/div/div[2]/div/div/h1 | ||
2 | |||
3 | body: /html/body/div/div[2]/div/div/div[2] | ||
4 | test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thesiasat.com.txt b/inc/3rdparty/site_config/standard/thesiasat.com.txt new file mode 100644 index 00000000..ab9a99e8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thesiasat.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | #body: (//div[@class='ftr-yt-vid'])[1] | ||
2 | body: (//blockquote[contains(@class, 'postcontent')])[1] | ||
3 | body: (//div[starts-with(@id, 'post_message')])[1] | ||
4 | |||
5 | prune: no | ||
6 | tidy: no | ||
7 | |||
8 | #replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player" | ||
9 | #replace_string(</iframe>): </iframe> </div> | ||
10 | |||
11 | test_url: http://www.thesiasat.com/showthread.php?19220-Dunya-News-HASB-E-HAAL-16-06-2012-Part-1-5 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thesimpledollar.com.txt b/inc/3rdparty/site_config/standard/thesimpledollar.com.txt new file mode 100644 index 00000000..d5c6c9e0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thesimpledollar.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h3[@class='post-title']/a[@class='post-title-link'] | ||
2 | body: //div[@class='post-content'] | ||
3 | author: //div[@class='post-meta-under-title']/a | ||
4 | test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt b/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt new file mode 100644 index 00000000..e2ed1e63 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thespoiler.co.uk.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | strip: //*[(@id = "content")]/h2 | ||
2 | strip: //*[(@class = "wp-notable-line")] | ||
3 | test_url: http://www.thespoiler.co.uk/index.php/2010/10/21/wayne-rooney-tells-man-utd-its-not-me-its-you \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thespoof.com.txt b/inc/3rdparty/site_config/standard/thespoof.com.txt new file mode 100644 index 00000000..409dc0c9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thespoof.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h1[contains(@class, 'cTitle')] | ||
2 | body: //div[contains(@class, 'KonaBody') or @id='articleimageright'] | ||
3 | author: //meta[@name='Author']/@content | ||
4 | date: //meta[@name='OriginalPublicationDate']/@content | ||
5 | |||
6 | prune: no | ||
7 | tidy: no | ||
8 | |||
9 | test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thestranger.com.txt b/inc/3rdparty/site_config/standard/thestranger.com.txt new file mode 100644 index 00000000..0f9855c8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thestranger.com.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | # savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029 | ||
2 | |||
3 | #other filtering is for plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885 | ||
4 | |||
5 | title: //div[@id='savageColumn_head']/h1 | ||
6 | title: //h1[@class="headlineLarge"] | ||
7 | |||
8 | strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner'] | ||
9 | |||
10 | body: //div[@id='savageColumn'] | ||
11 | body: //div[@id='story_text'] | ||
12 | test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thestreet.com.txt b/inc/3rdparty/site_config/standard/thestreet.com.txt new file mode 100644 index 00000000..5de75637 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thestreet.com.txt | |||
@@ -0,0 +1,25 @@ | |||
1 | title: //div[@id='storyHdr']/h1 | ||
2 | title: //div[@id='print']//h2 | ||
3 | body: //div[@class="virtualpage"] | ||
4 | body: //div[@id='print']//div[@id='bd'] | ||
5 | author: //meta[@name="AUTHOR"]/@content | ||
6 | author: (//div[@id='print']//div[@id='bd']/h4)[1] | ||
7 | date: //meta[@name="DATE"]/@content | ||
8 | date: //div[@id='print']//div[@id='dte'] | ||
9 | |||
10 | strip_id_or_class: articleFooter | ||
11 | strip_id_or_class: sidebar | ||
12 | strip_id_or_class: ie6PrintSubhead | ||
13 | strip_id_or_class: subHdr | ||
14 | |||
15 | |||
16 | replace_string(<P/>): </p><p> | ||
17 | |||
18 | prune: no | ||
19 | |||
20 | #TODO: redirects back - perhaps needs referer to work | ||
21 | single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')] | ||
22 | |||
23 | test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html | ||
24 | # multi page | ||
25 | test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt b/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt new file mode 100644 index 00000000..6b3277eb --- /dev/null +++ b/inc/3rdparty/site_config/standard/thethaovanhoa.vn.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip:/html/body/form/div[2]/div[3]/div/div/div/div/div/div/div/div/div/div[2]/div[3]/div[2]/div/p[2] | ||
2 | test_url: http://thethaovanhoa.vn/151N20110519085606745T129/levante-quyet-giu-caicedo.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theverge.com.txt b/inc/3rdparty/site_config/standard/theverge.com.txt new file mode 100644 index 00000000..11c5c153 --- /dev/null +++ b/inc/3rdparty/site_config/standard/theverge.com.txt | |||
@@ -0,0 +1,31 @@ | |||
1 | title: //h1[contains(@class, "headline")] | ||
2 | |||
3 | author: //p[contains(@class, "byline")]/a[contains(@class, "author")] | ||
4 | |||
5 | date: substring-after(normalize-space(//p[contains(@class, "byline")]/span[contains(@class, "publish-date")]), "on ") | ||
6 | |||
7 | body: //article[contains(@class, 'feature-entry')] | ||
8 | body: //article | ||
9 | prune: no | ||
10 | tidy: no | ||
11 | |||
12 | strip: //article/header | ||
13 | strip: //*[@id='sticky-menu'] | ||
14 | strip: //aside | ||
15 | strip: //nav | ||
16 | |||
17 | strip_id_or_class: gallery | ||
18 | strip_id_or_class: article-meta | ||
19 | strip_id_or_class: story-navigation | ||
20 | strip_id_or_class: slegend | ||
21 | strip_id_or_class: related-product-meta | ||
22 | strip_id_or_class: comments | ||
23 | strip_id_or_class: ui-jump-list | ||
24 | strip_id_or_class: pullquote | ||
25 | |||
26 | strip: //q | ||
27 | |||
28 | strip: //a[contains(@class, 'entry-section-title')] | ||
29 | |||
30 | test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review | ||
31 | test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/theweek.com.txt b/inc/3rdparty/site_config/standard/theweek.com.txt new file mode 100644 index 00000000..27281ceb --- /dev/null +++ b/inc/3rdparty/site_config/standard/theweek.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | body: //div[@class="briefingEntry"] | ||
2 | prune: no | ||
3 | |||
4 | test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thinkprogress.org.txt b/inc/3rdparty/site_config/standard/thinkprogress.org.txt new file mode 100644 index 00000000..8934b68e --- /dev/null +++ b/inc/3rdparty/site_config/standard/thinkprogress.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //p[@class="byline"]/a | ||
2 | body: //div[@class="post"] | ||
3 | |||
4 | test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thisdaylive.com.txt b/inc/3rdparty/site_config/standard/thisdaylive.com.txt new file mode 100644 index 00000000..958d4b27 --- /dev/null +++ b/inc/3rdparty/site_config/standard/thisdaylive.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body'] | ||
2 | test_url: http://www.thisdaylive.com/articles/australia-pm-talks-human-rights-with-chinas-wen/90394/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/thisismynext.com.txt b/inc/3rdparty/site_config/standard/thisismynext.com.txt new file mode 100644 index 00000000..6850b4be --- /dev/null +++ b/inc/3rdparty/site_config/standard/thisismynext.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | author: //div[@class='meta clearfix']/a | ||
2 | body: //div[@class='post'] | ||
3 | |||
4 | strip: //div[@class='metaCat'] | ||
5 | strip: //div[@class='post']/h1 | ||
6 | strip: //div[@class='post']/div[@class='meta clearfix'] | ||
7 | strip: //div[@class='post']/div[@class='social-bar clearfix'] | ||
8 | test_url: http://thisismynext.com/2011/10/18/galaxy-nexus-android-ice-cream-sandwich-pictures-video-hands-on/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tidbits.com.txt b/inc/3rdparty/site_config/standard/tidbits.com.txt new file mode 100644 index 00000000..8bcf2ec1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tidbits.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | author: //span[@class='fn'] | ||
2 | date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|') | ||
3 | test_url: http://tidbits.com/article/12651 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/time.com.txt b/inc/3rdparty/site_config/standard/time.com.txt new file mode 100644 index 00000000..fd3fe08c --- /dev/null +++ b/inc/3rdparty/site_config/standard/time.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | # 2011-10-25 - carlo@... - Initial setup. | ||
2 | |||
3 | single_page_link: //li[@class='print']/a/@href | ||
4 | |||
5 | title: //h1 | ||
6 | author: //meta[@name="byline"]/@content | ||
7 | date: //meta[@name="date"]/@content | ||
8 | |||
9 | strip: //span[@class="see"] | ||
10 | strip: //div[@class="byline"] | ||
11 | strip: //div[@id="date2"] | ||
12 | strip: //h1 | ||
13 | |||
14 | test_url: http://www.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt b/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt new file mode 100644 index 00000000..17297732 --- /dev/null +++ b/inc/3rdparty/site_config/standard/timeshighereducation.co.uk.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@class="storytext"] | ||
3 | strip: //div[@id="thelogin"] | ||
4 | strip: //*[@class="hide"] | ||
5 | strip: //div[@id="anchored"] | ||
6 | test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tipb.com.txt b/inc/3rdparty/site_config/standard/tipb.com.txt new file mode 100644 index 00000000..9533eb0f --- /dev/null +++ b/inc/3rdparty/site_config/standard/tipb.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | body: //div[@id='content'] | ||
2 | |||
3 | strip_id_or_class: featured-box | ||
4 | strip_id_or_class: postmeta | ||
5 | strip_id_or_class: respond | ||
6 | |||
7 | author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')] | ||
8 | date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ') | ||
9 | test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tnr.com.txt b/inc/3rdparty/site_config/standard/tnr.com.txt new file mode 100644 index 00000000..65a1899f --- /dev/null +++ b/inc/3rdparty/site_config/standard/tnr.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1 | ||
2 | title: //div[contains(@class, 'article_detail')]//h1 | ||
3 | title: //h1 | ||
4 | |||
5 | body: //div[contains(@class, 'article_detail')] | ||
6 | |||
7 | author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3 | ||
8 | author: div[@class='author']//h3 | ||
9 | strip: //div[contains(@class, 'field-field-book-cover')] | ||
10 | |||
11 | date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '') | ||
12 | |||
13 | prune: no | ||
14 | |||
15 | single_page_link: //a[@class='print-page'] | ||
16 | |||
17 | test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tomdispatch.com.txt b/inc/3rdparty/site_config/standard/tomdispatch.com.txt new file mode 100644 index 00000000..d8548c78 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tomdispatch.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //div[@id='maincontent']//div[@class='title'] | ||
2 | body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat'] | ||
3 | |||
4 | tidy: no | ||
5 | |||
6 | test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tomshardware.com.txt b/inc/3rdparty/site_config/standard/tomshardware.com.txt new file mode 100644 index 00000000..2bba6de8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tomshardware.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | tidy: no | ||
2 | title: //title | ||
3 | author: //a[@itemprop = 'author'] | ||
4 | date: //time[@itemprop = 'datePublished'] | ||
5 | body: //div[@id = 'intelliTXT'] | ||
6 | |||
7 | next_page_link: //li[@class="pagin next"]/a | ||
8 | test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tomshardware.de.txt b/inc/3rdparty/site_config/standard/tomshardware.de.txt new file mode 100644 index 00000000..e910003c --- /dev/null +++ b/inc/3rdparty/site_config/standard/tomshardware.de.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | body://div[@id="news-content"]/div[@id="intelliTXT"][1] | ||
2 | |||
3 | author://div[@id="header-news-infos"]/a[1] | ||
4 | |||
5 | date: //div[@id="header-news-infos"]/span[1] | ||
6 | |||
7 | title://h1[@id="header-news-title" and @class="hardwareTitle"][1] | ||
8 | |||
9 | strip://div[@id="news-content"]/div[@id="intelliTXT"]/table | ||
10 | |||
11 | footnotes: no | ||
12 | test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/toolsandtoys.net.txt b/inc/3rdparty/site_config/standard/toolsandtoys.net.txt new file mode 100644 index 00000000..dbe60b15 --- /dev/null +++ b/inc/3rdparty/site_config/standard/toolsandtoys.net.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class='post'] | ||
2 | |||
3 | strip: //div[@class='social'] | ||
4 | strip: //span[@class='next'] | ||
5 | strip: //span[@class='previous'] | ||
6 | test_url: http://toolsandtoys.net/noble-tonic-02/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/trailer.web-view.net.txt b/inc/3rdparty/site_config/standard/trailer.web-view.net.txt new file mode 100644 index 00000000..e7a9c82d --- /dev/null +++ b/inc/3rdparty/site_config/standard/trailer.web-view.net.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: concat(substring-before(//title,':'),': ',//div[@class='Date2']) | ||
2 | test_url: http://trailer.web-view.net/Show/0XC4EFE5D648B716BA2E134BC7CE61B9CC001E04F11E9434438186735DBD637488.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/traningslara.se.txt b/inc/3rdparty/site_config/standard/traningslara.se.txt new file mode 100644 index 00000000..96e491fa --- /dev/null +++ b/inc/3rdparty/site_config/standard/traningslara.se.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //div[@class="Post-body"]//span[@class="PostHeader"] | ||
2 | author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"] | ||
3 | date: substring-before(//div[@class="PostHeaderIcons metadata"], '|') | ||
4 | body: //div[@class="Post-body"] | ||
5 | strip_id_or_class: print1 | ||
6 | strip_id_or_class: metadata | ||
7 | strip_id_or_class: authorbox | ||
8 | test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/triblive.com.txt b/inc/3rdparty/site_config/standard/triblive.com.txt new file mode 100644 index 00000000..82797db9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/triblive.com.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //title | ||
2 | author: //span/a | ||
3 | date: substring-after(//small,'Published:') | ||
4 | |||
5 | strip: //h1[@class='vert_class'] | ||
6 | strip: //h1[@class='headline'] | ||
7 | strip: //img[contains(@src,'logo_triblive.gif')] | ||
8 | |||
9 | #strip: //h6 | ||
10 | #strip_img_src: logo_triblive.gif | ||
11 | |||
12 | single_page_link: //a[@class='stprint'] | ||
13 | test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/truthdig.com.txt b/inc/3rdparty/site_config/standard/truthdig.com.txt new file mode 100644 index 00000000..e7c1a4bc --- /dev/null +++ b/inc/3rdparty/site_config/standard/truthdig.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //div[@class='printbody']/h1 | ||
2 | body: //div[@class='printbody'] | ||
3 | prune: no | ||
4 | |||
5 | strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/'] | ||
6 | strip: //table[@class='footer'] | ||
7 | |||
8 | single_page_link: //div[@class='article_tools']//a[contains(@href, '/print/')] | ||
9 | |||
10 | test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tthfanfic.org.txt b/inc/3rdparty/site_config/standard/tthfanfic.org.txt new file mode 100644 index 00000000..0dab5b0f --- /dev/null +++ b/inc/3rdparty/site_config/standard/tthfanfic.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h2 | ||
2 | author: //a[starts-with(@href, '/AuthorStories')] | ||
3 | body: //div[@id='storyinnerbody'] | ||
4 | test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tthor.com.txt b/inc/3rdparty/site_config/standard/tthor.com.txt new file mode 100644 index 00000000..902fcd13 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tthor.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | prune: no | ||
2 | test_url: http://www.tthor.com/06/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tuaw.com.txt b/inc/3rdparty/site_config/standard/tuaw.com.txt new file mode 100644 index 00000000..b86f8ccb --- /dev/null +++ b/inc/3rdparty/site_config/standard/tuaw.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='posttitle'] | ||
2 | author: //span[@class='author']/a | ||
3 | date: //span[@class='timestamp'] | ||
4 | body: //div[@class='body'] | ||
5 | |||
6 | test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tuckreview.com.txt b/inc/3rdparty/site_config/standard/tuckreview.com.txt new file mode 100644 index 00000000..a3946cbc --- /dev/null +++ b/inc/3rdparty/site_config/standard/tuckreview.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='post-title'] | ||
2 | author: //div[@class='display-name'] | ||
3 | date: //div[@class='date'] | ||
4 | body: //div[@class='body'] | ||
5 | footnotes: no | ||
6 | test_url: http://tuckreview.com/2012/8/14/migrating-to-v6 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/tvtropes.org.txt b/inc/3rdparty/site_config/standard/tvtropes.org.txt new file mode 100644 index 00000000..08dbba59 --- /dev/null +++ b/inc/3rdparty/site_config/standard/tvtropes.org.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | # Google Custom Search | ||
2 | strip_id_or_class: google_branding_style | ||
3 | |||
4 | # Avoid double title | ||
5 | strip_id_or_class: pagetitle | ||
6 | |||
7 | # external links are labelled | ||
8 | strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif | ||
9 | |||
10 | title: //div[@class="pagetitle"] | ||
11 | body: //div[@id="wikitext"] | ||
12 | |||
13 | # don't get clever. | ||
14 | strip_comments: no | ||
15 | prune: no | ||
16 | |||
17 | # navigation in footer lives inside the wikitext div, annoyingly. | ||
18 | strip_id_or_class: pathholder | ||
19 | |||
20 | test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/twitter.com.txt b/inc/3rdparty/site_config/standard/twitter.com.txt new file mode 100644 index 00000000..12ab1546 --- /dev/null +++ b/inc/3rdparty/site_config/standard/twitter.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //title | ||
2 | body: (//p[contains(@class, 'js-tweet-text')])[1] | ||
3 | author: (//strong[contains(@class, 'fullname')])[1] | ||
4 | date: //span[contains(@class, 'js-short-timestamp')]/@data-time | ||
5 | |||
6 | prune: no | ||
7 | tidy: no | ||
8 | |||
9 | test_url: https://twitter.com/medialens/status/216883678582804480 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/uefa.com.txt b/inc/3rdparty/site_config/standard/uefa.com.txt new file mode 100644 index 00000000..088d6586 --- /dev/null +++ b/inc/3rdparty/site_config/standard/uefa.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText'] | ||
2 | strip: //div[contains(@class, 'mpindex')] | ||
3 | prune: no | ||
4 | tidy: no | ||
5 | |||
6 | test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt b/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt new file mode 100644 index 00000000..29e19565 --- /dev/null +++ b/inc/3rdparty/site_config/standard/uk.xbox360.ign.com.txt | |||
@@ -0,0 +1,23 @@ | |||
1 | # applies to uk.ds.ign.com, uk.wii.ign.com etc. | ||
2 | # possibly to non-UK versions, but I can’t test that | ||
3 | |||
4 | title: //h1[@class="headline"] | ||
5 | author: //div[@class="hdr-sub byline"]/a | ||
6 | date: //h2[@class="publish-date"]/span | ||
7 | body: //div[@id="main-article-content"] | ||
8 | |||
9 | strip: //ul[@class="lnks-readmore"] | ||
10 | |||
11 | strip: //div[@class="inlineImageCaption"] | ||
12 | # can’t make the images appear, so remove the captions | ||
13 | |||
14 | strip: //div[@style="width:468px"] | ||
15 | # video caption links | ||
16 | |||
17 | convert_double_br_tags: yes | ||
18 | |||
19 | strip_comments: no | ||
20 | # otherwise the ‘Closing Comments’ are removed | ||
21 | |||
22 | # Ratings box could do with some rearranging, but it’s tricky | ||
23 | test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/uni-watch.com.txt b/inc/3rdparty/site_config/standard/uni-watch.com.txt new file mode 100644 index 00000000..cbe87d19 --- /dev/null +++ b/inc/3rdparty/site_config/standard/uni-watch.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on') | ||
2 | date: substring-after(//div[@class='post-byline'], ', on') | ||
3 | |||
4 | # for some reason, the following is producing a "no text [48]" error | ||
5 | #title: //div[@class='post-headline'] | ||
6 | |||
7 | # for some reason, the following doesn't appear to isolate just the body copy | ||
8 | body: //div[@class='post-bodycopy'] | ||
9 | |||
10 | # we solve the above issue by stripping out everything else we don't want | ||
11 | # these can probably all be removed if the body: command above worked | ||
12 | strip_id_or_class: reply | ||
13 | strip_id_or_class: left | ||
14 | strip_id_or_class: post-headline | ||
15 | strip_id_or_class: post-byline | ||
16 | strip_id_or_class: footer | ||
17 | test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/urbandictionary.com.txt b/inc/3rdparty/site_config/standard/urbandictionary.com.txt new file mode 100644 index 00000000..86061f77 --- /dev/null +++ b/inc/3rdparty/site_config/standard/urbandictionary.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //title | ||
2 | body: //td[@id='content'] | ||
3 | test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/usccb.org.txt b/inc/3rdparty/site_config/standard/usccb.org.txt new file mode 100644 index 00000000..eb10a48f --- /dev/null +++ b/inc/3rdparty/site_config/standard/usccb.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='CS_Element_maincontent'] | ||
2 | |||
3 | tidy: no | ||
4 | prune: no | ||
5 | |||
6 | test_url: http://www.usccb.org/bible/readings/072412.cfm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/useit.com.txt b/inc/3rdparty/site_config/standard/useit.com.txt new file mode 100644 index 00000000..f6be84c4 --- /dev/null +++ b/inc/3rdparty/site_config/standard/useit.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | |||
3 | date: substring-after(//p[@class='overline']/strong, ',') | ||
4 | body: //div[@class="maintext"] | ||
5 | strip: //p[@class='overline'] | ||
6 | strip: //h1 | ||
7 | tidy: no | ||
8 | test_url: http://www.useit.com/alertbox/mobile-startup-screen.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/ux.artu.tv.txt b/inc/3rdparty/site_config/standard/ux.artu.tv.txt new file mode 100644 index 00000000..a893bda0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/ux.artu.tv.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | author: ("Arturo Toledo") | ||
2 | title: //div[@class="post"]/h2 | ||
3 | body: //div[@class="entry"] | ||
4 | |||
5 | # Remove Twitter button | ||
6 | strip: //div[@class="entry"]/p[2]/a/img | ||
7 | test_url: http://ux.artu.tv/?p=192 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt b/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt new file mode 100644 index 00000000..3661b06a --- /dev/null +++ b/inc/3rdparty/site_config/standard/uzivatelsketestovani.cz.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title:h1 | ||
2 | test_url: http://www.uzivatelsketestovani.cz/wiki/doku.php/skoleni-axure-rp \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vanityfair.com.txt b/inc/3rdparty/site_config/standard/vanityfair.com.txt new file mode 100644 index 00000000..bfc47d1f --- /dev/null +++ b/inc/3rdparty/site_config/standard/vanityfair.com.txt | |||
@@ -0,0 +1,30 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')] | ||
3 | date: //div[contains(@class, 'cn_date_time')] | ||
4 | body: //div[contains(@class, 'pageContainers')] | ||
5 | body: //article[@id='items-container'] | ||
6 | #body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container'] | ||
7 | |||
8 | strip_id_or_class: bc | ||
9 | strip_id_or_class: utilities | ||
10 | strip_id_or_class: list-supporting | ||
11 | strip_id_or_class: yrail | ||
12 | strip_id_or_class: urail | ||
13 | |||
14 | prune: no | ||
15 | #tidy: no | ||
16 | |||
17 | strip_id_or_class: super-rubric-section | ||
18 | strip_id_or_class: cn_date_time | ||
19 | strip_id_or_class: cn_contributors | ||
20 | strip_id_or_class: cn_pagination_controls | ||
21 | strip_id_or_class: cn_features_container | ||
22 | strip_id_or_class: global-footer | ||
23 | strip_id_or_class: cn_ecom_placement | ||
24 | strip: //li[@class='blogNavPrev'] | ||
25 | |||
26 | single_page_link: //a[@title='Print this page'] | ||
27 | |||
28 | test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105 | ||
29 | test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808 | ||
30 | test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/varingen.no.txt b/inc/3rdparty/site_config/standard/varingen.no.txt new file mode 100644 index 00000000..6b5e0ae0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/varingen.no.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class='ArticleHeadlineDetailedView'] | ||
2 | date: //span[@class='ArticlePublicationDateTimeDetailedView'] | ||
3 | author://span[@class='ArticleBylineDetailedView'] | ||
4 | body: //div[@class='ArticleTextDetailedView'] | ||
5 | test_url: http://www.varingen.no/Nyheter/tabid/392/Default.aspx?ModuleId=56651&articleView=true \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/varsity.co.uk.txt b/inc/3rdparty/site_config/standard/varsity.co.uk.txt new file mode 100644 index 00000000..b1db4c35 --- /dev/null +++ b/inc/3rdparty/site_config/standard/varsity.co.uk.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | # FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser | ||
2 | |||
3 | strip: //h2 | ||
4 | test_url: http://www.varsity.co.uk/reviews/2662 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vedomosti.ru.txt b/inc/3rdparty/site_config/standard/vedomosti.ru.txt new file mode 100644 index 00000000..ba999171 --- /dev/null +++ b/inc/3rdparty/site_config/standard/vedomosti.ru.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //td[@class='second_content']/h1 | ||
2 | body: //td[@class='second_content']/div[@class='article_text'] | ||
3 | test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/veggbilder.no.txt b/inc/3rdparty/site_config/standard/veggbilder.no.txt new file mode 100644 index 00000000..14144c0f --- /dev/null +++ b/inc/3rdparty/site_config/standard/veggbilder.no.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | author: //div[@class="blogginnleggForfatter"] | ||
2 | date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd']) | ||
3 | strip: //div[contains(@id,"bloggDelingslenker")] | ||
4 | strip: //div[contains(@id,"bloggDelingslenker")] | ||
5 | test_url: http://veggbilder.no/blogginnlegg/fristelser \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vemedio.com.txt b/inc/3rdparty/site_config/standard/vemedio.com.txt new file mode 100644 index 00000000..294ace9c --- /dev/null +++ b/inc/3rdparty/site_config/standard/vemedio.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2 | ||
2 | date: substring-before(//small," • Permalink") | ||
3 | author:string('Martin Hering') | ||
4 | |||
5 | Strip: //p/small | ||
6 | test_url: http://vemedio.com/blog/posts/state-of-support-and-icloud \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/venturebeat.com.txt b/inc/3rdparty/site_config/standard/venturebeat.com.txt new file mode 100644 index 00000000..41bfa8c5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/venturebeat.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | author: //div[@class="author-name"] | ||
3 | date: //span[@class="the-time"] | ||
4 | body: //div[@class="entry-content"] | ||
5 | strip: //div[@class="vb-gallery"] | ||
6 | test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/version.php b/inc/3rdparty/site_config/standard/version.php index e61807ed..34a87357 100644 --- a/inc/3rdparty/site_config/standard/version.php +++ b/inc/3rdparty/site_config/standard/version.php | |||
@@ -1,2 +1 @@ | |||
1 | <?php | <?php return 4; ?> \ No newline at end of file | |
2 | return 1; \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/version.txt b/inc/3rdparty/site_config/standard/version.txt new file mode 100644 index 00000000..bf0d87ab --- /dev/null +++ b/inc/3rdparty/site_config/standard/version.txt | |||
@@ -0,0 +1 @@ | |||
4 \ No newline at end of file | |||
diff --git a/inc/3rdparty/site_config/standard/version2.dk.txt b/inc/3rdparty/site_config/standard/version2.dk.txt new file mode 100644 index 00000000..74203cad --- /dev/null +++ b/inc/3rdparty/site_config/standard/version2.dk.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //article/header/h1 | ||
2 | |||
3 | author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a | ||
4 | date: //article/header/section[@class='byline']/span[@class='published']/span | ||
5 | |||
6 | body: //article/section[@class='body'] | ||
7 | |||
8 | convert_double_br_tags: yes | ||
9 | |||
10 | # This is required, because Tidy chokes on the HTML5 tags... | ||
11 | tidy: no | ||
12 | test_url: http://www.version2.dk/artikel/17069-amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/verybestbaking.com.txt b/inc/3rdparty/site_config/standard/verybestbaking.com.txt new file mode 100644 index 00000000..4cdd0c0f --- /dev/null +++ b/inc/3rdparty/site_config/standard/verybestbaking.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | title: //title | ||
2 | body: //div[contains(@class, 'printRecipe')] | ||
3 | strip: //div[@class='recipeHeader'] | ||
4 | prune: no | ||
5 | tidy: no | ||
6 | single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')] | ||
7 | test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vg.no.txt b/inc/3rdparty/site_config/standard/vg.no.txt new file mode 100644 index 00000000..fceeea09 --- /dev/null +++ b/inc/3rdparty/site_config/standard/vg.no.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@id='artikkelspalte'] | ||
2 | strip_id_or_class: 'breadcrumb' | ||
3 | test_url: http://www.vg.no/spill/artikkel.php?artid=10003628 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/video.forbes.com.txt b/inc/3rdparty/site_config/standard/video.forbes.com.txt new file mode 100644 index 00000000..1dca55a3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/video.forbes.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: concat("Video: ", //div[@id='currentVideoTitleDivId']) | ||
2 | body: //div[@id='currentVideoDescriptionId'] | ||
3 | author: //meta[@name='author']/@content | ||
4 | |||
5 | replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease | ||
6 | |||
7 | replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease | ||
8 | |||
9 | test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/videogum.com.txt b/inc/3rdparty/site_config/standard/videogum.com.txt new file mode 100644 index 00000000..a1663813 --- /dev/null +++ b/inc/3rdparty/site_config/standard/videogum.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2[@class='posttitle'] | ||
2 | date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by') | ||
3 | date: //span[@class='postdate'] | ||
4 | author: //span[@class='postdate']/a | ||
5 | body: //div[@class='entry line_top'] | ||
6 | test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/villagevoice.com.txt b/inc/3rdparty/site_config/standard/villagevoice.com.txt new file mode 100644 index 00000000..df374602 --- /dev/null +++ b/inc/3rdparty/site_config/standard/villagevoice.com.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h2[@class='headline'] | ||
2 | |||
3 | body: //div[@class='ContentPrint'] | ||
4 | |||
5 | prune: no | ||
6 | |||
7 | single_page_link: //a[contains(@href, '/printVersion/')] | ||
8 | |||
9 | test_url: http://www.villagevoice.com/2010-03-16/news/new-york-s-ten-worst-landlords/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vimeo.com.txt b/inc/3rdparty/site_config/standard/vimeo.com.txt new file mode 100644 index 00000000..d6c6701a --- /dev/null +++ b/inc/3rdparty/site_config/standard/vimeo.com.txt | |||
@@ -0,0 +1,17 @@ | |||
1 | title: //title | ||
2 | body: //iframe | ||
3 | |||
4 | find_string: <html><iframe | ||
5 | replace_string: <iframe id="video" | ||
6 | |||
7 | find_string: ></iframe></html> | ||
8 | replace_string: ></iframe> | ||
9 | |||
10 | replace_string("): " | ||
11 | |||
12 | single_page_link: //link[@type='text/xml+oembed'] | ||
13 | |||
14 | prune: no | ||
15 | tidy: no | ||
16 | |||
17 | test_url: http://vimeo.com/35941909 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/visir.is.txt b/inc/3rdparty/site_config/standard/visir.is.txt new file mode 100644 index 00000000..0f03198e --- /dev/null +++ b/inc/3rdparty/site_config/standard/visir.is.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | # Author's name, when present, has 'skrifar:' ('writes:') appended to it. | ||
2 | # In case of multiple authors, this would be 'skrifa:', hence only 7 characters | ||
3 | # are stripped off. | ||
4 | author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7) | ||
5 | |||
6 | date: //span[@class='date'] | ||
7 | title: //h1 | ||
8 | body: //div[@class='paragraph'] | ||
9 | |||
10 | # Strip out author string when present | ||
11 | strip: //div[@class='paragraph']/div[@class='meta'] | ||
12 | |||
13 | convert_double_br_tags: yes | ||
14 | test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vitispr.com.txt b/inc/3rdparty/site_config/standard/vitispr.com.txt new file mode 100644 index 00000000..8b2a300e --- /dev/null +++ b/inc/3rdparty/site_config/standard/vitispr.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | strip: //*[(@id = "ja-search")] | ||
2 | body: //*[(@id = "ja-mainbody")] | ||
3 | body: //*[(@id = "content-mass-bottom")] | ||
4 | strip://h3[contains(span,'Related Posts')] | ||
5 | strip://img | ||
6 | test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vivirmexico.com.txt b/inc/3rdparty/site_config/standard/vivirmexico.com.txt new file mode 100644 index 00000000..e6a72700 --- /dev/null +++ b/inc/3rdparty/site_config/standard/vivirmexico.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | body: //*[(@class = "historia")] | ||
2 | test_url: http://vivirmexico.com/2011/09/en-veracruz-arrojan-35-cuerpos-a-plena-luz-del-dia-esta-si-es-una-alarma-social \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vnexpress.net.txt b/inc/3rdparty/site_config/standard/vnexpress.net.txt new file mode 100644 index 00000000..23c928bf --- /dev/null +++ b/inc/3rdparty/site_config/standard/vnexpress.net.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table | ||
2 | strip://div[@class="box-item"] | ||
3 | strip://div[@id="ARTICLE_BANNER"] | ||
4 | strip://a | ||
5 | strip://div[@class="tag-parent"] | ||
6 | strip://div[@class="email-print txtr"] | ||
7 | |||
8 | test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt b/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt new file mode 100644 index 00000000..6bd0e855 --- /dev/null +++ b/inc/3rdparty/site_config/standard/voices.washingtonpost.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@class='entrytext'] | ||
3 | test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/vworker.com.txt b/inc/3rdparty/site_config/standard/vworker.com.txt new file mode 100644 index 00000000..a39c9f4e --- /dev/null +++ b/inc/3rdparty/site_config/standard/vworker.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[contains(@class, 'KonaBody')] | ||
2 | |||
3 | test_url: http://www.vworker.com/RentACoder/misc/BidRequests/ShowBidRequest.asp?lngBidRequestId=1634186 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/waffle.wootest.net.txt b/inc/3rdparty/site_config/standard/waffle.wootest.net.txt new file mode 100644 index 00000000..afcba0f3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/waffle.wootest.net.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h2[@class="title"] | ||
2 | body: //div[@class="post"] | ||
3 | |||
4 | test_url: http://waffle.wootest.net/2011/06/22/on-reading-news/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/walrusmagazine.com.txt b/inc/3rdparty/site_config/standard/walrusmagazine.com.txt new file mode 100644 index 00000000..3ab22172 --- /dev/null +++ b/inc/3rdparty/site_config/standard/walrusmagazine.com.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | title: //div[@id='pr']/h3 | ||
2 | author: //div[@class='dateline']//a[contains(@href, '/author/')] | ||
3 | |||
4 | # print page | ||
5 | body: //div[@id='prbody'] | ||
6 | # standard page | ||
7 | body: //div[@id='pgbody'] | ||
8 | |||
9 | # for multi-page articles | ||
10 | single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')] | ||
11 | |||
12 | prune: no | ||
13 | |||
14 | test_url: http://www.walrusmagazine.com/articles/2011.12-memoir-kidnapped \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/warnerbros.fr.txt b/inc/3rdparty/site_config/standard/warnerbros.fr.txt new file mode 100644 index 00000000..a41a3511 --- /dev/null +++ b/inc/3rdparty/site_config/standard/warnerbros.fr.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h3 | ||
2 | body: //div[@class="content_wysiwyg"] | ||
3 | test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt b/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt new file mode 100644 index 00000000..edf16422 --- /dev/null +++ b/inc/3rdparty/site_config/standard/washingtonmonthly.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title://a[@class = 'headline-article'] | ||
2 | |||
3 | author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ') | ||
4 | date://div[@class = 'article']/span[@class = 'date'] | ||
5 | body://div[@class = 'article'] | ||
6 | single_page_link://a[@class = 'print'] | ||
7 | strip://p[@class = 'author'] | ||
8 | strip://a[@class = 'headline-article'] | ||
9 | strip://span[@class = 'date'] | ||
10 | test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/washingtonpost.com.txt b/inc/3rdparty/site_config/standard/washingtonpost.com.txt new file mode 100644 index 00000000..2931ca5f --- /dev/null +++ b/inc/3rdparty/site_config/standard/washingtonpost.com.txt | |||
@@ -0,0 +1,21 @@ | |||
1 | body: //div[@class="article_body"] | ||
2 | author://meta[@name='DC.creator']/@content | ||
3 | title://meta[@name='title']/@content | ||
4 | date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title | ||
5 | date://meta[@name="DC.date.issued"]/@content | ||
6 | strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"] | ||
7 | strip://div[@id="wp-column six end"] | ||
8 | strip://div[contains(@class,'hidden')] | ||
9 | strip://div[@id='article-side-rail'] | ||
10 | strip://div[@class="module component todays-paper-module curved"] | ||
11 | strip://div[@class="module component live-qa curved img-border"] | ||
12 | strip://div[@class="module component newsletter-signup curved"] | ||
13 | strip://div[@class="module featured-stories component curved img-border"] | ||
14 | |||
15 | strip_id_or_class: carousel | ||
16 | strip_id_or_class: toolbar | ||
17 | strip_id_or_class: module | ||
18 | |||
19 | test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1 | ||
20 | test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html | ||
21 | test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/web-libre.org.txt b/inc/3rdparty/site_config/standard/web-libre.org.txt new file mode 100644 index 00000000..dfcd0081 --- /dev/null +++ b/inc/3rdparty/site_config/standard/web-libre.org.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | body: //div[@id='template_article'] | ||
2 | |||
3 | strip_id_or_class: article_more | ||
4 | strip: //hr | ||
5 | |||
6 | test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt b/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt new file mode 100644 index 00000000..9e75a8a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/weblog.bignerdranch.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title://div[@class="post"]/h2 | ||
2 | author://p[@class="postinfo"]/a | ||
3 | date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ') | ||
4 | body://div[@class="contenttext"] | ||
5 | test_url: http://weblog.bignerdranch.com/?p=304 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/weblogs.asp.net.txt b/inc/3rdparty/site_config/standard/weblogs.asp.net.txt new file mode 100644 index 00000000..3fabda0b --- /dev/null +++ b/inc/3rdparty/site_config/standard/weblogs.asp.net.txt | |||
@@ -0,0 +1,9 @@ | |||
1 | title: //h2[@class="pageTitle"] | ||
2 | strip: //div[@class="postfoot"] | ||
3 | strip: //h2[@class="pageTitle"] | ||
4 | strip: //h3[@class="pageTitle"] | ||
5 | body: //div[@class="post"] | ||
6 | author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed') | ||
7 | date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by') | ||
8 | |||
9 | test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt b/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt new file mode 100644 index 00000000..8922b02f --- /dev/null +++ b/inc/3rdparty/site_config/standard/webpaper.nzz.ch.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | tidy: no | ||
2 | dissolve: //div[@id="content"]/div/article/header | ||
3 | body: //div[@id="content"]/div/article | ||
4 | title: //div[@id="content"]/div/article/h1 | ||
5 | date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"] | ||
6 | strip: //div[@id="content"]/div/article/h1 | ||
7 | |||
8 | test_url: http://webpaper.nzz.ch/2012/06/23/front/JJKMS/aphrodite-und-die-kommunisten?guest_pass=24a3ca5b6d%3AJJKMS%3Ad30e1be8628c099669671d4da56cdce4187790ba \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/welt.de.txt b/inc/3rdparty/site_config/standard/welt.de.txt new file mode 100644 index 00000000..6e4f828f --- /dev/null +++ b/inc/3rdparty/site_config/standard/welt.de.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | # set body | ||
2 | tidy: no | ||
3 | body: //div[contains(@class, 'articleContent')] | ||
4 | |||
5 | # remove clutter | ||
6 | strip: //div[@class='advertising'] | ||
7 | strip: //div[@class='themenalarm'] | ||
8 | strip: //div[contains(@class, 'inTextTeaser')] | ||
9 | |||
10 | # remove captions | ||
11 | strip: //span[@class='copyRight'] | ||
12 | |||
13 | # remove photo galleries and extras | ||
14 | strip: //div[contains(@class, 'textGallery')] | ||
15 | strip: //div[contains(@class, 'videoGallery')] | ||
16 | strip: //div[contains(@class, 'imageGallery')] | ||
17 | strip: //div[contains(@class, 'openContent')] | ||
18 | |||
19 | # remove comments | ||
20 | strip: //div[@id = 'writeComment'] | ||
21 | |||
22 | test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/westhamtillidie.com.txt b/inc/3rdparty/site_config/standard/westhamtillidie.com.txt new file mode 100644 index 00000000..b9343029 --- /dev/null +++ b/inc/3rdparty/site_config/standard/westhamtillidie.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: substring-before(//title, '«') | ||
2 | |||
3 | body: //div[@class='entry'] | ||
4 | strip: //div[@class='sharing_label'] | ||
5 | strip: //div[@class='snap_nopreview sharing robots-nocontent'] | ||
6 | test_url: http://www.westhamtillidie.com/2012/03/11/twelve-things-we-learned-from-the-doncaster-game/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt b/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt new file mode 100644 index 00000000..a88a02c9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/what-if.xkcd.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | autodetect_next_page: no | ||
2 | test_url: http://what-if.xkcd.com/1/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt b/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt new file mode 100644 index 00000000..52c5cf1b --- /dev/null +++ b/inc/3rdparty/site_config/standard/whatever.scalzi.com.txt | |||
@@ -0,0 +1,7 @@ | |||
1 | strip: //div[@class="navigation"] | ||
2 | strip: //div[@id="sidebar"] | ||
3 | strip: //div[@id="post-extra-content"] | ||
4 | strip: //div[@id="footer"] | ||
5 | strip: //div[contains(@class, "sharing")] | ||
6 | |||
7 | test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wheelyric.com.txt b/inc/3rdparty/site_config/standard/wheelyric.com.txt new file mode 100644 index 00000000..aa9783cf --- /dev/null +++ b/inc/3rdparty/site_config/standard/wheelyric.com.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | body://div[contains(@class,'oAndtLyrics')] | ||
2 | strip://div[contains(@class,'info')] | ||
3 | strip://div[contains(@id,'romanization')] | ||
4 | strip://div[contains(@id,'youtube')] | ||
5 | strip://div[contains(@id,'romanizationSelector')] | ||
6 | strip://div[contains(@id,'langSelectWrap')] | ||
7 | strip://div[contains(@id,'requestTranslationWrap')] | ||
8 | strip://div[contains(@id,'viewMore')] | ||
9 | strip://div[contains(@class,'lyricsListInMainContent')] | ||
10 | strip://div[contains(@class,'descIpNoti')] | ||
11 | test_url: http://wheelyric.com/lyrics/121#2 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt b/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt new file mode 100644 index 00000000..1f262a0a --- /dev/null +++ b/inc/3rdparty/site_config/standard/wiki.guildwars.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='content'] | ||
3 | strip_id_or_class: editsection | ||
4 | strip_id_or_class: toc | ||
5 | strip: //div[@id='siteNotice'] | ||
6 | strip: //div[@id='content']//table[last()] | ||
7 | prune: no | ||
8 | test_url: http://wiki.guildwars.com/wiki/Monk \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt b/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt new file mode 100644 index 00000000..e176907e --- /dev/null +++ b/inc/3rdparty/site_config/standard/wiki.guildwars2.com.txt | |||
@@ -0,0 +1,8 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id='content'] | ||
3 | strip_id_or_class: editsection | ||
4 | strip_id_or_class: toc | ||
5 | strip: //div[@id='siteNotice'] | ||
6 | strip: //div[@id='content']//table[last()] | ||
7 | prune: no | ||
8 | test_url: http://wiki.guildwars2.com/wiki/Guardian \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wikitravel.org.txt b/inc/3rdparty/site_config/standard/wikitravel.org.txt new file mode 100644 index 00000000..da5bd0b5 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wikitravel.org.txt | |||
@@ -0,0 +1,14 @@ | |||
1 | # copied from .wikipedia.org.txt | ||
2 | title: //h1[@id='firstHeading' or @class='firstHeading'] | ||
3 | body: //div[@id = 'bodyContent'] | ||
4 | strip_id_or_class: editsection | ||
5 | #strip_id_or_class: toc | ||
6 | strip_id_or_class: vertical-navbox | ||
7 | strip: //table[@id='toc'] | //div[@id='p-toc'] | ||
8 | strip: //div[@id='catlinks' or @id='contentSub'] | ||
9 | strip: //div[@id='jump-to-nav'] | ||
10 | strip: //div[@class='thumbcaption']//div[@class='magnify'] | ||
11 | strip: //table[@class='navbox'] | ||
12 | prune: no | ||
13 | tidy: no | ||
14 | test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/will-self.com.txt b/inc/3rdparty/site_config/standard/will-self.com.txt new file mode 100644 index 00000000..24467c22 --- /dev/null +++ b/inc/3rdparty/site_config/standard/will-self.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | strip: //div[@class="widget-area"] | ||
2 | title: //*[@class="entry-title"] | ||
3 | date: //time[@class="entry-date"] | ||
4 | test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/williampfaff.com.txt b/inc/3rdparty/site_config/standard/williampfaff.com.txt new file mode 100644 index 00000000..fb5f92ed --- /dev/null +++ b/inc/3rdparty/site_config/standard/williampfaff.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: substring-after(//span[@class='itemTitle'], ':') | ||
2 | body: //div[@id='content'] | ||
3 | test_url: http://www.williampfaff.com/modules/news/article.php?storyid=491 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/winfuture.de.txt b/inc/3rdparty/site_config/standard/winfuture.de.txt new file mode 100644 index 00000000..bc936370 --- /dev/null +++ b/inc/3rdparty/site_config/standard/winfuture.de.txt | |||
@@ -0,0 +1,12 @@ | |||
1 | title: //h1/span | ||
2 | |||
3 | body: //div[@id="news_content"] | ||
4 | |||
5 | author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text() | ||
6 | |||
7 | date: //span[@class='date'] | ||
8 | |||
9 | # Rubrikenbild entfernen | ||
10 | strip: //div[@id="news_content"]/a[1] | ||
11 | |||
12 | test_url: http://winfuture.de/news,69672.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/winrumors.com.txt b/inc/3rdparty/site_config/standard/winrumors.com.txt new file mode 100644 index 00000000..cedb4390 --- /dev/null +++ b/inc/3rdparty/site_config/standard/winrumors.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h1[@class='page-heading'] | ||
2 | author: //small/strong/a | ||
3 | #their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago' instead of providing a useful date/time | ||
4 | date: substring-before(substring-after(//small,'on'),'with') | ||
5 | body: //div[@class='entry'] | ||
6 | test_url: http://www.winrumors.com/chinese-windows-phone-launch-still-on-track-for-early-2012/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/winsupersite.com.txt b/inc/3rdparty/site_config/standard/winsupersite.com.txt new file mode 100644 index 00000000..db6a6fc9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/winsupersite.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | date: //*[@class='kicker'] | ||
2 | body: //*[@class='KonaBody'] | ||
3 | test_url: http://www.winsupersite.com/article/paul-thurrotts-wininfo/android-malware-surges-separate-studies-141364 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wired.com.txt b/inc/3rdparty/site_config/standard/wired.com.txt new file mode 100644 index 00000000..69bbf5b7 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wired.com.txt | |||
@@ -0,0 +1,22 @@ | |||
1 | title: //meta[@property="og:title"]/@content | ||
2 | title: //h1 | ||
3 | title: //*[@class='posttitle'] | ||
4 | author: //*[@class='entryAuthor']/a[1] | ||
5 | author://*[@class='member-title'] | ||
6 | author://li[@class='author']/a[contains(@href, '/author/')] | ||
7 | date: substring-after(//div[@class='entryAuthor'], '·') | ||
8 | date: substring-before(//*[@class='entryDate'], '|') | ||
9 | body: //div[@class='entry'] | ||
10 | strip: //span[contains(@class, 'nextprev')] | ||
11 | #strip_id_or_class: ngg-galleryoverview | ||
12 | # ngg-galleryoverview is the whole content sometimes, e.g. http://www.wired.com/underwire/2011/12/best-mixtapes-of-2011/?pid=5736&viewall=true | ||
13 | |||
14 | strip: //p[span[contains(@class, 'contentjump')]] | ||
15 | strip: //text()[contains(., 'nextpage')] | ||
16 | |||
17 | prune: no | ||
18 | |||
19 | single_page_link: //a[contains(@href, '/all/1') and contains(@class, 'contentjumpall')] | ||
20 | |||
21 | test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/ | ||
22 | test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/1 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wmnf.org.txt b/inc/3rdparty/site_config/standard/wmnf.org.txt new file mode 100644 index 00000000..ffb6b2d1 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wmnf.org.txt | |||
@@ -0,0 +1,13 @@ | |||
1 | title: //div[@class="bodyText"]/h1/text() | ||
2 | body: //div[@class="bodyText"] | ||
3 | |||
4 | # author and date are separated by only a newline | ||
5 | # can't figure out how to tokenize that yet | ||
6 | author: //div[@class="bodyText"]/span[@class="info"]/text() | ||
7 | date: //div[@class="bodyText"]/span[@class="info"]/text() | ||
8 | |||
9 | # strip metadata from body text | ||
10 | strip: //div[@class="bodyText"]/h1/text() | ||
11 | strip: //div[@class="bodyText"]/span[@class="info"] | ||
12 | strip: //div[@class="bodyText"]/span[@class="info"] | ||
13 | test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wmpoweruser.com.txt b/inc/3rdparty/site_config/standard/wmpoweruser.com.txt new file mode 100644 index 00000000..d9011d24 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wmpoweruser.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | date://*[@class="entry-date"] | ||
2 | author://*[@class="author vcard"] | ||
3 | strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"] | ||
4 | test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/worldpoultry.net.txt b/inc/3rdparty/site_config/standard/worldpoultry.net.txt new file mode 100644 index 00000000..0e42ca5e --- /dev/null +++ b/inc/3rdparty/site_config/standard/worldpoultry.net.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title: //div[@class="content article"]/h1 | ||
2 | date: substring-after(//*[@class='date'], '//') | ||
3 | body: //*[@class='article-content'] | ||
4 | strip: //*[@id='nomodal'] | ||
5 | test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/worldwidewords.org.txt b/inc/3rdparty/site_config/standard/worldwidewords.org.txt new file mode 100644 index 00000000..733d607f --- /dev/null +++ b/inc/3rdparty/site_config/standard/worldwidewords.org.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //p[@id='content'] | ||
2 | |||
3 | body: //div[@class='contentblock'] | ||
4 | test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wow.joystiq.com.txt b/inc/3rdparty/site_config/standard/wow.joystiq.com.txt new file mode 100644 index 00000000..759fb81f --- /dev/null +++ b/inc/3rdparty/site_config/standard/wow.joystiq.com.txt | |||
@@ -0,0 +1,6 @@ | |||
1 | title: //h2[@class="posttitle"] | ||
2 | body: //div[@class="post"] | ||
3 | strip: //h2[@class="posttitle"] | ||
4 | strip: //p[@class="filed-under"] | ||
5 | convert_double_br_tags: yes | ||
6 | test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt b/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt new file mode 100644 index 00000000..0846be2c --- /dev/null +++ b/inc/3rdparty/site_config/standard/www1.folha.uol.com.br.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | body://div[@id='articleNew'] | ||
2 | strip://div[@id='articleBy'] | ||
3 | strip://div[@id='articleDate'] | ||
4 | strip://td[@class='articleGraphicCredit'] | ||
5 | strip://h1 | ||
6 | strip://div[@id='articleEnd'] | ||
7 | strip://p[@class='tagline'] | ||
8 | strip://div[@class='openBox adslibraryArticle'] | ||
9 | strip_id_or_class:ad-180x150-1 | ||
10 | |||
11 | |||
12 | title: //div[@id="articleNew"]/h1 | ||
13 | author: //div[@id="articleBy"]/p/b | ||
14 | date: substring-before(//div[@id="articleDate"], "-") | ||
15 | test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt b/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt new file mode 100644 index 00000000..71306af2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/www3.imperial.ac.uk.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | strip_id_or_class: hidelabel | ||
2 | test_url: http://www3.imperial.ac.uk/newsandeventspggrp/imperialcollege/newssummary/news_14-7-2010-15-53-18 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wyborcza.pl.txt b/inc/3rdparty/site_config/standard/wyborcza.pl.txt new file mode 100644 index 00000000..f99467c2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wyborcza.pl.txt | |||
@@ -0,0 +1,11 @@ | |||
1 | title: //h1 | ||
2 | author: //*[@class = 'author'] | ||
3 | date: //*[@class = 'date'] | ||
4 | body: //*[@id = 'art'] | ||
5 | next_page_link: //*[@id='Str']/a[contains(text(), 'nastepne')] | ||
6 | strip: //*[@class = 'rel_zdjTOP'] | ||
7 | strip: //*[@id = 'rel'] | ||
8 | strip: //*[@class = 'txt_upl'] | ||
9 | strip: //*[@id='Str'] | ||
10 | strip: //*[@id='source'] | ||
11 | test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wyctim.com.txt b/inc/3rdparty/site_config/standard/wyctim.com.txt new file mode 100644 index 00000000..d8c8713b --- /dev/null +++ b/inc/3rdparty/site_config/standard/wyctim.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | body: //div[@class='article-body'] | ||
2 | title: //h1 | ||
3 | test_url: http://wyctim.com/icloud-sync-regebbi-rendszereken/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/wz-newsline.de.txt b/inc/3rdparty/site_config/standard/wz-newsline.de.txt new file mode 100644 index 00000000..fbc1d3d2 --- /dev/null +++ b/inc/3rdparty/site_config/standard/wz-newsline.de.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title://h1 | ||
2 | |||
3 | date://p[@class='articleDate'] | ||
4 | body://div[@class='articleBody wzStandardArticle'] | ||
5 | test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/xoeb.us.txt b/inc/3rdparty/site_config/standard/xoeb.us.txt new file mode 100644 index 00000000..e02960e0 --- /dev/null +++ b/inc/3rdparty/site_config/standard/xoeb.us.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | title: //h1[@class="entry-title"] | ||
2 | author: //span[@class="fn"] | ||
3 | date: //p[@class="meta"] | ||
4 | test_url: http://xoeb.us/blog/2012/03/16/my-mistakes-with-our-first-release/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/yated.com.txt b/inc/3rdparty/site_config/standard/yated.com.txt new file mode 100644 index 00000000..13a3ea64 --- /dev/null +++ b/inc/3rdparty/site_config/standard/yated.com.txt | |||
@@ -0,0 +1,2 @@ | |||
1 | title: //div[@class='pagetitle'] | ||
2 | test_url: http://www.yated.com/content.asp?categoryid=7&contentid=582 \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/yostivanich.com.txt b/inc/3rdparty/site_config/standard/yostivanich.com.txt new file mode 100644 index 00000000..9e24db3c --- /dev/null +++ b/inc/3rdparty/site_config/standard/yostivanich.com.txt | |||
@@ -0,0 +1,5 @@ | |||
1 | title://div[@class='entry-title'] | ||
2 | body://div[@class='entry-content'] | ||
3 | strip_comments:yes | ||
4 | convert_double_br_tags:yes | ||
5 | test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/youtube.com.txt b/inc/3rdparty/site_config/standard/youtube.com.txt new file mode 100644 index 00000000..d52b7356 --- /dev/null +++ b/inc/3rdparty/site_config/standard/youtube.com.txt | |||
@@ -0,0 +1,15 @@ | |||
1 | title: //title | ||
2 | body: //iframe | ||
3 | |||
4 | find_string: <html><iframe | ||
5 | replace_string: <iframe id="video" | ||
6 | |||
7 | find_string: ></iframe></html> | ||
8 | replace_string: ></iframe> | ||
9 | |||
10 | single_page_link: //link[@type='text/xml+oembed'] | ||
11 | |||
12 | prune: no | ||
13 | tidy: no | ||
14 | |||
15 | test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zdnet.com.txt b/inc/3rdparty/site_config/standard/zdnet.com.txt new file mode 100644 index 00000000..b244b229 --- /dev/null +++ b/inc/3rdparty/site_config/standard/zdnet.com.txt | |||
@@ -0,0 +1,10 @@ | |||
1 | title: //h1[@class="h s-1"] | ||
2 | author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|') | ||
3 | author: substring-after(//div[@class="bio"]//h3, 'About ') | ||
4 | date: substring-after(//p[@class="meta s-10"], '|') | ||
5 | date: substring-after(//p[@class="meta"], '|') | ||
6 | body: //div[@class="content-1 entry space-1 clear"] | ||
7 | body: //div[@class="storyBody"] | ||
8 | |||
9 | test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920 | ||
10 | test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zeit.de.txt b/inc/3rdparty/site_config/standard/zeit.de.txt new file mode 100644 index 00000000..66a7f1ac --- /dev/null +++ b/inc/3rdparty/site_config/standard/zeit.de.txt | |||
@@ -0,0 +1,44 @@ | |||
1 | # 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions | ||
2 | # 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section) | ||
3 | # 2011-12-09 [carlo@...] Removed "related articles" block | ||
4 | # 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications. | ||
5 | # 2011-08-20 [carlo@...] added author, fixed date | ||
6 | |||
7 | |||
8 | single_page_link: //a[@title='Druckversion'] | ||
9 | tidy: no | ||
10 | |||
11 | title: //title | ||
12 | date: substring-before( //li[@class="date"], " " ) | ||
13 | author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text() | ||
14 | author: substring-after(//li[@class='source first '], 'Quelle: ') | ||
15 | |||
16 | strip_id_or_class: articleheader | ||
17 | strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"] | ||
18 | |||
19 | #Removes author and date from the start | ||
20 | strip: //ul[@class="tools"] | ||
21 | #Removes copyright statement - often disturb as first line of the news | ||
22 | strip: //p[@class="copyright"] | ||
23 | strip: //div[@class="copyright"] | ||
24 | #Removes pagination links at the end | ||
25 | strip: //div[@class="pagination"] | ||
26 | |||
27 | # Fix picture captions | ||
28 | wrap_in(small): //p[@class="caption"]/text() | ||
29 | |||
30 | # Fix sub-headlines | ||
31 | wrap_in(h2): //p/strong | ||
32 | dissolve: //h2/strong | ||
33 | |||
34 | #Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and can even lead to problems. These sections are removed here. | ||
35 | strip_id_or_class:"informatives" | ||
36 | strip_id_or_class:"bottom" | ||
37 | strip_id_or_class:"teasermosaic" | ||
38 | strip_id_or_class:"comments" | ||
39 | strip_id_or_class:"articlefooter af" | ||
40 | strip_id_or_class:"relateds" | ||
41 | strip_id_or_class:"pagination" | ||
42 | |||
43 | footnotes: no | ||
44 | test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zerodistraction.com.txt b/inc/3rdparty/site_config/standard/zerodistraction.com.txt new file mode 100644 index 00000000..d3b60c7d --- /dev/null +++ b/inc/3rdparty/site_config/standard/zerodistraction.com.txt | |||
@@ -0,0 +1,4 @@ | |||
1 | author: //span[@class='author']//a | ||
2 | date: //span[@class='date'] | ||
3 | test_url: http://zerodistraction.com/blog/2012/3/11/retina-ipad-that-means-i-am-going-digital-only-for-comic-boo.html | ||
4 | test_url: http://zerodistraction.com/notes/unreasonably-grumpy \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zerokspot.com.txt b/inc/3rdparty/site_config/standard/zerokspot.com.txt new file mode 100644 index 00000000..ea9132aa --- /dev/null +++ b/inc/3rdparty/site_config/standard/zerokspot.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: //h1 | ||
2 | body: //div[@id="primarycontent"] | ||
3 | test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/ \ No newline at end of file | ||
diff --git a/inc/3rdparty/site_config/standard/zingtrain.com.txt b/inc/3rdparty/site_config/standard/zingtrain.com.txt new file mode 100644 index 00000000..2a2f58a8 --- /dev/null +++ b/inc/3rdparty/site_config/standard/zingtrain.com.txt | |||
@@ -0,0 +1,3 @@ | |||
1 | title: substring-after(id, 'post')/h2 | ||
2 | body://div[@class = 'entry'] | ||
3 | test_url: http://www.zingtrain.com/category/ontrack/january-2007/ \ No newline at end of file | ||